wip scraping infra

This commit is contained in:
Sriram Hariharan
2023-03-04 11:51:56 -06:00
parent 2d940493a3
commit c9684beb5b
6 changed files with 231 additions and 32 deletions

View File

@@ -1,6 +1,8 @@
import React, { useEffect, useMemo, useState } from 'react';
import ReactDOM from 'react-dom';
import { Course } from 'src/shared/types/Course';
import { Course, CourseRow, CourseScraper } from 'src/shared/types/Course';
import { CourseCatalogDetailsScraper } from 'src/shared/types/CourseCatalogDetailsScraper';
import { CourseCatalogRowScraper } from 'src/shared/types/CourseCatalogRowScraper';
import useInfiniteScroll from '../hooks/useInfiniteScroll';
import { populateSearchInputs } from '../lib/courseCatalog/populateSearchInputs';
import { SiteSupport } from '../lib/getSiteSupport';
@@ -15,7 +17,7 @@ interface Props {
* This is the top level react component orchestrating the course catalog page.
*/
export default function CourseCatalogMain({ support }: Props) {
const [rows, setRows] = React.useState<HTMLTableRowElement[]>([]);
const [rows, setRows] = React.useState<CourseRow[]>([]);
const [selectedCourse, setSelectedCourse] = useState<Course | null>(null);
const isScrolling = useInfiniteScroll(async () => {
@@ -28,7 +30,7 @@ export default function CourseCatalogMain({ support }: Props) {
}, []);
useEffect(() => {
const rows = scrapeRowsFromCourseTable();
const rows = scrapeCourseRows(support);
setRows(rows);
}, []);
@@ -40,23 +42,44 @@ export default function CourseCatalogMain({ support }: Props) {
<div>
<TableHead>Plus</TableHead>
{rows.map(row => (
<TableRow row={row} onClick={handleRowButtonClick} />
<TableRow element={row.rowElement} support={support} onClick={handleRowButtonClick} />
))}
{isScrolling && <div>Scrolling...</div>}
</div>
);
}
// NOTE(review): this span appears to be a rendered diff with its +/- markers stripped, so lines of the
// removed scrapeRowsFromCourseTable and of the added scrapeCourseRows are interleaved below.
function scrapeRowsFromCourseTable(): HTMLTableRowElement[] {
const rows = Array.from(document.querySelectorAll('table tbody tr')) as HTMLTableRowElement[];
/**
 * Walks every `table tbody tr` element in the document while tracking the current section name.
 *
 * The name is seeded from the text of `#details h2` when `support` is
 * `SiteSupport.COURSE_CATALOG_DETAILS`, and is replaced by the trimmed text of each
 * `td.course_header` cell that is encountered. Every other row is handed to `scrapeCourseFromRow`
 * together with the current name.
 *
 * @param support which supported page the scraper is running on
 * @returns the `rows` array declared at the top of the function
 * @throws Error when `#details h2` has no text on the course details page
 * @throws Error when a non-header row is reached before any name is known
 */
function scrapeCourseRows(support: SiteSupport): CourseRow[] {
const rows: CourseRow[] = [];
return Array.from(rows).filter(row => {
if (row.querySelector('th')) {
return false;
let name: string | null = null;
if (support === SiteSupport.COURSE_CATALOG_DETAILS) {
const header = document.querySelector('#details h2');
if (!header?.textContent) {
throw new Error('Could not find course name on course details page.');
}
if (row.querySelector('td.course_header')) {
return false;
name = header.textContent.trim();
}
document.querySelectorAll<HTMLTableRowElement>('table tbody tr').forEach(row => {
// rows that have a course header are the start of a new section, so save the section name and skip
const header = row.querySelector('td.course_header');
if (header?.textContent) {
name = header.textContent.trim();
return;
}
return true;
if (!name) {
throw new Error('Could not find any course sections.');
}
// NOTE(review): the result is stored in `course`, but nothing in the visible lines adds it to `rows`,
// so `rows` is returned exactly as initialised (empty). Presumably unfinished WIP; confirm.
const course = scrapeCourseFromRow(name, support, row);
});
return rows;
}
/**
 * Stub for turning one table row into a `Course`.
 *
 * NOTE(review): the signature promises a `Course`, but the visible body has no `return` statement,
 * and `name` and `row` are not used yet. Presumably work in progress; confirm before relying on it.
 *
 * @param name section name that applies to `row`
 * @param support which supported page the scraper is running on
 * @param row the table row to scrape
 */
function scrapeCourseFromRow(name: string, support: SiteSupport, row: HTMLTableRowElement): Course {
// On the course details page the current page URL is used; on any other page the URL starts as null.
// NOTE(review): `url` is never read or reassigned in the visible lines.
let url = support === SiteSupport.COURSE_CATALOG_DETAILS ? window.location.href : null;
}