wip scraping infra

This commit is contained in:
Sriram Hariharan
2023-03-04 11:51:56 -06:00
parent 2d940493a3
commit c9684beb5b
6 changed files with 231 additions and 32 deletions

View File

@@ -1,29 +1,45 @@
import { Serialized } from 'chrome-extension-toolkit'; import { Serialized } from 'chrome-extension-toolkit';
import { CourseSchedule } from './CourseSchedule';
type CourseSchedule = {}; /**
* A professor's name, first name, and initial (if applicable)
type Professor = { * Also includes a link to their RateMyProfessor page
*/
export type Instructor = {
name: string; name: string;
firstName?: string; firstName?: string;
initial?: string; lastName?: string;
middleInitial?: string;
rateMyProfessorURL?: string;
}; };
type InstructionMode = 'Online' | 'In Person' | 'Hybrid'; /**
* Whether the class is taught online, in person, or a hybrid of the two
*/
export type InstructionMode = 'Online' | 'In Person' | 'Hybrid';
type Links = { export type Links = {
syllabi?: string; syllabi?: string;
textbook?: string; textbook?: string;
rateMyProfessor?: string;
eCIS?: string; eCIS?: string;
}; };
export enum Status {
OPEN = 'OPEN',
CLOSED = 'CLOSED',
WAITLISTED = 'WAITLISTED',
CANCELLED = 'CANCELLED',
}
export class Course { export class Course {
uniqueId: number; uniqueId: number;
number: string; number: string;
name: string; name: string;
department: string; department: string;
professor: Professor; status: Status;
description?: string; instructors: Instructor[];
isReserved: boolean;
description: string[];
schedule: CourseSchedule; schedule: CourseSchedule;
currentStatus: string; currentStatus: string;
url: string; url: string;
@@ -36,3 +52,8 @@ export class Course {
Object.assign(this, course); Object.assign(this, course);
} }
} }
export type CourseRow = {
rowElement: HTMLTableRowElement;
course: Course;
};

View File

@@ -0,0 +1,29 @@
import { Serialized } from 'chrome-extension-toolkit';
/**
 * Code identifying the day on which a section meets.
 * NOTE(review): presumably 'M'..'SU' abbreviate Monday through Sunday — confirm against the scraped page.
 */
type Day = 'M' | 'T' | 'W' | 'TH' | 'F' | 'S' | 'SU';
/**
 * The room in which a section meets, split into its building and room number.
 * Both parts are kept as strings.
 */
type Room = {
    building: string;
    number: string;
};
/**
 * A single meeting of a course: one day, a start/end time, and an optional room.
 */
type CourseSection = {
    day: Day;
    // NOTE(review): the unit of startTime/endTime is not established in this file
    // (e.g. minutes since midnight vs. a timestamp) — TODO confirm and document.
    startTime: number;
    endTime: number;
    // Optional: a section may have no room attached.
    room?: Room;
};
/**
 * The collection of meeting sections that make up a course's schedule.
 *
 * Instances can be rebuilt from another `CourseSchedule` or from its serialized form;
 * every property of the argument is copied onto the new instance.
 */
export class CourseSchedule {
    /**
     * The individual meetings of the course. Defaults to an empty list so the
     * instance is always usable even when the source object carries no `sections`.
     */
    sections: CourseSection[] = [];

    /**
     * @param courseSchedule an existing schedule (or its serialized form) whose
     * properties are copied onto this instance, overriding the defaults.
     */
    constructor(courseSchedule: CourseSchedule | Serialized<CourseSchedule>) {
        Object.assign(this, courseSchedule);
    }

    /**
     * Builds a schedule from scraped day/time/hour data.
     *
     * Not implemented yet: the input format of `times` and `hours` has not been
     * decided, so this fails loudly instead of handing callers an `undefined`
     * value typed as a `CourseSchedule`.
     *
     * @param days the days on which the course meets
     * @param times raw time data (format still to be defined)
     * @param hours raw hour data (format still to be defined)
     * @throws Error always, until parsing is implemented
     */
    static parse(days: Day[], times: unknown, hours: unknown): CourseSchedule {
        throw new Error('CourseSchedule.parse is not implemented yet');
    }

    /**
     * @returns a string representation of the schedule; currently a placeholder
     * that always yields the empty string.
     */
    toString(): string {
        return '';
    }
}

View File

@@ -1,6 +1,8 @@
import React, { useEffect, useMemo, useState } from 'react'; import React, { useEffect, useMemo, useState } from 'react';
import ReactDOM from 'react-dom'; import ReactDOM from 'react-dom';
import { Course } from 'src/shared/types/Course'; import { Course, CourseRow, CourseScraper } from 'src/shared/types/Course';
import { CourseCatalogDetailsScraper } from 'src/shared/types/CourseCatalogDetailsScraper';
import { CourseCatalogRowScraper } from 'src/shared/types/CourseCatalogRowScraper';
import useInfiniteScroll from '../hooks/useInfiniteScroll'; import useInfiniteScroll from '../hooks/useInfiniteScroll';
import { populateSearchInputs } from '../lib/courseCatalog/populateSearchInputs'; import { populateSearchInputs } from '../lib/courseCatalog/populateSearchInputs';
import { SiteSupport } from '../lib/getSiteSupport'; import { SiteSupport } from '../lib/getSiteSupport';
@@ -15,7 +17,7 @@ interface Props {
* This is the top level react component orchestrating the course catalog page. * This is the top level react component orchestrating the course catalog page.
*/ */
export default function CourseCatalogMain({ support }: Props) { export default function CourseCatalogMain({ support }: Props) {
const [rows, setRows] = React.useState<HTMLTableRowElement[]>([]); const [rows, setRows] = React.useState<CourseRow[]>([]);
const [selectedCourse, setSelectedCourse] = useState<Course | null>(null); const [selectedCourse, setSelectedCourse] = useState<Course | null>(null);
const isScrolling = useInfiniteScroll(async () => { const isScrolling = useInfiniteScroll(async () => {
@@ -28,7 +30,7 @@ export default function CourseCatalogMain({ support }: Props) {
}, []); }, []);
useEffect(() => { useEffect(() => {
const rows = scrapeRowsFromCourseTable(); const rows = scrapeCourseRows(support);
setRows(rows); setRows(rows);
}, []); }, []);
@@ -40,23 +42,44 @@ export default function CourseCatalogMain({ support }: Props) {
<div> <div>
<TableHead>Plus</TableHead> <TableHead>Plus</TableHead>
{rows.map(row => ( {rows.map(row => (
<TableRow row={row} onClick={handleRowButtonClick} /> <TableRow element={row.rowElement} support={support} onClick={handleRowButtonClick} />
))} ))}
{isScrolling && <div>Scrolling...</div>} {isScrolling && <div>Scrolling...</div>}
</div> </div>
); );
} }
function scrapeRowsFromCourseTable(): HTMLTableRowElement[] { function scrapeCourseRows(support: SiteSupport): CourseRow[] {
const rows = Array.from(document.querySelectorAll('table tbody tr')) as HTMLTableRowElement[]; const rows: CourseRow[] = [];
return Array.from(rows).filter(row => { let name: string | null = null;
if (row.querySelector('th')) { if (support === SiteSupport.COURSE_CATALOG_DETAILS) {
return false; const header = document.querySelector('#details h2');
if (!header?.textContent) {
throw new Error('Could not find course name on course details page.');
} }
if (row.querySelector('td.course_header')) { name = header.textContent.trim();
return false; }
document.querySelectorAll<HTMLTableRowElement>('table tbody tr').forEach(row => {
// rows that have a course header are the start of a new section, so save the section name and skip
const header = row.querySelector('td.course_header');
if (header?.textContent) {
name = header.textContent.trim();
return;
} }
return true; if (!name) {
throw new Error('Could not find any course sections.');
}
const course = scrapeCourseFromRow(name, support, row);
}); });
return rows;
}
function scrapeCourseFromRow(name: string, support: SiteSupport, row: HTMLTableRowElement): Course {
let url = support === SiteSupport.COURSE_CATALOG_DETAILS ? window.location.href : null;
} }

View File

@@ -1,10 +1,12 @@
import React, { useEffect, useState } from 'react'; import React, { useEffect, useState } from 'react';
import ReactDOM from 'react-dom'; import ReactDOM from 'react-dom';
import { Course } from 'src/shared/types/Course'; import { Course } from 'src/shared/types/Course';
import { SiteSupport } from 'src/views/lib/getSiteSupport';
import { Button } from '../common/Button/Button'; import { Button } from '../common/Button/Button';
interface Props { interface Props {
row: HTMLTableRowElement; support: SiteSupport;
element: HTMLTableRowElement;
onClick: (course: Course) => void; onClick: (course: Course) => void;
} }
@@ -12,21 +14,20 @@ interface Props {
* This component is injected into each row of the course catalog table. * This component is injected into each row of the course catalog table.
* @returns a react portal to the new td in the column or null if the column has not been created yet. * @returns a react portal to the new td in the column or null if the column has not been created yet.
*/ */
export default function TableRow({ row, onClick }: Props): JSX.Element | null { export default function TableRow({ support, element, onClick }: Props): JSX.Element | null {
const [container, setContainer] = useState<HTMLTableCellElement | null>(null); const [container, setContainer] = useState<HTMLTableCellElement | null>(null);
const [course, setCourse] = useState<Course | null>(null); const [course, setCourse] = useState<Course | null>(null);
useEffect(() => { useEffect(() => {
const portalContainer = document.createElement('td'); const portalContainer = document.createElement('td');
const lastTableCell = row.querySelector('td:last-child'); const lastTableCell = element.querySelector('td:last-child');
lastTableCell!.after(portalContainer); lastTableCell!.after(portalContainer);
setContainer(portalContainer); setContainer(portalContainer);
}, []); }, []);
useEffect(() => { useEffect(() => {
const course = scrapeCourseFromRow(row);
setCourse(course); setCourse(course);
}, [row]); }, [element]);
if (!container || !course) { if (!container || !course) {
return null; return null;
@@ -38,7 +39,3 @@ export default function TableRow({ row, onClick }: Props): JSX.Element | null {
return ReactDOM.createPortal(<Button onClick={handleOnClick}>Plus</Button>, container); return ReactDOM.createPortal(<Button onClick={handleOnClick}>Plus</Button>, container);
} }
function scrapeCourseFromRow(row): Course {
return null as any;
}

View File

@@ -0,0 +1,131 @@
import { Instructor, Status } from 'src/shared/types/Course';
import { CourseSchedule } from 'src/shared/types/CourseSchedule';
import { SiteSupport } from 'src/views/lib/getSiteSupport';
/**
 * CSS selectors for the cells of a single course-table row.
 * CourseScraper runs each of these against its `row` element
 * (via `querySelector` / `querySelectorAll`), so they are relative to one `<tr>`.
 * Cells are addressed through their `data-th` attribute.
 */
enum TableDataSelector {
    UNIQUE_ID = 'td[data-th="Unique"]',
    // Anchor inside the "Add" cell; its `href` is read as the register URL.
    REGISTER_URL = 'td[data-th="Add"] a',
    // One span per instructor name.
    INSTRUCTORS = 'td[data-th="Instructor"] span',
    STATUS = 'td[data-th="Status"]',
    // Days / hours / rooms are matched as direct-child spans of their cells.
    SCHEDULE_DAYS = 'td[data-th="Days"]>span',
    SCHEDULE_HOURS = 'td[data-th="Hour"]>span',
    SCHEDULE_ROOM = 'td[data-th="Room"]>span',
    // One list item per flag.
    FLAGS = 'td[data-th="Flags"] ul li',
}
/**
 * CSS selectors for page-level course information.
 * Unlike the row selectors, CourseScraper runs these against the whole `document`.
 */
enum CatalogDetailsSelector {
    // Heading inside the `#details` element; its text is used as the course name.
    COURSE_NAME = '#details h2',
    // Paragraphs inside the `#details` element; each one becomes a description line.
    COURSE_DESCRIPTION = '#details p',
}
/**
 * Scrapes the individual fields of one course out of the page.
 *
 * Row-level fields (unique id, instructors, status, flags, register URL, schedule)
 * are read from the table row passed to the constructor; the course name and
 * description are read from the `#details` section of the whole document.
 */
export class CourseScraper {
    /** The kind of supported page being scraped. Stored for callers; no method here reads it yet. */
    support: SiteSupport;
    /** The course-table row that all row-level selectors are evaluated against. */
    row: HTMLTableRowElement;

    /**
     * @param support the kind of supported page the row was found on
     * @param row the table row describing a single course
     */
    constructor(support: SiteSupport, row: HTMLTableRowElement) {
        this.support = support;
        this.row = row;
    }

    /**
     * Reads the course's unique id from the row.
     *
     * @returns the unique id as a number
     * @throws Error if the cell is missing or empty, or if its text is not a finite number
     */
    scrapeUniqueId(): number {
        const cell = this.row.querySelector(TableDataSelector.UNIQUE_ID);
        const text = (cell?.textContent ?? '').trim();
        // `Number('')` is 0, so an empty cell must be rejected before converting.
        if (!cell || !text) {
            throw new Error('Unique ID not found');
        }
        const uniqueId = Number(text);
        if (!Number.isFinite(uniqueId)) {
            throw new Error(`Unique ID is not a number: ${text}`);
        }
        return uniqueId;
    }

    /**
     * Reads every instructor listed in the row.
     *
     * Names are expected in "Last, First M" form. A name without a comma is kept
     * as a last name only, rather than raising an error.
     *
     * @returns one entry per non-empty instructor span
     */
    scrapeInstructors(): Instructor[] {
        const spans = this.row.querySelectorAll(TableDataSelector.INSTRUCTORS);
        return Array.from(spans)
            .map(span => (span.textContent ?? '').trim())
            .filter(Boolean)
            .map(name => {
                // The part after the comma may be absent ("Staff") and usually starts
                // with a space (", John A"), so default it and split on runs of whitespace.
                const [lastPart = '', rest = ''] = name.split(',');
                const [firstName, middleInitial] = rest.trim().split(/\s+/).filter(Boolean);
                return {
                    name,
                    firstName,
                    lastName: lastPart.trim(),
                    middleInitial,
                };
            });
    }

    /**
     * Reads the course name from the details heading of the document.
     *
     * @returns the trimmed heading text ('' if the heading has no text)
     * @throws Error if the heading element does not exist
     */
    scrapeName(): string {
        const heading = document.querySelector(CatalogDetailsSelector.COURSE_NAME);
        if (!heading) {
            throw new Error('Course name not found');
        }
        return (heading.textContent ?? '').trim();
    }

    /**
     * Reads the registration link from the row.
     *
     * @returns the link's `href`, or `undefined` when the row has no such link
     */
    scrapeRegisterURL(): string | undefined {
        const link = this.row.querySelector<HTMLAnchorElement>(TableDataSelector.REGISTER_URL);
        return link?.href;
    }

    /**
     * Reads the enrollment status from the row.
     *
     * Matching is case-insensitive and substring based; the first keyword found,
     * in the order open, closed, waitlisted, cancelled, decides the status.
     *
     * @returns the status and whether the status text mentions "reserved"
     * @throws Error if the status cell is missing, empty, or contains no known keyword
     */
    scrapeStatus(): [status: Status, isReserved: boolean] {
        const cell = this.row.querySelector(TableDataSelector.STATUS);
        const text = (cell?.textContent ?? '').trim().toLowerCase();
        if (!cell || !text) {
            throw new Error('Status not found');
        }
        const isReserved = text.includes('reserved');
        // Order matters: it is the precedence used when several keywords appear.
        const keywords: [keyword: string, status: Status][] = [
            ['open', Status.OPEN],
            ['closed', Status.CLOSED],
            ['waitlisted', Status.WAITLISTED],
            ['cancelled', Status.CANCELLED],
        ];
        const match = keywords.find(([keyword]) => text.includes(keyword));
        if (!match) {
            throw new Error(`Unknown status: ${text}`);
        }
        return [match[1], isReserved];
    }

    /**
     * Reads the flags listed in the row.
     *
     * @returns the trimmed text of every non-empty flag list item
     */
    scrapeFlags(): string[] {
        const items = this.row.querySelectorAll(TableDataSelector.FLAGS);
        return Array.from(items)
            .map(item => (item.textContent ?? '').trim())
            .filter(Boolean);
    }

    /**
     * Reads the course description from the details section of the document.
     *
     * @returns the trimmed text of every non-empty description paragraph
     */
    scrapeDescription(): string[] {
        const paragraphs = document.querySelectorAll(CatalogDetailsSelector.COURSE_DESCRIPTION);
        return Array.from(paragraphs)
            .map(paragraph => (paragraph.textContent ?? '').trim())
            .filter(Boolean);
    }

    /**
     * Reads the meeting schedule from the row.
     *
     * Only the consistency check is implemented so far: the row must list the same
     * number of day entries as hour entries.
     *
     * TODO: pair each day entry with the hour and room entry at the same index
     * (room entries come from TableDataSelector.SCHEDULE_ROOM) and build the
     * schedule from those triples.
     *
     * @throws Error if the day and hour counts differ, and otherwise a
     * not-implemented error until the schedule is actually built
     */
    scrapeSchedule(): CourseSchedule {
        const days = this.row.querySelectorAll(TableDataSelector.SCHEDULE_DAYS);
        const hours = this.row.querySelectorAll(TableDataSelector.SCHEDULE_HOURS);
        if (days.length !== hours.length) {
            throw new Error('Schedule data is malformed');
        }
        // Fail loudly rather than return `undefined` typed as a CourseSchedule.
        throw new Error('Schedule scraping is not implemented yet');
    }
}

View File

@@ -1,2 +0,0 @@