From c9684beb5bec8c7b4ce4e3d66a9c16912cf6c538 Mon Sep 17 00:00:00 2001 From: Sriram Hariharan Date: Sat, 4 Mar 2023 11:51:56 -0600 Subject: [PATCH] wip scraping infra --- src/shared/types/Course.ts | 39 ++++-- src/shared/types/CourseSchedule.ts | 29 ++++ src/views/components/CourseCatalogMain.tsx | 47 +++++-- src/views/components/injected/TableRow.tsx | 15 +-- src/views/lib/courseCatalog/CourseScraper.ts | 131 +++++++++++++++++++ src/views/lib/courseCatalog/index.ts | 2 - 6 files changed, 231 insertions(+), 32 deletions(-) create mode 100644 src/shared/types/CourseSchedule.ts create mode 100644 src/views/lib/courseCatalog/CourseScraper.ts delete mode 100644 src/views/lib/courseCatalog/index.ts diff --git a/src/shared/types/Course.ts b/src/shared/types/Course.ts index d6b7dbcb..fb8ea091 100644 --- a/src/shared/types/Course.ts +++ b/src/shared/types/Course.ts @@ -1,29 +1,45 @@ import { Serialized } from 'chrome-extension-toolkit'; +import { CourseSchedule } from './CourseSchedule'; -type CourseSchedule = {}; - -type Professor = { +/** + * A professor's name, first name, and initial (if applicable) + * Also includes a link to their RateMyProfessor page + */ +export type Instructor = { name: string; firstName?: string; - initial?: string; + lastName?: string; + middleInitial?: string; + rateMyProfessorURL?: string; }; -type InstructionMode = 'Online' | 'In Person' | 'Hybrid'; +/** + * Whether the class is taught online, in person, or a hybrid of the two + */ +export type InstructionMode = 'Online' | 'In Person' | 'Hybrid'; -type Links = { +export type Links = { syllabi?: string; textbook?: string; - rateMyProfessor?: string; eCIS?: string; }; +export enum Status { + OPEN = 'OPEN', + CLOSED = 'CLOSED', + WAITLISTED = 'WAITLISTED', + CANCELLED = 'CANCELLED', +} + export class Course { uniqueId: number; number: string; name: string; department: string; - professor: Professor; - description?: string; + status: Status; + instructors: Instructor[]; + isReserved: boolean; + description: string[]; schedule: CourseSchedule; currentStatus: string; url: string; @@ -36,3 +52,8 @@ export class Course { Object.assign(this, course); } } + +export type CourseRow = { + rowElement: HTMLTableRowElement; + course: Course; +}; diff --git a/src/shared/types/CourseSchedule.ts b/src/shared/types/CourseSchedule.ts new file mode 100644 index 00000000..8fcb5885 --- /dev/null +++ b/src/shared/types/CourseSchedule.ts @@ -0,0 +1,29 @@ +import { Serialized } from 'chrome-extension-toolkit'; + +type Day = 'M' | 'T' | 'W' | 'TH' | 'F' | 'S' | 'SU'; + +type Room = { + building: string; + number: string; +}; + +type CourseSection = { + day: Day; + startTime: number; + endTime: number; + room?: Room; +}; + +export class CourseSchedule { + sections: CourseSection[]; + + constructor(courseSchedule: CourseSchedule | Serialized) { + Object.assign(this, courseSchedule); + } + + static parse(days: Day[] , times, hours): CourseSchedule {} + + toString(): string { + return ''; + } +} diff --git a/src/views/components/CourseCatalogMain.tsx b/src/views/components/CourseCatalogMain.tsx index b0b740f0..338d16be 100644 --- a/src/views/components/CourseCatalogMain.tsx +++ b/src/views/components/CourseCatalogMain.tsx @@ -1,6 +1,8 @@ import React, { useEffect, useMemo, useState } from 'react'; import ReactDOM from 'react-dom'; -import { Course } from 'src/shared/types/Course'; +import { Course, CourseRow, CourseScraper } from 'src/shared/types/Course'; +import { CourseCatalogDetailsScraper } from 'src/shared/types/CourseCatalogDetailsScraper'; +import { CourseCatalogRowScraper } from 'src/shared/types/CourseCatalogRowScraper'; import useInfiniteScroll from '../hooks/useInfiniteScroll'; import { populateSearchInputs } from '../lib/courseCatalog/populateSearchInputs'; import { SiteSupport } from '../lib/getSiteSupport'; @@ -15,7 +17,7 @@ interface Props { * This is the top level react component orchestrating the course catalog page. */ export default function CourseCatalogMain({ support }: Props) { - const [rows, setRows] = React.useState([]); + const [rows, setRows] = React.useState([]); const [selectedCourse, setSelectedCourse] = useState(null); const isScrolling = useInfiniteScroll(async () => { @@ -28,7 +30,7 @@ export default function CourseCatalogMain({ support }: Props) { }, []); useEffect(() => { - const rows = scrapeRowsFromCourseTable(); + const rows = scrapeCourseRows(support); setRows(rows); }, []); @@ -40,23 +42,44 @@ export default function CourseCatalogMain({ support }: Props) {
Plus {rows.map(row => ( - + ))} {isScrolling &&
Scrolling...
}
); } -function scrapeRowsFromCourseTable(): HTMLTableRowElement[] { - const rows = Array.from(document.querySelectorAll('table tbody tr')) as HTMLTableRowElement[]; +function scrapeCourseRows(support: SiteSupport): CourseRow[] { + const rows: CourseRow[] = []; - return Array.from(rows).filter(row => { - if (row.querySelector('th')) { - return false; + let name: string | null = null; + if (support === SiteSupport.COURSE_CATALOG_DETAILS) { + const header = document.querySelector('#details h2'); + if (!header?.textContent) { + throw new Error('Could not find course name on course details page.'); } - if (row.querySelector('td.course_header')) { - return false; + name = header.textContent.trim(); + } + + document.querySelectorAll('table tbody tr').forEach(row => { + // rows that have a course header are the start of a new section, so save the section name and skip + const header = row.querySelector('td.course_header'); + if (header?.textContent) { + name = header.textContent.trim(); + return; } - return true; + if (!name) { + throw new Error('Could not find any course sections.'); + } + + const course = scrapeCourseFromRow(name, support, row); }); + return rows; +} + +function scrapeCourseFromRow(name: string, support: SiteSupport, row: HTMLTableRowElement): Course { + let url = support === SiteSupport.COURSE_CATALOG_DETAILS ? window.location.href : null; + + + } diff --git a/src/views/components/injected/TableRow.tsx b/src/views/components/injected/TableRow.tsx index 73478a27..3fc3c205 100644 --- a/src/views/components/injected/TableRow.tsx +++ b/src/views/components/injected/TableRow.tsx @@ -1,10 +1,12 @@ import React, { useEffect, useState } from 'react'; import ReactDOM from 'react-dom'; import { Course } from 'src/shared/types/Course'; +import { SiteSupport } from 'src/views/lib/getSiteSupport'; import { Button } from '../common/Button/Button'; interface Props { - row: HTMLTableRowElement; + support: SiteSupport; + element: HTMLTableRowElement; onClick: (course: Course) => void; } @@ -12,21 +14,20 @@ interface Props { * This component is injected into each row of the course catalog table. * @returns a react portal to the new td in the column or null if the column has not been created yet. */ -export default function TableRow({ row, onClick }: Props): JSX.Element | null { +export default function TableRow({ support, element, onClick }: Props): JSX.Element | null { const [container, setContainer] = useState(null); const [course, setCourse] = useState(null); useEffect(() => { const portalContainer = document.createElement('td'); - const lastTableCell = row.querySelector('td:last-child'); + const lastTableCell = element.querySelector('td:last-child'); lastTableCell!.after(portalContainer); setContainer(portalContainer); }, []); useEffect(() => { - const course = scrapeCourseFromRow(row); setCourse(course); - }, [row]); + }, [element]); if (!container || !course) { return null; @@ -38,7 +39,3 @@ export default function TableRow({ row, onClick }: Props): JSX.Element | null { return ReactDOM.createPortal(, container); } - -function scrapeCourseFromRow(row): Course { - return null as any; -} diff --git a/src/views/lib/courseCatalog/CourseScraper.ts b/src/views/lib/courseCatalog/CourseScraper.ts new file mode 100644 index 00000000..bbc83c9f --- /dev/null +++ b/src/views/lib/courseCatalog/CourseScraper.ts @@ -0,0 +1,131 @@ +import { Instructor, Status } from 'src/shared/types/Course'; +import { SiteSupport } from 'src/views/lib/getSiteSupport'; + +enum TableDataSelector { + UNIQUE_ID = 'td[data-th="Unique"]', + REGISTER_URL = 'td[data-th="Add"] a', + INSTRUCTORS = 'td[data-th="Instructor"] span', + STATUS = 'td[data-th="Status"]', + SCHEDULE_DAYS = 'td[data-th="Days"]>span', + SCHEDULE_HOURS = 'td[data-th="Hour"]>span', + SCHEDULE_ROOM = 'td[data-th="Room"]>span', + FLAGS = 'td[data-th="Flags"] ul li', +} + +enum CatalogDetailsSelector { + COURSE_NAME = '#details h2', + COURSE_DESCRIPTION = '#details p', +} + +export class CourseScraper { + support: SiteSupport; + row: HTMLTableRowElement; + + constructor(support: SiteSupport, row: HTMLTableRowElement) { + this.support = support; + this.row = row; + } + + scrapeUniqueId(): number { + const div = this.row.querySelector(TableDataSelector.UNIQUE_ID); + if (!div) { + throw new Error('Unique ID not found'); + } + return Number(div.textContent); + } + + scrapeInstructors(): Instructor[] { + const spans = this.row.querySelectorAll(TableDataSelector.INSTRUCTORS); + const names = Array.from(spans) + .map(span => span.textContent || '') + .map(name => name.trim()) + .filter(Boolean); + + return names.map(name => { + const [lastName, rest] = name.split(','); + const [firstName, middleInitial] = rest.split(' '); + + return { + name, + firstName, + lastName, + middleInitial, + }; + }); + } + + scrapeName(): string { + const div = document.querySelector(CatalogDetailsSelector.COURSE_NAME); + if (!div) { + throw new Error('Course name not found'); + } + return div.textContent || ''; + } + + scrapeRegisterURL(): string | undefined { + const a = this.row.querySelector(TableDataSelector.REGISTER_URL); + return a?.href; + } + + scrapeStatus(): [status: Status, isReserved: boolean] { + const div = this.row.querySelector(TableDataSelector.STATUS); + if (!div) { + throw new Error('Status not found'); + } + const text = (div.textContent || '').trim().toLowerCase(); + if (!text) { + throw new Error('Status not found'); + } + const isReserved = text.includes('reserved'); + + if (text.includes('open')) { + return [Status.OPEN, isReserved]; + } + if (text.includes('closed')) { + return [Status.CLOSED, isReserved]; + } + if (text.includes('waitlisted')) { + return [Status.WAITLISTED, isReserved]; + } + if (text.includes('cancelled')) { + return [Status.CANCELLED, isReserved]; + } + throw new Error(`Unknown status: ${text}`); + } + + scrapeFlags(): string[] { + const lis = this.row.querySelectorAll(TableDataSelector.FLAGS); + return Array.from(lis).map(li => li.textContent || ''); + } + + scrapeDescription(): string[] { + const lines = document.querySelectorAll(CatalogDetailsSelector.COURSE_DESCRIPTION); + return Array.from(lines) + .map(line => line.textContent || '') + .filter(Boolean); + } + + scrapeSchedule(): CourseSchedule { + const days = this.row.querySelectorAll(TableDataSelector.SCHEDULE_DAYS); + const hours = this.row.querySelectorAll(TableDataSelector.SCHEDULE_HOURS); + const rooms = this.row.querySelectorAll(TableDataSelector.SCHEDULE_ROOM); + + if (days.length !== hours.length) { + throw new Error('Schedule data is malformed'); + } + + // const schedule: = []; + // for (let i = 0; i < days.length; i++) { + // const day = days[i].textContent || ''; + // const hour = hours[i].textContent || ''; + // const room = rooms[i].textContent || ''; + + // schedule.push({ + // day, + // hour, + // room, + // }); + // } + // return schedule; + } +} diff --git a/src/views/lib/courseCatalog/index.ts b/src/views/lib/courseCatalog/index.ts deleted file mode 100644 index 139597f9..00000000 --- a/src/views/lib/courseCatalog/index.ts +++ /dev/null @@ -1,2 +0,0 @@ - -