wip scraping infra
This commit is contained in:
@@ -1,29 +1,45 @@
|
||||
import { Serialized } from 'chrome-extension-toolkit';
|
||||
import { CourseSchedule } from './CourseSchedule';
|
||||
|
||||
type CourseSchedule = {};
|
||||
|
||||
type Professor = {
|
||||
/**
|
||||
* A professor's name, first name, and initial (if applicable)
|
||||
* Also includes a link to their RateMyProfessor page
|
||||
*/
|
||||
export type Instructor = {
|
||||
name: string;
|
||||
firstName?: string;
|
||||
initial?: string;
|
||||
lastName?: string;
|
||||
middleInitial?: string;
|
||||
rateMyProfessorURL?: string;
|
||||
};
|
||||
|
||||
type InstructionMode = 'Online' | 'In Person' | 'Hybrid';
|
||||
/**
|
||||
* Whether the class is taught online, in person, or a hybrid of the two
|
||||
*/
|
||||
export type InstructionMode = 'Online' | 'In Person' | 'Hybrid';
|
||||
|
||||
type Links = {
|
||||
export type Links = {
|
||||
syllabi?: string;
|
||||
textbook?: string;
|
||||
rateMyProfessor?: string;
|
||||
eCIS?: string;
|
||||
};
|
||||
|
||||
/**
 * The registration status of a course.
 *
 * String-valued so that each member serializes as its own readable name.
 */
export enum Status {
  OPEN = 'OPEN',
  CLOSED = 'CLOSED',
  WAITLISTED = 'WAITLISTED',
  CANCELLED = 'CANCELLED',
}
|
||||
|
||||
export class Course {
|
||||
uniqueId: number;
|
||||
number: string;
|
||||
name: string;
|
||||
department: string;
|
||||
professor: Professor;
|
||||
description?: string;
|
||||
status: Status;
|
||||
instructors: Instructor[];
|
||||
isReserved: boolean;
|
||||
description: string[];
|
||||
schedule: CourseSchedule;
|
||||
currentStatus: string;
|
||||
url: string;
|
||||
@@ -36,3 +52,8 @@ export class Course {
|
||||
Object.assign(this, course);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Pairs a row of the course catalog table with the course that belongs to it.
 */
export type CourseRow = {
  /** The `<tr>` element of the catalog table. */
  rowElement: HTMLTableRowElement;
  /** The course associated with `rowElement`. */
  course: Course;
};
|
||||
|
||||
29
src/shared/types/CourseSchedule.ts
Normal file
29
src/shared/types/CourseSchedule.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { Serialized } from 'chrome-extension-toolkit';
|
||||
|
||||
/**
 * Short code for a day on which a section meets.
 * NOTE(review): presumably Monday through Sunday, in that order — confirm against the scraped page.
 */
type Day =
  | 'M'
  | 'T'
  | 'W'
  | 'TH'
  | 'F'
  | 'S'
  | 'SU';
|
||||
|
||||
/**
 * The location where a section meets.
 */
type Room = {
  /** Identifier of the building (format not fixed by this type). */
  building: string;
  /** Room number within the building; a string, so non-numeric room labels fit. */
  number: string;
};
|
||||
|
||||
/**
 * One meeting of a course: a single day, a time range, and optionally a room.
 */
type CourseSection = {
  /** The day this meeting takes place on. */
  day: Day;
  /** Start of the meeting. NOTE(review): numeric unit (e.g. minutes since midnight) is not defined here — confirm. */
  startTime: number;
  /** End of the meeting, in the same unit as `startTime`. */
  endTime: number;
  /** Where the meeting takes place; absent when no room is known. */
  room?: Room;
};
|
||||
|
||||
/**
 * The meeting schedule of a course, expressed as a list of sections.
 */
export class CourseSchedule {
  /**
   * The individual meetings of the course.
   * Defaults to an empty list so that an input without `sections` never leaves
   * the field undefined; any `sections` on the constructor argument overrides it.
   */
  sections: CourseSection[] = [];

  /**
   * Copies every own enumerable property of `courseSchedule` onto the new instance.
   *
   * @param courseSchedule an existing schedule, or its serialized form
   */
  constructor(courseSchedule: CourseSchedule | Serialized<CourseSchedule>) {
    Object.assign(this, courseSchedule);
  }

  /**
   * Builds a schedule from scraped day/time/hour data.
   *
   * TODO: not implemented yet — the body is empty, so callers currently receive
   * `undefined` despite the declared return type.
   */
  static parse(days: Day[], times: unknown, hours: unknown): CourseSchedule {}

  /**
   * TODO: not implemented yet — always returns the empty string.
   */
  toString(): string {
    return '';
  }
}
|
||||
@@ -1,6 +1,8 @@
|
||||
import React, { useEffect, useMemo, useState } from 'react';
|
||||
import ReactDOM from 'react-dom';
|
||||
import { Course } from 'src/shared/types/Course';
|
||||
import { Course, CourseRow, CourseScraper } from 'src/shared/types/Course';
|
||||
import { CourseCatalogDetailsScraper } from 'src/shared/types/CourseCatalogDetailsScraper';
|
||||
import { CourseCatalogRowScraper } from 'src/shared/types/CourseCatalogRowScraper';
|
||||
import useInfiniteScroll from '../hooks/useInfiniteScroll';
|
||||
import { populateSearchInputs } from '../lib/courseCatalog/populateSearchInputs';
|
||||
import { SiteSupport } from '../lib/getSiteSupport';
|
||||
@@ -15,7 +17,7 @@ interface Props {
|
||||
* This is the top level react component orchestrating the course catalog page.
|
||||
*/
|
||||
export default function CourseCatalogMain({ support }: Props) {
|
||||
const [rows, setRows] = React.useState<HTMLTableRowElement[]>([]);
|
||||
const [rows, setRows] = React.useState<CourseRow[]>([]);
|
||||
const [selectedCourse, setSelectedCourse] = useState<Course | null>(null);
|
||||
|
||||
const isScrolling = useInfiniteScroll(async () => {
|
||||
@@ -28,7 +30,7 @@ export default function CourseCatalogMain({ support }: Props) {
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
const rows = scrapeRowsFromCourseTable();
|
||||
const rows = scrapeCourseRows(support);
|
||||
setRows(rows);
|
||||
}, []);
|
||||
|
||||
@@ -40,23 +42,44 @@ export default function CourseCatalogMain({ support }: Props) {
|
||||
<div>
|
||||
<TableHead>Plus</TableHead>
|
||||
{rows.map(row => (
|
||||
<TableRow row={row} onClick={handleRowButtonClick} />
|
||||
<TableRow element={row.rowElement} support={support} onClick={handleRowButtonClick} />
|
||||
))}
|
||||
{isScrolling && <div>Scrolling...</div>}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function scrapeRowsFromCourseTable(): HTMLTableRowElement[] {
|
||||
const rows = Array.from(document.querySelectorAll('table tbody tr')) as HTMLTableRowElement[];
|
||||
function scrapeCourseRows(support: SiteSupport): CourseRow[] {
|
||||
const rows: CourseRow[] = [];
|
||||
|
||||
return Array.from(rows).filter(row => {
|
||||
if (row.querySelector('th')) {
|
||||
return false;
|
||||
let name: string | null = null;
|
||||
if (support === SiteSupport.COURSE_CATALOG_DETAILS) {
|
||||
const header = document.querySelector('#details h2');
|
||||
if (!header?.textContent) {
|
||||
throw new Error('Could not find course name on course details page.');
|
||||
}
|
||||
if (row.querySelector('td.course_header')) {
|
||||
return false;
|
||||
name = header.textContent.trim();
|
||||
}
|
||||
|
||||
document.querySelectorAll<HTMLTableRowElement>('table tbody tr').forEach(row => {
|
||||
// rows that have a course header are the start of a new section, so save the section name and skip
|
||||
const header = row.querySelector('td.course_header');
|
||||
if (header?.textContent) {
|
||||
name = header.textContent.trim();
|
||||
return;
|
||||
}
|
||||
return true;
|
||||
if (!name) {
|
||||
throw new Error('Could not find any course sections.');
|
||||
}
|
||||
|
||||
const course = scrapeCourseFromRow(name, support, row);
|
||||
});
|
||||
return rows;
|
||||
}
|
||||
|
||||
function scrapeCourseFromRow(name: string, support: SiteSupport, row: HTMLTableRowElement): Course {
|
||||
let url = support === SiteSupport.COURSE_CATALOG_DETAILS ? window.location.href : null;
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
import React, { useEffect, useState } from 'react';
|
||||
import ReactDOM from 'react-dom';
|
||||
import { Course } from 'src/shared/types/Course';
|
||||
import { SiteSupport } from 'src/views/lib/getSiteSupport';
|
||||
import { Button } from '../common/Button/Button';
|
||||
|
||||
interface Props {
|
||||
row: HTMLTableRowElement;
|
||||
support: SiteSupport;
|
||||
element: HTMLTableRowElement;
|
||||
onClick: (course: Course) => void;
|
||||
}
|
||||
|
||||
@@ -12,21 +14,20 @@ interface Props {
|
||||
* This component is injected into each row of the course catalog table.
|
||||
* @returns a react portal to the new td in the column or null if the column has not been created yet.
|
||||
*/
|
||||
export default function TableRow({ row, onClick }: Props): JSX.Element | null {
|
||||
export default function TableRow({ support, element, onClick }: Props): JSX.Element | null {
|
||||
const [container, setContainer] = useState<HTMLTableCellElement | null>(null);
|
||||
const [course, setCourse] = useState<Course | null>(null);
|
||||
|
||||
useEffect(() => {
|
||||
const portalContainer = document.createElement('td');
|
||||
const lastTableCell = row.querySelector('td:last-child');
|
||||
const lastTableCell = element.querySelector('td:last-child');
|
||||
lastTableCell!.after(portalContainer);
|
||||
setContainer(portalContainer);
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
const course = scrapeCourseFromRow(row);
|
||||
setCourse(course);
|
||||
}, [row]);
|
||||
}, [element]);
|
||||
|
||||
if (!container || !course) {
|
||||
return null;
|
||||
@@ -38,7 +39,3 @@ export default function TableRow({ row, onClick }: Props): JSX.Element | null {
|
||||
|
||||
return ReactDOM.createPortal(<Button onClick={handleOnClick}>Plus</Button>, container);
|
||||
}
|
||||
|
||||
function scrapeCourseFromRow(row): Course {
|
||||
return null as any;
|
||||
}
|
||||
|
||||
131
src/views/lib/courseCatalog/CourseScraper.ts
Normal file
131
src/views/lib/courseCatalog/CourseScraper.ts
Normal file
@@ -0,0 +1,131 @@
|
||||
import { Instructor, Status } from 'src/shared/types/Course';
|
||||
import { SiteSupport } from 'src/views/lib/getSiteSupport';
|
||||
|
||||
/**
 * CSS selectors for the cells of a single course catalog table row.
 * Each is meant to be evaluated relative to the row element, keyed on the
 * cell's `data-th` attribute.
 */
enum TableDataSelector {
  UNIQUE_ID = 'td[data-th="Unique"]',
  REGISTER_URL = 'td[data-th="Add"] a',
  INSTRUCTORS = 'td[data-th="Instructor"] span',
  STATUS = 'td[data-th="Status"]',
  SCHEDULE_DAYS = 'td[data-th="Days"]>span',
  SCHEDULE_HOURS = 'td[data-th="Hour"]>span',
  SCHEDULE_ROOM = 'td[data-th="Room"]>span',
  FLAGS = 'td[data-th="Flags"] ul li',
}
|
||||
|
||||
/**
 * CSS selectors for page-level course information inside the `#details`
 * element; these are evaluated against the whole document, not a table row.
 */
enum CatalogDetailsSelector {
  COURSE_NAME = '#details h2',
  COURSE_DESCRIPTION = '#details p',
}
|
||||
|
||||
/**
 * Reads individual course fields out of one row of the course catalog table.
 *
 * Row-level fields (unique id, instructors, status, flags, register link,
 * schedule) are looked up inside `row`; the course name and description are
 * looked up in the whole `document`.
 */
export class CourseScraper {
  /** The kind of supported page being scraped. Stored only; no method here reads it yet. */
  support: SiteSupport;
  /** The table row every row-level scrape method queries. */
  row: HTMLTableRowElement;

  /**
   * @param support the kind of supported page the row belongs to
   * @param row the catalog table row to scrape
   */
  constructor(support: SiteSupport, row: HTMLTableRowElement) {
    this.support = support;
    this.row = row;
  }

  /**
   * Scrapes the unique id of the course from the row.
   *
   * @returns the unique id as a number
   * @throws Error if the cell is missing or empty, or if its text is not a finite number
   */
  scrapeUniqueId(): number {
    const cell = this.row.querySelector(TableDataSelector.UNIQUE_ID);
    const text = cell?.textContent?.trim();
    // An empty cell would otherwise convert to 0, which looks like a valid id.
    if (!text) {
      throw new Error('Unique ID not found');
    }
    const uniqueId = Number(text);
    if (!Number.isFinite(uniqueId)) {
      throw new Error(`Unique ID is not a number: ${text}`);
    }
    return uniqueId;
  }

  /**
   * Scrapes the instructors listed in the row.
   *
   * Each non-blank name is split as "LAST, FIRST MIDDLE". A name without a
   * comma is kept whole as `lastName`, with `firstName` and `middleInitial`
   * left undefined.
   *
   * @returns one entry per non-blank instructor name, in document order
   */
  scrapeInstructors(): Instructor[] {
    const spans = this.row.querySelectorAll(TableDataSelector.INSTRUCTORS);
    const names = Array.from(spans)
      .map(span => (span.textContent ?? '').trim())
      .filter(Boolean);

    return names.map(name => {
      // Trim both halves: the text after the comma normally starts with a space,
      // which would otherwise yield an empty first name.
      const [lastName = '', rest = ''] = name.split(',').map(part => part.trim());
      const [firstName, middleInitial] = rest.split(/\s+/).filter(Boolean);

      return {
        name,
        firstName,
        lastName,
        middleInitial,
      };
    });
  }

  /**
   * Scrapes the course name from the page's details heading.
   *
   * @returns the heading text, or '' if the heading has no text
   * @throws Error if the heading element is missing
   */
  scrapeName(): string {
    const heading = document.querySelector(CatalogDetailsSelector.COURSE_NAME);
    if (!heading) {
      throw new Error('Course name not found');
    }
    return heading.textContent ?? '';
  }

  /**
   * Scrapes the registration link from the row.
   *
   * @returns the link's href, or undefined if the row has no such link
   */
  scrapeRegisterURL(): string | undefined {
    const anchor = this.row.querySelector<HTMLAnchorElement>(TableDataSelector.REGISTER_URL);
    return anchor?.href;
  }

  /**
   * Scrapes the registration status from the row.
   *
   * Matching is case-insensitive and by substring, checked in the order
   * open, closed, waitlisted, cancelled.
   *
   * @returns the status, plus whether the text also mentions "reserved"
   * @throws Error if the cell is missing or empty, or if no known status matches
   */
  scrapeStatus(): [status: Status, isReserved: boolean] {
    const cell = this.row.querySelector(TableDataSelector.STATUS);
    if (!cell) {
      throw new Error('Status not found');
    }
    const text = (cell.textContent ?? '').trim().toLowerCase();
    if (!text) {
      throw new Error('Status not found');
    }
    const isReserved = text.includes('reserved');

    // Order matters: the first keyword found in the text wins.
    const keywords: [keyword: string, status: Status][] = [
      ['open', Status.OPEN],
      ['closed', Status.CLOSED],
      ['waitlisted', Status.WAITLISTED],
      ['cancelled', Status.CANCELLED],
    ];
    for (const [keyword, status] of keywords) {
      if (text.includes(keyword)) {
        return [status, isReserved];
      }
    }
    throw new Error(`Unknown status: ${text}`);
  }

  /**
   * Scrapes the flags listed in the row.
   *
   * @returns the trimmed text of every non-blank flag item, in document order
   */
  scrapeFlags(): string[] {
    const items = this.row.querySelectorAll(TableDataSelector.FLAGS);
    return Array.from(items)
      .map(item => (item.textContent ?? '').trim())
      .filter(Boolean);
  }

  /**
   * Scrapes the course description paragraphs from the page's details section.
   *
   * @returns the text of every paragraph that has any, in document order
   */
  scrapeDescription(): string[] {
    const paragraphs = document.querySelectorAll(CatalogDetailsSelector.COURSE_DESCRIPTION);
    return Array.from(paragraphs)
      .map(paragraph => paragraph.textContent ?? '')
      .filter(Boolean);
  }

  /**
   * Scrapes the meeting schedule from the row.
   *
   * TODO: not implemented yet. Only the consistency check below runs; nothing
   * is returned afterwards, so callers currently receive `undefined` despite
   * the declared return type. The intended next step is to combine the i-th
   * day, hour and room (TableDataSelector.SCHEDULE_ROOM) entries into a section.
   *
   * @throws Error if the number of day entries differs from the number of hour entries
   */
  scrapeSchedule(): CourseSchedule {
    const days = this.row.querySelectorAll(TableDataSelector.SCHEDULE_DAYS);
    const hours = this.row.querySelectorAll(TableDataSelector.SCHEDULE_HOURS);

    if (days.length !== hours.length) {
      throw new Error('Schedule data is malformed');
    }
  }
}
|
||||
@@ -1,2 +0,0 @@
|
||||
|
||||
|
||||
Reference in New Issue
Block a user