CourseScraper completely done

'
This commit is contained in:
Sriram Hariharan
2023-03-04 20:14:26 -06:00
parent c9684beb5b
commit e99ba5864a
5 changed files with 202 additions and 117 deletions

View File

@@ -18,12 +18,6 @@ export type Instructor = {
*/ */
export type InstructionMode = 'Online' | 'In Person' | 'Hybrid'; export type InstructionMode = 'Online' | 'In Person' | 'Hybrid';
export type Links = {
syllabi?: string;
textbook?: string;
eCIS?: string;
};
export enum Status { export enum Status {
OPEN = 'OPEN', OPEN = 'OPEN',
CLOSED = 'CLOSED', CLOSED = 'CLOSED',
@@ -34,16 +28,15 @@ export enum Status {
export class Course { export class Course {
uniqueId: number; uniqueId: number;
number: string; number: string;
name: string; fullName: string;
courseName: string;
department: string; department: string;
status: Status; status: Status;
instructors: Instructor[]; instructors: Instructor[];
isReserved: boolean; isReserved: boolean;
description: string[]; description?: string[];
schedule: CourseSchedule; schedule: CourseSchedule;
currentStatus: string;
url: string; url: string;
links: Links;
registerURL?: string; registerURL?: string;
flags: string[]; flags: string[];
instructionMode: InstructionMode; instructionMode: InstructionMode;

View File

@@ -1,13 +1,23 @@
import { Serialized } from 'chrome-extension-toolkit'; import { Serialized } from 'chrome-extension-toolkit';
type Day = 'M' | 'T' | 'W' | 'TH' | 'F' | 'S' | 'SU'; const dayMap = {
M: 'Monday',
T: 'Tuesday',
W: 'Wednesday',
TH: 'Thursday',
F: 'Friday',
S: 'Saturday',
SU: 'Sunday',
} as const;
type Day = typeof dayMap[keyof typeof dayMap];
type Room = { type Room = {
building: string; building: string;
number: string; number: string;
}; };
type CourseSection = { export type CourseSection = {
day: Day; day: Day;
startTime: number; startTime: number;
endTime: number; endTime: number;
@@ -21,9 +31,49 @@ export class CourseSchedule {
Object.assign(this, courseSchedule); Object.assign(this, courseSchedule);
} }
static parse(days: Day[] , times, hours): CourseSchedule {} static parse(dayLine: string, timeLine: string, roomLine: string): CourseSection[] {
try {
let days: Day[] = dayLine
.split('')
.map((char, i) => {
const nextChar = dayLine.charAt(i + 1);
let day = char;
if (char === 'T' && nextChar === 'H') {
day += nextChar;
}
if (char === 'S' && nextChar === 'U') {
day += nextChar;
}
return dayMap[day];
})
.filter(Boolean) as Day[];
toString(): string { const [startTime, endTime] = timeLine
return ''; .replaceAll('.', '')
.split('-')
.map(time => {
const [hour, rest] = time.split(':');
const [minute, ampm] = rest.split(' ');
if (ampm === 'pm') {
return Number(hour) * 60 + Number(minute) + 12 * 60;
}
return Number(hour) * 60 + Number(minute);
});
const [building, number] = roomLine.split(' ');
return days.map(day => ({
day,
startTime,
endTime,
room: {
building,
number,
},
}));
} catch (e) {
throw new Error(`Failed to parse schedule: ${dayLine} ${timeLine} ${roomLine}`);
}
} }
} }

View File

@@ -1,9 +1,8 @@
import React, { useEffect, useMemo, useState } from 'react'; import React, { useEffect, useState } from 'react';
import ReactDOM from 'react-dom'; import ReactDOM from 'react-dom';
import { Course, CourseRow, CourseScraper } from 'src/shared/types/Course'; import { Course, CourseRow } from 'src/shared/types/Course';
import { CourseCatalogDetailsScraper } from 'src/shared/types/CourseCatalogDetailsScraper';
import { CourseCatalogRowScraper } from 'src/shared/types/CourseCatalogRowScraper';
import useInfiniteScroll from '../hooks/useInfiniteScroll'; import useInfiniteScroll from '../hooks/useInfiniteScroll';
import { CourseScraper } from '../lib/courseCatalog/CourseScraper';
import { populateSearchInputs } from '../lib/courseCatalog/populateSearchInputs'; import { populateSearchInputs } from '../lib/courseCatalog/populateSearchInputs';
import { SiteSupport } from '../lib/getSiteSupport'; import { SiteSupport } from '../lib/getSiteSupport';
import TableHead from './injected/TableHead'; import TableHead from './injected/TableHead';
@@ -22,7 +21,7 @@ export default function CourseCatalogMain({ support }: Props) {
const isScrolling = useInfiniteScroll(async () => { const isScrolling = useInfiniteScroll(async () => {
console.log('infinite scroll'); console.log('infinite scroll');
return false; return true;
}); });
useEffect(() => { useEffect(() => {
@@ -30,7 +29,9 @@ export default function CourseCatalogMain({ support }: Props) {
}, []); }, []);
useEffect(() => { useEffect(() => {
const rows = scrapeCourseRows(support); const scraper = new CourseScraper(support);
const rows = scraper.scrape(document.querySelectorAll<HTMLTableRowElement>('table tbody tr'));
console.log('useEffect -> rows:', rows);
setRows(rows); setRows(rows);
}, []); }, []);
@@ -42,44 +43,14 @@ export default function CourseCatalogMain({ support }: Props) {
<div> <div>
<TableHead>Plus</TableHead> <TableHead>Plus</TableHead>
{rows.map(row => ( {rows.map(row => (
<TableRow element={row.rowElement} support={support} onClick={handleRowButtonClick} /> <TableRow
element={row.rowElement}
course={row.course}
support={support}
onClick={handleRowButtonClick}
/>
))} ))}
{isScrolling && <div>Scrolling...</div>} {isScrolling && <div>Scrolling...</div>}
</div> </div>
); );
} }
function scrapeCourseRows(support: SiteSupport): CourseRow[] {
const rows: CourseRow[] = [];
let name: string | null = null;
if (support === SiteSupport.COURSE_CATALOG_DETAILS) {
const header = document.querySelector('#details h2');
if (!header?.textContent) {
throw new Error('Could not find course name on course details page.');
}
name = header.textContent.trim();
}
document.querySelectorAll<HTMLTableRowElement>('table tbody tr').forEach(row => {
// rows that have a course header are the start of a new section, so save the section name and skip
const header = row.querySelector('td.course_header');
if (header?.textContent) {
name = header.textContent.trim();
return;
}
if (!name) {
throw new Error('Could not find any course sections.');
}
const course = scrapeCourseFromRow(name, support, row);
});
return rows;
}
function scrapeCourseFromRow(name: string, support: SiteSupport, row: HTMLTableRowElement): Course {
let url = support === SiteSupport.COURSE_CATALOG_DETAILS ? window.location.href : null;
}

View File

@@ -1,22 +1,22 @@
import React, { useEffect, useState } from 'react'; import React, { useEffect, useState } from 'react';
import ReactDOM from 'react-dom'; import ReactDOM from 'react-dom';
import { Course } from 'src/shared/types/Course'; import { Course, CourseRow } from 'src/shared/types/Course';
import { SiteSupport } from 'src/views/lib/getSiteSupport'; import { SiteSupport } from 'src/views/lib/getSiteSupport';
import { Button } from '../common/Button/Button'; import { Button } from '../common/Button/Button';
interface Props { interface Props {
support: SiteSupport; support: SiteSupport;
course: Course;
element: HTMLTableRowElement; element: HTMLTableRowElement;
onClick: (course: Course) => void; onClick: (...args: any[]) => any;
} }
/** /**
* This component is injected into each row of the course catalog table. * This component is injected into each row of the course catalog table.
* @returns a react portal to the new td in the column or null if the column has not been created yet. * @returns a react portal to the new td in the column or null if the column has not been created yet.
*/ */
export default function TableRow({ support, element, onClick }: Props): JSX.Element | null { export default function TableRow({ support, course, element, onClick }: Props): JSX.Element | null {
const [container, setContainer] = useState<HTMLTableCellElement | null>(null); const [container, setContainer] = useState<HTMLTableCellElement | null>(null);
const [course, setCourse] = useState<Course | null>(null);
useEffect(() => { useEffect(() => {
const portalContainer = document.createElement('td'); const portalContainer = document.createElement('td');
@@ -25,17 +25,9 @@ export default function TableRow({ support, element, onClick }: Props): JSX.Elem
setContainer(portalContainer); setContainer(portalContainer);
}, []); }, []);
useEffect(() => { if (!container) {
setCourse(course);
}, [element]);
if (!container || !course) {
return null; return null;
} }
const handleOnClick = () => { return ReactDOM.createPortal(<Button onClick={onClick}>Plus</Button>, container);
onClick(course);
};
return ReactDOM.createPortal(<Button onClick={handleOnClick}>Plus</Button>, container);
} }

View File

@@ -1,10 +1,13 @@
import { Instructor, Status } from 'src/shared/types/Course'; import { Course, Instructor, Status, InstructionMode, CourseRow } from 'src/shared/types/Course';
import { CourseSchedule, CourseSection } from 'src/shared/types/CourseSchedule';
import { SiteSupport } from 'src/views/lib/getSiteSupport'; import { SiteSupport } from 'src/views/lib/getSiteSupport';
enum TableDataSelector { enum TableDataSelector {
COURSE_HEADER = 'td.course_header',
UNIQUE_ID = 'td[data-th="Unique"]', UNIQUE_ID = 'td[data-th="Unique"]',
REGISTER_URL = 'td[data-th="Add"] a', REGISTER_URL = 'td[data-th="Add"] a',
INSTRUCTORS = 'td[data-th="Instructor"] span', INSTRUCTORS = 'td[data-th="Instructor"] span',
INSTRUCTION_MODE = 'td[data-th="Instruction Mode"]',
STATUS = 'td[data-th="Status"]', STATUS = 'td[data-th="Status"]',
SCHEDULE_DAYS = 'td[data-th="Days"]>span', SCHEDULE_DAYS = 'td[data-th="Days"]>span',
SCHEDULE_HOURS = 'td[data-th="Hour"]>span', SCHEDULE_HOURS = 'td[data-th="Hour"]>span',
@@ -12,37 +15,95 @@ enum TableDataSelector {
FLAGS = 'td[data-th="Flags"] ul li', FLAGS = 'td[data-th="Flags"] ul li',
} }
enum CatalogDetailsSelector { enum DetailsSelector {
COURSE_NAME = '#details h2', COURSE_NAME = '#details h2',
COURSE_DESCRIPTION = '#details p', COURSE_DESCRIPTION = '#details p',
} }
export class CourseScraper { export class CourseScraper {
support: SiteSupport; support: SiteSupport;
row: HTMLTableRowElement;
constructor(support: SiteSupport, row: HTMLTableRowElement) { constructor(support: SiteSupport) {
this.support = support; this.support = support;
this.row = row;
} }
scrapeUniqueId(): number { public scrape(rows: NodeListOf<HTMLTableRowElement>): CourseRow[] {
const div = this.row.querySelector(TableDataSelector.UNIQUE_ID); const courses: CourseRow[] = [];
let fullName = this.getFullName();
rows.forEach(row => {
if (this.isHeaderRow(row)) {
fullName = this.getFullName(row);
return;
}
// we are now ready to build the course object
if (!fullName) {
throw new Error('Course name not found');
}
fullName = fullName.replace(/\s\s+/g, ' ').trim();
const [courseName, department, number] = this.separateCourseName(fullName);
const [status, isReserved] = this.getStatus(row);
const newCourse = new Course({
fullName,
courseName,
department,
number,
status,
isReserved,
schedule: this.getSchedule(row),
registerURL: this.getRegisterURL(row),
url: this.getURL(row),
flags: this.getFlags(row),
uniqueId: this.getUniqueId(row),
instructionMode: this.getInstructionMode(row),
instructors: this.getInstructors(row),
description: this.getDescription(document),
});
courses.push({
rowElement: row,
course: newCourse,
});
});
return courses;
}
separateCourseName(name: string): [courseName: string, department: string, number: string] {
let courseNumberIndex = name.search(/\d/);
let department = name.substring(0, courseNumberIndex).trim();
let number = name.substring(courseNumberIndex, name.indexOf(' ', courseNumberIndex)).trim();
let courseName = name.substring(name.indexOf(' ', courseNumberIndex)).trim();
return [courseName, department, number];
}
getUniqueId(row: HTMLTableRowElement): number {
const div = row.querySelector(TableDataSelector.UNIQUE_ID);
if (!div) { if (!div) {
throw new Error('Unique ID not found'); throw new Error('Unique ID not found');
} }
return Number(div.textContent); return Number(div.textContent);
} }
scrapeInstructors(): Instructor[] { getURL(row: HTMLTableRowElement): string {
const spans = this.row.querySelectorAll(TableDataSelector.INSTRUCTORS); const div = row.querySelector<HTMLAnchorElement>(`${TableDataSelector.UNIQUE_ID} a`);
return div?.href || window.location.href;
}
getInstructors(row: HTMLTableRowElement): Instructor[] {
const spans = row.querySelectorAll(TableDataSelector.INSTRUCTORS);
const names = Array.from(spans) const names = Array.from(spans)
.map(span => span.textContent || '') .map(span => span.textContent || '')
.map(name => name.trim()) .map(name => name.trim())
.filter(Boolean); .filter(Boolean);
return names.map(name => { return names.map(name => {
const [lastName, rest] = name.split(','); const [lastName, rest] = name.split(',').map(s => s.trim());
const [firstName, middleInitial] = rest.split(' '); const [firstName, middleInitial] = rest.split(' ');
return { return {
@@ -54,21 +115,45 @@ export class CourseScraper {
}); });
} }
scrapeName(): string { isHeaderRow(row: HTMLTableRowElement): boolean {
const div = document.querySelector(CatalogDetailsSelector.COURSE_NAME); return row.querySelector(TableDataSelector.COURSE_HEADER) !== null;
if (!div) {
throw new Error('Course name not found');
}
return div.textContent || '';
} }
scrapeRegisterURL(): string | undefined { getInstructionMode(row: HTMLTableRowElement): InstructionMode {
const a = this.row.querySelector<HTMLAnchorElement>(TableDataSelector.REGISTER_URL); const text = (row.querySelector(TableDataSelector.INSTRUCTION_MODE)?.textContent || '').toLowerCase();
if (text.includes('internet')) {
return 'Online';
}
if (text.includes('hybrid')) {
return 'Hybrid';
}
return 'In Person';
}
getDescription(document: Document): string[] {
const lines = document.querySelectorAll(DetailsSelector.COURSE_DESCRIPTION);
return Array.from(lines)
.map(line => line.textContent || '')
.map(line => line.replace(/\s\s+/g, ' ').trim())
.filter(Boolean);
}
getFullName(row?: HTMLTableRowElement): string {
if (!row) {
return document.querySelector(DetailsSelector.COURSE_NAME)?.textContent || '';
}
const div = row.querySelector(TableDataSelector.COURSE_HEADER);
return div?.textContent || '';
}
getRegisterURL(row: HTMLTableRowElement): string | undefined {
const a = row.querySelector<HTMLAnchorElement>(TableDataSelector.REGISTER_URL);
return a?.href; return a?.href;
} }
scrapeStatus(): [status: Status, isReserved: boolean] { getStatus(row: HTMLTableRowElement): [status: Status, isReserved: boolean] {
const div = this.row.querySelector(TableDataSelector.STATUS); const div = row.querySelector(TableDataSelector.STATUS);
if (!div) { if (!div) {
throw new Error('Status not found'); throw new Error('Status not found');
} }
@@ -93,39 +178,33 @@ export class CourseScraper {
throw new Error(`Unknown status: ${text}`); throw new Error(`Unknown status: ${text}`);
} }
scrapeFlags(): string[] { getFlags(row: HTMLTableRowElement): string[] {
const lis = this.row.querySelectorAll(TableDataSelector.FLAGS); const lis = row.querySelectorAll(TableDataSelector.FLAGS);
return Array.from(lis).map(li => li.textContent || ''); return Array.from(lis).map(li => li.textContent || '');
} }
scrapeDescription(): string[] { getSchedule(row: HTMLTableRowElement): CourseSchedule {
const lines = document.querySelectorAll(CatalogDetailsSelector.COURSE_DESCRIPTION); const dayLines = row.querySelectorAll(TableDataSelector.SCHEDULE_DAYS);
return Array.from(lines) const hourLines = row.querySelectorAll(TableDataSelector.SCHEDULE_HOURS);
.map(line => line.textContent || '') const roomLines = row.querySelectorAll(TableDataSelector.SCHEDULE_ROOM);
.filter(Boolean);
}
scrapeSchedule(): CourseSchedule { if (dayLines.length !== hourLines.length) {
const days = this.row.querySelectorAll(TableDataSelector.SCHEDULE_DAYS);
const hours = this.row.querySelectorAll(TableDataSelector.SCHEDULE_HOURS);
const rooms = this.row.querySelectorAll(TableDataSelector.SCHEDULE_ROOM);
if (days.length !== hours.length) {
throw new Error('Schedule data is malformed'); throw new Error('Schedule data is malformed');
} }
// const schedule: = []; const sections: CourseSection[] = [];
// for (let i = 0; i < days.length; i++) {
// const day = days[i].textContent || '';
// const hour = hours[i].textContent || '';
// const room = rooms[i].textContent || '';
// schedule.push({ for (let i = 0; i < dayLines.length; i += 1) {
// day, const lineSections = CourseSchedule.parse(
// hour, dayLines[i].textContent || '',
// room, hourLines[i].textContent || '',
// }); roomLines[i].textContent || ''
// } );
// return schedule; sections.push(...lineSections);
}
return new CourseSchedule({
sections,
});
} }
} }