Files
UT-Registration-Plus/src/views/lib/CourseCatalogScraper.ts
2024-10-22 21:47:35 -05:00

380 lines
14 KiB
TypeScript

import type { InstructionMode, ScrapedRow, Semester, StatusType } from '@shared/types/Course';
import { Course, Status } from '@shared/types/Course';
import { CourseSchedule } from '@shared/types/CourseSchedule';
import Instructor from '@shared/types/Instructor';
import { getCourseColors } from '@shared/util/colors';
import type { SiteSupportType } from '@views/lib/getSiteSupport';
/**
 * CSS selectors used to scrape a single row of the course catalog list table
 * (https://utdirect.utexas.edu/apps/registrar/course_schedule/20239/results/?fos_fl=C+S&level=U&search_type_main=FIELD).
 *
 * Every selector is evaluated relative to a table row (`row.querySelector(...)` /
 * `row.querySelectorAll(...)`) by the methods of `CourseCatalogScraper`.
 */
const TableDataSelector = {
// Cell that marks a header row; its text is read as the course's full name (isHeaderRow / getFullName).
COURSE_HEADER: 'td.course_header',
// Cell whose text is converted to the numeric unique id (getUniqueId); getURL looks for an <a> inside it.
UNIQUE_ID: 'td[data-th="Unique"]',
// Anchor inside the "Add" cell; its href is returned by getRegisterURL when the anchor is present.
REGISTER_URL: 'td[data-th="Add"] a',
// Spans inside the "Instructor" cell; each non-empty span text is treated as one instructor name (getInstructors).
INSTRUCTORS: 'td[data-th="Instructor"] span',
// Cell whose text is searched for "internet" / "hybrid" (getInstructionMode).
INSTRUCTION_MODE: 'td[data-th="Instruction Mode"]',
// Cell whose text is searched for open / closed / waitlisted / cancelled / reserved (getStatus).
STATUS: 'td[data-th="Status"]',
// Direct-child spans of the "Days", "Hour" and "Room" cells; getSchedule pairs them up by index,
// one span per meeting.
SCHEDULE_DAYS: 'td[data-th="Days"]>span',
SCHEDULE_HOURS: 'td[data-th="Hour"]>span',
SCHEDULE_LOCATION: 'td[data-th="Room"]>span',
// List items inside the "Flags" cell; each item's text is one flag (getFlags).
FLAGS: 'td[data-th="Flags"] ul li',
// List items inside the "Core" cell; getCore drops the blank placeholder item before reading the text.
CORE_CURRICULUM: 'td[data-th="Core"] ul li',
} as const satisfies Record<string, string>;
/**
 * CSS selectors used to scrape the course details page for an individual course
 * (https://utdirect.utexas.edu/apps/registrar/course_schedule/20239/52700/).
 *
 * These are evaluated against a whole document rather than a table row.
 * `satisfies Record<string, string>` mirrors the check applied to `TableDataSelector`:
 * every value must be a string, while `as const` keeps the literal types.
 */
const DetailsSelector = {
    // Heading whose text is used as the course's full name when no header row is available.
    COURSE_NAME: '#details h2',
    // Paragraphs that make up the course description, one element per line.
    COURSE_DESCRIPTION: '#details p',
} as const satisfies Record<string, string>;
/**
 * Scrapes information from UT's course catalog pages to create our internal representation of a course.
 *
 * The scraper works on table rows of the catalog list page and falls back to the course details
 * page markup (`DetailsSelector`) for the course name and description.
 */
export class CourseCatalogScraper {
    /** The kind of supported page the scraper was created for. Stored as given; no method of this class reads it. */
    support: SiteSupportType;

    /** The document that the course name (when no header row is available) and description are read from. */
    doc: Document;

    /** The URL of the page being scraped. Used to derive the semester and as the fallback course URL. */
    url: string;

    /**
     * @param support the site support type of the page being scraped
     * @param doc the document to scrape, defaults to the global `document`
     * @param url the URL of the page being scraped, defaults to `window.location.href`
     */
    constructor(support: SiteSupportType, doc: Document = document, url: string = window.location.href) {
        this.support = support;
        this.doc = doc;
        this.url = url;
    }

    /**
     * Pass in a list of HTML table rows and scrape every course from them
     * @param rows the rows of the course catalog table
     * @param keepHeaders whether to keep the header rows (which contain the course name) in the output
     * @returns an array of course row objects (which contain courses corresponding to the HTML table row);
     * header rows that are kept have `course: null`
     * @throws Error if a non-header row is reached before any course name is known, or if any of the
     * per-row scrapers (`getStatus`, `getUniqueId`, `getSchedule`, `getSemester`) throws
     */
    public scrape(rows: NodeListOf<HTMLTableRowElement> | HTMLTableRowElement[], keepHeaders = false): ScrapedRow[] {
        const courses: ScrapedRow[] = [];

        // On a details page there are no header rows, so seed the name from the document itself.
        let fullName = this.getFullName();

        rows.forEach(row => {
            if (this.isHeaderRow(row)) {
                // A header row names the course that every following row (until the next header) belongs to.
                fullName = this.getFullName(row);
                if (keepHeaders) {
                    courses.push({
                        element: row,
                        course: null,
                    });
                }
                return;
            }

            // we are now ready to build the course object
            if (!fullName) {
                throw new Error('Course name not found');
            }

            // Collapse runs of whitespace so the name can be split on single spaces.
            fullName = fullName.replace(/\s\s+/g, ' ').trim();

            const [courseName, department, number] = this.separateCourseName(fullName);
            const [status, isReserved] = this.getStatus(row);

            const newCourse = new Course({
                fullName,
                courseName,
                department,
                number,
                status,
                isReserved,
                creditHours: this.getCreditHours(number),
                schedule: this.getSchedule(row),
                registerURL: this.getRegisterURL(row),
                url: this.getURL(row),
                flags: this.getFlags(row),
                uniqueId: this.getUniqueId(row),
                instructionMode: this.getInstructionMode(row),
                instructors: this.getInstructors(row),
                description: this.getDescription(this.doc),
                semester: this.getSemester(),
                scrapedAt: Date.now(),
                colors: getCourseColors('emerald', 500),
                core: this.getCore(row),
            });

            courses.push({
                element: row,
                course: newCourse,
            });
        });

        return courses;
    }

    /**
     * Separate the full course name into its name, department, and number.
     *
     * The department is everything before the first digit, the number is the run of characters from
     * that digit up to the next space, and the course name is everything after that space. Text after
     * the number is returned as-is (trimmed): a leading separator such as "- " is NOT removed.
     *
     * @example separateCourseName("C S 314H HONORS DISCRETE STRUCTURES") => ["HONORS DISCRETE STRUCTURES", "C S", "314H"]
     * @example separateCourseName("CS 314H") => ["", "CS", "314H"]
     * @example separateCourseName("Special Topics") => ["Special Topics", "", ""]
     * @param courseFullName the full name of the course (e.g. "C S 314H HONORS DISCRETE STRUCTURES")
     * @returns a tuple of the course name, department, and number
     */
    separateCourseName(courseFullName: string): [courseName: string, department: string, number: string] {
        const numberStart = courseFullName.search(/\d/);
        if (numberStart === -1) {
            // No course number at all: don't misreport the first word as the number.
            return [courseFullName.trim(), '', ''];
        }

        const department = courseFullName.substring(0, numberStart).trim();

        const numberEnd = courseFullName.indexOf(' ', numberStart);
        if (numberEnd === -1) {
            // Nothing follows the number. Passing -1 to substring() would be clamped to 0 and swap the
            // arguments, returning the department as the number, so handle this case explicitly.
            return ['', department, courseFullName.substring(numberStart).trim()];
        }

        const number = courseFullName.substring(numberStart, numberEnd).trim();
        const courseName = courseFullName.substring(numberEnd).trim();
        return [courseName, department, number];
    }

    /**
     * Gets how many credit hours the course is worth.
     *
     * The first character of the course number is taken as the base number of hours. A trailing
     * A or B halves it, and a trailing X, Y or Z divides it by three.
     *
     * @param courseNumber the course number (e.g. "314H")
     * @returns the number of credit hours the course is worth (NaN if the first character is not a digit)
     */
    getCreditHours(courseNumber: string): number {
        let creditHours = Number(courseNumber.split('')[0]);
        const lastChar = courseNumber.slice(-1);

        // eslint-disable-next-line default-case
        switch (lastChar) {
            case 'A':
            case 'B':
                creditHours /= 2;
                break;
            case 'X':
            case 'Y':
            case 'Z':
                creditHours /= 3;
                break;
        }

        return creditHours;
    }

    /**
     * Scrape the Unique ID from the course catalog table row
     * @param row the row of the course catalog table
     * @returns the unique id of the course as a number
     * @throws Error if the row has no unique id cell
     */
    getUniqueId(row: HTMLTableRowElement): number {
        const div = row.querySelector(TableDataSelector.UNIQUE_ID);
        if (!div) {
            throw new Error('Unique ID not found');
        }
        return Number(div.textContent);
    }

    /**
     * Scrapes the individual URL for a given course that takes you to the course details page
     * @param row the row of the course catalog table
     * @returns the url of the course details page for the course in the row, or the scraper's own
     * url when the unique id cell contains no link
     */
    getURL(row: HTMLTableRowElement): string {
        const div = row.querySelector<HTMLAnchorElement>(`${TableDataSelector.UNIQUE_ID} a`);
        return div?.href || this.url;
    }

    /**
     * Scrape who is teaching the course from the course catalog table row with meta-data about their name.
     *
     * Each name is split on the first comma into a last name and the rest; the rest is split on
     * spaces into a first name and a middle initial.
     *
     * @param row the row of the course catalog table
     * @returns an array of instructors for the course
     */
    getInstructors(row: HTMLTableRowElement): Instructor[] {
        const spans = row.querySelectorAll(TableDataSelector.INSTRUCTORS);
        const names = Array.from(spans)
            .map(span => span.textContent || '')
            .map(name => name.trim())
            .filter(Boolean);

        return names.map(fullName => {
            const [lastName, rest = ''] = fullName.split(',').map(s => s.trim());
            const [firstName, middleInitial] = rest.split(' ');

            return new Instructor({
                fullName,
                firstName,
                lastName,
                middleInitial,
            });
        });
    }

    /**
     * Whether or not this is a header row for a course within the course catalog list (we can't scrape courses from header rows)
     * @param row the row of the course catalog table
     * @returns true if this is a header row, false otherwise
     */
    isHeaderRow(row: HTMLTableRowElement): boolean {
        return row.querySelector(TableDataSelector.COURSE_HEADER) !== null;
    }

    /**
     * Scrape whether the class is being taught online, in person, or a hybrid of the two
     * @param row the row of the course catalog table
     * @returns 'Online' if the cell text mentions "internet", 'Hybrid' if it mentions "hybrid",
     * and 'In Person' otherwise (including when the cell is missing)
     */
    getInstructionMode(row: HTMLTableRowElement): InstructionMode {
        const text = (row.querySelector(TableDataSelector.INSTRUCTION_MODE)?.textContent || '').toLowerCase();

        if (text.includes('internet')) {
            return 'Online';
        }
        if (text.includes('hybrid')) {
            return 'Hybrid';
        }
        return 'In Person';
    }

    /**
     * Scrapes the description of the course from the course details page and separates it into an array of cleaned up lines
     * @param doc the document of the course details page to scrape
     * @returns an array of non-empty, whitespace-normalized lines of the course description
     */
    getDescription(doc: Document): string[] {
        const lines = doc.querySelectorAll(DetailsSelector.COURSE_DESCRIPTION);
        return Array.from(lines)
            .map(line => line.textContent || '')
            .map(line => line.replace(/\s\s+/g, ' ').trim())
            .filter(Boolean);
    }

    /**
     * Derives the semester from the scraper's url.
     *
     * The semester code is the fifth "/"-separated piece of the URL path (index 4 after splitting),
     * e.g. "20239" in ".../apps/registrar/course_schedule/20239/results/". Its first four characters
     * are the year and the rest is the season code: 2 = Spring, 6 = Summer, 9 = Fall.
     *
     * @returns the year, season, and raw code of the semester
     * @throws Error if the code is missing from the URL, if the year or season code is not a
     * non-zero number, or if the season code is not one of 2, 6 or 9
     */
    getSemester(): Semester {
        const { pathname } = new URL(this.url);
        const code = pathname.split('/')[4];
        if (!code) {
            throw new Error('Semester not found in URL');
        }

        const year = Number(code.substring(0, 4));
        const seasonCode = Number(code.substring(4, 6));
        if (!year || !seasonCode) {
            throw new Error('Invalid semester found in URL');
        }

        let season: Semester['season'];
        switch (seasonCode) {
            case 2:
                season = 'Spring';
                break;
            case 6:
                season = 'Summer';
                break;
            case 9:
                season = 'Fall';
                break;
            default:
                throw new Error('Invalid season code');
        }

        return {
            year,
            season,
            code,
        };
    }

    /**
     * Get the full name of the course (department, number and title in one string).
     * @param row the header row of the course catalog table; when omitted, the name is read from
     * the course details heading of the scraper's document instead
     * @returns the full name of the course, or an empty string if it could not be found
     */
    getFullName(row?: HTMLTableRowElement): string {
        if (!row) {
            return this.doc.querySelector(DetailsSelector.COURSE_NAME)?.textContent || '';
        }
        const div = row.querySelector(TableDataSelector.COURSE_HEADER);
        return div?.textContent || '';
    }

    /**
     * When registration is open, the registration URL will show up in the course catalog table row as a link. This will scrape it from the row.
     * @param row the row of the course catalog table
     * @returns the registration URL for the course if it is currently displayed, undefined otherwise
     */
    getRegisterURL(row: HTMLTableRowElement): string | undefined {
        const a = row.querySelector<HTMLAnchorElement>(TableDataSelector.REGISTER_URL);
        return a?.href;
    }

    /**
     * Scrapes whether the course is open, closed, waitlisted, or cancelled
     * @param row the row of the course catalog table
     * @returns a tuple of the course status and whether the status text mentions "reserved"
     * @throws Error if the status cell is missing or empty, or if its text matches no known status
     */
    getStatus(row: HTMLTableRowElement): [status: StatusType, isReserved: boolean] {
        const div = row.querySelector(TableDataSelector.STATUS);
        if (!div) {
            throw new Error('Status not found');
        }

        const text = (div.textContent || '').trim().toLowerCase();
        if (!text) {
            throw new Error('Status not found');
        }

        const isReserved = text.includes('reserved');

        if (text.includes('open')) {
            return [Status.OPEN, isReserved];
        }
        if (text.includes('closed')) {
            return [Status.CLOSED, isReserved];
        }
        if (text.includes('waitlisted')) {
            return [Status.WAITLISTED, isReserved];
        }
        if (text.includes('cancelled')) {
            return [Status.CANCELLED, isReserved];
        }
        throw new Error(`Unknown status: ${text}`);
    }

    /**
     * At UT, some courses have certain "flags" which aid in graduation. This will scrape the flags from the course catalog table row.
     * @param row the row of the course catalog table
     * @returns an array of flags for the course
     */
    getFlags(row: HTMLTableRowElement): string[] {
        const lis = row.querySelectorAll(TableDataSelector.FLAGS);
        return Array.from(lis).map(li => li.textContent || '');
    }

    /**
     * Get the list of core curriculum requirements the course satisfies
     * @param row the row of the course catalog table
     * @returns an array of core curriculum codes
     */
    getCore(row: HTMLTableRowElement): string[] {
        const lis = row.querySelectorAll(TableDataSelector.CORE_CURRICULUM);
        return (
            Array.from(lis)
                // ut schedule is weird and puts a blank core curriculum element even if there aren't any core requirements so filter those out
                .filter(li => li.getAttribute('title') !== ' core curriculum requirement')
                .map(li => li.textContent || '')
        );
    }

    /**
     * This will scrape all the time information from the course catalog table row and return it as a CourseSchedule object, which represents all of the meeting times/places of the course.
     *
     * The day, hour and room cells each hold one span per meeting; spans at the same index belong to
     * the same meeting. A missing room span is treated as an empty location.
     *
     * @param row the row of the course catalog table
     * @returns a CourseSchedule object representing all of the meetings of the course
     * @throws Error if the number of day entries differs from the number of hour entries
     */
    getSchedule(row: HTMLTableRowElement): CourseSchedule {
        const dayLines = row.querySelectorAll(TableDataSelector.SCHEDULE_DAYS);
        const hourLines = row.querySelectorAll(TableDataSelector.SCHEDULE_HOURS);
        const locLines = row.querySelectorAll(TableDataSelector.SCHEDULE_LOCATION);

        if (dayLines.length !== hourLines.length) {
            throw new Error('Schedule data is malformed');
        }

        const schedule = new CourseSchedule();

        for (let i = 0; i < dayLines.length; i += 1) {
            const dayText = dayLines[i]?.textContent || '';
            const hourText = hourLines[i]?.textContent || '';
            const locationText = locLines[i]?.textContent || '';

            schedule.meetings.push(CourseSchedule.parse(dayText, hourText, locationText));
        }

        return schedule;
    }
}