// Source: UT-Registration-Plus/src/views/lib/CourseCatalogScraper.ts
// Snapshot: 2023-03-05 14:34:26 -06:00 (299 lines, 11 KiB, TypeScript)

import { Course, Instructor, Status, InstructionMode, ScrapedRow } from 'src/shared/types/Course';
import { CourseSchedule, CourseMeeting } from 'src/shared/types/CourseSchedule';
import { SiteSupport } from 'src/views/lib/getSiteSupport';
/**
* The selectors that we use to scrape the course catalog list table (https://utdirect.utexas.edu/apps/registrar/course_schedule/20239/results/?fos_fl=C+S&level=U&search_type_main=FIELD)
*/
enum TableDataSelector {
// Cell holding a course's full name; a row containing it is treated as a header row rather than a course row.
COURSE_HEADER = 'td.course_header',
// "Unique" column cell: its text is converted to the course's unique id, and an <a> inside it supplies the details URL.
UNIQUE_ID = 'td[data-th="Unique"]',
// Link inside the "Add" column; its href is used as the registration URL when present.
REGISTER_URL = 'td[data-th="Add"] a',
// One <span> per instructor inside the "Instructor" column.
INSTRUCTORS = 'td[data-th="Instructor"] span',
// "Instruction Mode" column cell; its text is searched for "internet" / "hybrid".
INSTRUCTION_MODE = 'td[data-th="Instruction Mode"]',
// "Status" column cell; its text is searched for open / closed / waitlisted / cancelled / reserved.
STATUS = 'td[data-th="Status"]',
// Direct-child <span>s of the "Days", "Hour" and "Room" cells; the scraper pairs them up by index, one meeting line each.
SCHEDULE_DAYS = 'td[data-th="Days"]>span',
SCHEDULE_HOURS = 'td[data-th="Hour"]>span',
SCHEDULE_ROOM = 'td[data-th="Room"]>span',
// One <li> per flag inside the "Flags" column.
FLAGS = 'td[data-th="Flags"] ul li',
}
/**
* The selectors that we use to scrape the course details page for an individual course (https://utdirect.utexas.edu/apps/registrar/course_schedule/20239/52700/)
*/
enum DetailsSelector {
// Heading inside #details; its text is used as the course's full name when no table row is supplied.
COURSE_NAME = '#details h2',
// Paragraphs inside #details; each non-empty one becomes one line of the course description.
COURSE_DESCRIPTION = '#details p',
}
/**
* A class that allows us to scrape information from UT's course catalog to create our internal representation of a course
*/
export class CourseCatalogScraper {
    support: SiteSupport;

    constructor(support: SiteSupport) {
        this.support = support;
    }

    /**
     * Pass in a list of HTML table rows and scrape every course from them.
     *
     * Header rows only update the "current" course name; every other row is turned into a Course that
     * uses the most recently seen name (or, before any header row, the name found on the details page).
     *
     * @param rows the rows of the course catalog table
     * @returns an array of course row objects (which contain courses corresponding to the HTML table row)
     * @throws Error if a course row is reached before any course name is known, or if one of the
     * per-row scrapers (status, unique id, schedule) rejects the row
     */
    public scrape(rows: NodeListOf<HTMLTableRowElement> | HTMLTableRowElement[]): ScrapedRow[] {
        const courses: ScrapedRow[] = [];
        let fullName = this.getFullName();

        // TODO: get semester from somewhere
        // Until then the semester is guessed from today's date: Jan-May => Spring, Jun-Aug => Summer,
        // otherwise Fall. A single Date is used so that year and month always come from the same instant.
        const now = new Date();
        const year = now.getFullYear();
        const month = now.getMonth();
        let season = 'Fall';
        if (month < 5) {
            season = 'Spring';
        } else if (month < 8) {
            season = 'Summer';
        }

        // The description is read from the document, not from the row, so it is the same for every row.
        const description = this.getDescription(document);

        rows.forEach(row => {
            if (this.isHeaderRow(row)) {
                fullName = this.getFullName(row);
                return;
            }
            // we are now ready to build the course object
            if (!fullName) {
                throw new Error('Course name not found');
            }
            fullName = fullName.replace(/\s\s+/g, ' ').trim();
            const [courseName, department, number] = this.separateCourseName(fullName);
            const [status, isReserved] = this.getStatus(row);

            const newCourse = new Course({
                fullName,
                courseName,
                department,
                number,
                status,
                isReserved,
                schedule: this.getSchedule(row),
                registerURL: this.getRegisterURL(row),
                url: this.getURL(row),
                flags: this.getFlags(row),
                uniqueId: this.getUniqueId(row),
                instructionMode: this.getInstructionMode(row),
                instructors: this.getInstructors(row),
                // each course gets its own copy so that one course's lines are never aliased by another
                description: [...description],
                // TODO: get semester from somewhere
                semester: {
                    year,
                    season,
                },
            });
            courses.push({
                element: row,
                course: newCourse,
            });
        });
        return courses;
    }

    /**
     * Separate the course name into its department, number, and name.
     *
     * The department is everything before the first digit, the number is the run of non-whitespace
     * characters starting at that digit, and the name is whatever follows (a leading "- " separator
     * is dropped). Missing parts are returned as empty strings; this method never throws.
     *
     * @example separateCourseName("CS 314H - Honors Discrete Structures") => ["Honors Discrete Structures", "CS", "314H"]
     * @example separateCourseName("C S 314H DATA STRUCTURES: HONORS") => ["DATA STRUCTURES: HONORS", "C S", "314H"]
     * @param courseFullName the full name of the course (e.g. "CS 314H - Honors Discrete Structures")
     * @returns an array of the course name, department, and number
     */
    separateCourseName(courseFullName: string): [courseName: string, department: string, number: string] {
        const numberStart = courseFullName.search(/\d/);
        if (numberStart === -1) {
            // No course number at all: the whole text can only be a name.
            return [courseFullName.trim(), '', ''];
        }
        const department = courseFullName.substring(0, numberStart).trim();
        const fromNumber = courseFullName.substring(numberStart);
        const numberLength = fromNumber.search(/\s/);
        if (numberLength === -1) {
            // Nothing follows the course number, so there is no name to extract.
            return ['', department, fromNumber];
        }
        const number = fromNumber.substring(0, numberLength);
        const courseName = fromNumber
            .substring(numberLength)
            .trim()
            .replace(/^-\s+/, '');
        return [courseName, department, number];
    }

    /**
     * Scrape the Unique ID from the course catalog table row
     * @param row the row of the course catalog table
     * @returns the unique id of the course as a number (NaN if the cell's text is not numeric)
     * @throws Error if the row has no "Unique" cell
     */
    getUniqueId(row: HTMLTableRowElement): number {
        const cell = row.querySelector(TableDataSelector.UNIQUE_ID);
        if (!cell) {
            throw new Error('Unique ID not found');
        }
        return Number(cell.textContent);
    }

    /**
     * Scrapes the individual URL for a given course that takes you to the course details page
     * @param row the row of the course catalog table
     * @returns the url of the course details page for the course in the row, or the current page's
     * URL when the row has no such link
     */
    getURL(row: HTMLTableRowElement): string {
        const anchor = row.querySelector<HTMLAnchorElement>(`${TableDataSelector.UNIQUE_ID} a`);
        // `||` (not `??`) on purpose: an empty href should also fall back to the current page.
        return anchor?.href || window.location.href;
    }

    /**
     * Scrape who is teaching the course from the course catalog table row with meta-data about their name.
     *
     * Names are expected as "Last, First M". A name without a comma (e.g. a single word) is kept as
     * the last name with an empty first name instead of failing the whole row.
     *
     * @param row the row of the course catalog table
     * @returns an array of instructors for the course
     */
    getInstructors(row: HTMLTableRowElement): Instructor[] {
        const spans = row.querySelectorAll(TableDataSelector.INSTRUCTORS);
        const names = Array.from(spans)
            .map(span => (span.textContent ?? '').trim())
            .filter(Boolean);

        return names.map(name => {
            const [lastName = '', rest = ''] = name.split(',').map(part => part.trim());
            // Split on any run of whitespace so that double spaces do not produce empty name parts.
            const [firstName = '', middleInitial] = rest.split(/\s+/);
            return {
                name,
                firstName,
                lastName,
                middleInitial,
            };
        });
    }

    /**
     * Whether or not this is a header row for a course within the course catalog list (we can't scrape courses from header rows)
     * @param row the row of the course catalog table
     * @returns true if this is a header row, false otherwise
     */
    isHeaderRow(row: HTMLTableRowElement): boolean {
        return row.querySelector(TableDataSelector.COURSE_HEADER) !== null;
    }

    /**
     * Scrape whether the class is being taught online, in person, or a hybrid of the two
     * @param row the row of the course catalog table
     * @returns 'Online' if the cell mentions "internet", 'Hybrid' if it mentions "hybrid", 'In Person' otherwise
     * (including when the cell is missing)
     */
    getInstructionMode(row: HTMLTableRowElement): InstructionMode {
        const text = (row.querySelector(TableDataSelector.INSTRUCTION_MODE)?.textContent ?? '').toLowerCase();
        if (text.includes('internet')) {
            return 'Online';
        }
        if (text.includes('hybrid')) {
            return 'Hybrid';
        }
        return 'In Person';
    }

    /**
     * Scrapes the description of the course from the course details page and separates it into an array of cleaned up lines
     * @param document the document of the course details page to scrape
     * @returns an array of lines of the course description (whitespace collapsed, empty lines dropped)
     */
    getDescription(document: Document): string[] {
        const paragraphs = document.querySelectorAll(DetailsSelector.COURSE_DESCRIPTION);
        return Array.from(paragraphs)
            .map(paragraph => (paragraph.textContent ?? '').replace(/\s\s+/g, ' ').trim())
            .filter(Boolean);
    }

    /**
     * Get the full name of the course (e.g. "CS 314H - Honors Discrete Structures").
     * @param row the header row of the course catalog table; when omitted the name is read from the
     * course details section of the current document instead
     * @returns the full name of the course, or an empty string if it could not be found
     */
    getFullName(row?: HTMLTableRowElement): string {
        if (!row) {
            return document.querySelector(DetailsSelector.COURSE_NAME)?.textContent ?? '';
        }
        return row.querySelector(TableDataSelector.COURSE_HEADER)?.textContent ?? '';
    }

    /**
     * When registration is open, the registration URL will show up in the course catalog table row as a link. This will scrape it from the row.
     * @param row the row of the course catalog table
     * @returns the registration URL for the course if it is currently displayed, undefined otherwise
     */
    getRegisterURL(row: HTMLTableRowElement): string | undefined {
        return row.querySelector<HTMLAnchorElement>(TableDataSelector.REGISTER_URL)?.href;
    }

    /**
     * Scrapes whether the course is open, closed, waitlisted, or cancelled
     * @param row the row of the course catalog table
     * @returns a tuple of the course status and whether the status text mentions "reserved"
     * @throws Error if the status cell is missing or empty, or if its text matches no known status
     */
    getStatus(row: HTMLTableRowElement): [status: Status, isReserved: boolean] {
        const cell = row.querySelector(TableDataSelector.STATUS);
        if (!cell) {
            throw new Error('Status not found');
        }
        const text = (cell.textContent ?? '').trim().toLowerCase();
        if (!text) {
            throw new Error('Status not found');
        }
        const isReserved = text.includes('reserved');
        if (text.includes('open')) {
            return [Status.OPEN, isReserved];
        }
        if (text.includes('closed')) {
            return [Status.CLOSED, isReserved];
        }
        if (text.includes('waitlisted')) {
            return [Status.WAITLISTED, isReserved];
        }
        if (text.includes('cancelled')) {
            return [Status.CANCELLED, isReserved];
        }
        throw new Error(`Unknown status: ${text}`);
    }

    /**
     * At UT, some courses have certain "flags" which aid in graduation. This will scrape the flags from the course catalog table row.
     * @param row the row of the course catalog table
     * @returns an array of flags for the course
     */
    getFlags(row: HTMLTableRowElement): string[] {
        const items = row.querySelectorAll(TableDataSelector.FLAGS);
        return Array.from(items).map(item => item.textContent ?? '');
    }

    /**
     * This will scrape all the time information from the course catalog table row and return it as a CourseSchedule object, which represents all of the meeting times/places of the course.
     *
     * Day, hour and room lines are paired up by index. A missing room line is treated as an empty
     * room rather than an error.
     *
     * @param row the row of the course catalog table
     * @returns a CourseSchedule object representing all of the meetings of the course
     * @throws Error if the number of day lines differs from the number of hour lines
     */
    getSchedule(row: HTMLTableRowElement): CourseSchedule {
        const dayLines = row.querySelectorAll(TableDataSelector.SCHEDULE_DAYS);
        const hourLines = row.querySelectorAll(TableDataSelector.SCHEDULE_HOURS);
        const roomLines = row.querySelectorAll(TableDataSelector.SCHEDULE_ROOM);

        if (dayLines.length !== hourLines.length) {
            throw new Error('Schedule data is malformed');
        }

        const meetings: CourseMeeting[] = [];
        Array.from(dayLines).forEach((dayLine, i) => {
            const lineMeetings = CourseSchedule.parse(
                dayLine.textContent ?? '',
                hourLines[i]?.textContent ?? '',
                // only day/hour counts are validated above, so the room line may legitimately be absent
                roomLines[i]?.textContent ?? ''
            );
            meetings.push(...lineMeetings);
        });
        return new CourseSchedule({ meetings });
    }
}