import { Course, Instructor, Status, InstructionMode, ScrapedRow } from 'src/shared/types/Course';
import { CourseSchedule, CourseMeeting } from 'src/shared/types/CourseSchedule';
import { SiteSupport } from 'src/views/lib/getSiteSupport';

/**
 * The selectors that we use to scrape the course catalog list table
 * (https://utdirect.utexas.edu/apps/registrar/course_schedule/20239/results/?fos_fl=C+S&level=U&search_type_main=FIELD)
 */
enum TableDataSelector {
    COURSE_HEADER = 'td.course_header',
    UNIQUE_ID = 'td[data-th="Unique"]',
    REGISTER_URL = 'td[data-th="Add"] a',
    INSTRUCTORS = 'td[data-th="Instructor"] span',
    INSTRUCTION_MODE = 'td[data-th="Instruction Mode"]',
    STATUS = 'td[data-th="Status"]',
    SCHEDULE_DAYS = 'td[data-th="Days"]>span',
    SCHEDULE_HOURS = 'td[data-th="Hour"]>span',
    SCHEDULE_ROOM = 'td[data-th="Room"]>span',
    FLAGS = 'td[data-th="Flags"] ul li',
}

/**
 * The selectors that we use to scrape the course details page for an individual course
 * (https://utdirect.utexas.edu/apps/registrar/course_schedule/20239/52700/)
 */
enum DetailsSelector {
    COURSE_NAME = '#details h2',
    COURSE_DESCRIPTION = '#details p',
}

/**
 * A class that allows us to scrape information from UT's course catalog to create our internal
 * representation of a course.
 */
export class CourseCatalogScraper {
    support: SiteSupport;

    constructor(support: SiteSupport) {
        this.support = support;
    }

    /**
     * Pass in a list of HTML table rows and scrape every course from them.
     *
     * Header rows are not turned into courses; they only update the course name that is applied to
     * the rows that follow. Before the first header row is seen, the name is read from the course
     * details heading of the current document.
     *
     * @param rows the rows of the course catalog table
     * @returns an array of course row objects (each pairing a table row with the course scraped from it)
     * @throws Error if a non-header row is reached while no course name is known, or if any of the
     * per-row scrapers (status, unique id, schedule) rejects the row
     */
    public scrape(rows: NodeListOf<HTMLTableRowElement> | HTMLTableRowElement[]): ScrapedRow[] {
        const courses: ScrapedRow[] = [];

        // TODO: get semester from somewhere
        // The semester is guessed from today's date, so it is the same for every row: compute it once.
        const now = new Date();
        const year = now.getFullYear();
        const month = now.getMonth();
        let season = 'Fall';
        if (month >= 0 && month < 5) {
            season = 'Spring';
        } else if (month >= 5 && month < 8) {
            season = 'Summer';
        }

        let fullName = this.getFullName();

        rows.forEach(row => {
            if (this.isHeaderRow(row)) {
                fullName = this.getFullName(row);
                return;
            }

            // we are now ready to build the course object
            if (!fullName) {
                throw new Error('Course name not found');
            }

            // collapse runs of whitespace so the name can be split reliably
            fullName = fullName.replace(/\s\s+/g, ' ').trim();

            const [courseName, department, number] = this.separateCourseName(fullName);
            const [status, isReserved] = this.getStatus(row);

            const newCourse = new Course({
                fullName,
                courseName,
                department,
                number,
                status,
                isReserved,
                schedule: this.getSchedule(row),
                registerURL: this.getRegisterURL(row),
                url: this.getURL(row),
                flags: this.getFlags(row),
                uniqueId: this.getUniqueId(row),
                instructionMode: this.getInstructionMode(row),
                instructors: this.getInstructors(row),
                description: this.getDescription(document),
                // TODO: get semester from somewhere
                semester: {
                    year,
                    season,
                },
            });

            courses.push({
                element: row,
                course: newCourse,
            });
        });

        return courses;
    }

    /**
     * Separate the course name into its department, number, and name.
     *
     * The course number is taken to start at the first digit in the string and to end at the next
     * space. Everything before it is the department, everything after it is the name. A leading
     * dash separator in front of the name (as in the example below) is dropped.
     *
     * @example separateCourseName("CS 314H - Honors Discrete Structures") => ["Honors Discrete Structures", "CS", "314H"]
     * @example separateCourseName("CS 314H") => ["", "CS", "314H"]
     * @param courseFullName the full name of the course (e.g. "CS 314H - Honors Discrete Structures")
     * @returns an array of the course name, department, and number. If the string contains no digit,
     * the whole (trimmed) string is returned as the course name with an empty department and number.
     */
    separateCourseName(courseFullName: string): [courseName: string, department: string, number: string] {
        const numberStart = courseFullName.search(/\d/);
        if (numberStart === -1) {
            // no course number to split on
            return [courseFullName.trim(), '', ''];
        }

        const department = courseFullName.substring(0, numberStart).trim();

        const numberEnd = courseFullName.indexOf(' ', numberStart);
        if (numberEnd === -1) {
            // Nothing follows the number. Handled explicitly because substring(start, -1) would
            // swap its arguments and return the department instead of the number.
            return ['', department, courseFullName.substring(numberStart).trim()];
        }

        const number = courseFullName.substring(numberStart, numberEnd).trim();
        const courseName = courseFullName
            .substring(numberEnd)
            .trim()
            .replace(/^[-–—]\s+/, '');

        return [courseName, department, number];
    }

    /**
     * Scrape the Unique ID from the course catalog table row.
     *
     * @param row the row of the course catalog table
     * @returns the unique id of the course as a number (the cell text converted with `Number`)
     * @throws Error if the row has no unique id cell
     */
    getUniqueId(row: HTMLTableRowElement): number {
        const cell = row.querySelector(TableDataSelector.UNIQUE_ID);
        if (!cell) {
            throw new Error('Unique ID not found');
        }
        return Number(cell.textContent);
    }

    /**
     * Scrapes the individual URL for a given course that takes you to the course details page.
     *
     * @param row the row of the course catalog table
     * @returns the url of the link inside the unique id cell, or the current page's url when the
     * row has no such link
     */
    getURL(row: HTMLTableRowElement): string {
        const link = row.querySelector<HTMLAnchorElement>(`${TableDataSelector.UNIQUE_ID} a`);
        return link?.href || window.location.href;
    }

    /**
     * Scrape who is teaching the course from the course catalog table row with meta-data about their name.
     *
     * Each non-empty instructor span is expected to read "Last, First M". A name without a comma is
     * kept as the last name, with an empty first name and no middle initial.
     *
     * @param row the row of the course catalog table
     * @returns an array of instructors for the course
     */
    getInstructors(row: HTMLTableRowElement): Instructor[] {
        const spans = row.querySelectorAll(TableDataSelector.INSTRUCTORS);
        const names = Array.from(spans)
            .map(span => (span.textContent || '').trim())
            .filter(Boolean);

        return names.map(name => {
            // default `rest` so a name without a comma does not throw on `rest.split`
            const [lastName, rest = ''] = name.split(',').map(part => part.trim());
            const [firstName, middleInitial] = rest.split(' ');

            return {
                name,
                firstName,
                lastName,
                middleInitial,
            };
        });
    }

    /**
     * Whether or not this is a header row for a course within the course catalog list
     * (we can't scrape courses from header rows).
     *
     * @param row the row of the course catalog table
     * @returns true if this is a header row, false otherwise
     */
    isHeaderRow(row: HTMLTableRowElement): boolean {
        return row.querySelector(TableDataSelector.COURSE_HEADER) !== null;
    }

    /**
     * Scrape whether the class is being taught online, in person, or a hybrid of the two.
     *
     * @param row the row of the course catalog table
     * @returns 'Online' if the instruction mode cell mentions "internet", 'Hybrid' if it mentions
     * "hybrid", and 'In Person' otherwise (including when the cell is missing)
     */
    getInstructionMode(row: HTMLTableRowElement): InstructionMode {
        const text = (row.querySelector(TableDataSelector.INSTRUCTION_MODE)?.textContent || '').toLowerCase();

        if (text.includes('internet')) {
            return 'Online';
        }
        if (text.includes('hybrid')) {
            return 'Hybrid';
        }
        return 'In Person';
    }

    /**
     * Scrapes the description of the course from the course details page and separates it into an
     * array of cleaned up lines.
     *
     * @param document the document of the course details page to scrape
     * @returns the non-empty description paragraphs, each with whitespace runs collapsed and trimmed
     */
    getDescription(document: Document): string[] {
        const lines = document.querySelectorAll(DetailsSelector.COURSE_DESCRIPTION);
        return Array.from(lines)
            .map(line => (line.textContent || '').replace(/\s\s+/g, ' ').trim())
            .filter(Boolean);
    }

    /**
     * Get the full name of the course (e.g. "CS 314H - Honors Discrete Structures").
     *
     * @param row the header row of the course catalog table; when omitted, the name is read from
     * the course details heading of the current document instead
     * @returns the full name of the course, or an empty string if it could not be found
     */
    getFullName(row?: HTMLTableRowElement): string {
        if (!row) {
            return document.querySelector(DetailsSelector.COURSE_NAME)?.textContent || '';
        }
        const header = row.querySelector(TableDataSelector.COURSE_HEADER);
        return header?.textContent || '';
    }

    /**
     * When registration is open, the registration URL will show up in the course catalog table row
     * as a link. This will scrape it from the row.
     *
     * @param row the row of the course catalog table
     * @returns the registration URL for the course if it is currently displayed, undefined otherwise
     */
    getRegisterURL(row: HTMLTableRowElement): string | undefined {
        const link = row.querySelector<HTMLAnchorElement>(TableDataSelector.REGISTER_URL);
        return link?.href;
    }

    /**
     * Scrapes whether the course is open, closed, waitlisted, or cancelled.
     *
     * @param row the row of the course catalog table
     * @returns a tuple of the course status and whether the status text mentions "reserved"
     * @throws Error if the status cell is missing or empty, or if its text matches no known status
     */
    getStatus(row: HTMLTableRowElement): [status: Status, isReserved: boolean] {
        const cell = row.querySelector(TableDataSelector.STATUS);
        if (!cell) {
            throw new Error('Status not found');
        }

        const text = (cell.textContent || '').trim().toLowerCase();
        if (!text) {
            throw new Error('Status not found');
        }

        const isReserved = text.includes('reserved');

        if (text.includes('open')) {
            return [Status.OPEN, isReserved];
        }
        if (text.includes('closed')) {
            return [Status.CLOSED, isReserved];
        }
        if (text.includes('waitlisted')) {
            return [Status.WAITLISTED, isReserved];
        }
        if (text.includes('cancelled')) {
            return [Status.CANCELLED, isReserved];
        }
        throw new Error(`Unknown status: ${text}`);
    }

    /**
     * At UT, some courses have certain "flags" which aid in graduation. This will scrape the flags
     * from the course catalog table row.
     *
     * @param row the row of the course catalog table
     * @returns an array of flags for the course (the text of each flag list item)
     */
    getFlags(row: HTMLTableRowElement): string[] {
        const items = row.querySelectorAll(TableDataSelector.FLAGS);
        return Array.from(items).map(item => item.textContent || '');
    }

    /**
     * This will scrape all the time information from the course catalog table row and return it as
     * a CourseSchedule object, which represents all of the meeting times/places of the course.
     *
     * The i-th day, hour and room spans of the row are parsed together as one line of the schedule.
     * A missing room span is treated as an empty room rather than an error.
     *
     * @param row the row of the course catalog table
     * @returns a CourseSchedule object representing all of the meetings of the course
     * @throws Error if the row has a different number of day lines and hour lines
     */
    getSchedule(row: HTMLTableRowElement): CourseSchedule {
        const dayLines = row.querySelectorAll(TableDataSelector.SCHEDULE_DAYS);
        const hourLines = row.querySelectorAll(TableDataSelector.SCHEDULE_HOURS);
        const roomLines = row.querySelectorAll(TableDataSelector.SCHEDULE_ROOM);

        if (dayLines.length !== hourLines.length) {
            throw new Error('Schedule data is malformed');
        }

        const meetings: CourseMeeting[] = [];

        for (let i = 0; i < dayLines.length; i += 1) {
            const lineMeetings = CourseSchedule.parse(
                dayLines[i].textContent || '',
                hourLines[i].textContent || '',
                // the room count is not validated above, so guard against a missing room span
                roomLines[i]?.textContent || ''
            );
            meetings.push(...lineMeetings);
        }

        return new CourseSchedule({ meetings });
    }
}