Skip to content

Commit

Permalink
Finals Parser (#890)
Browse files Browse the repository at this point in the history
* Start of finals parser

* Continued creating finals parser

* Fixed some issues, began commenting

* Fixed a few more issues, reconsidered logging

* Improved formatting and fixed bug

Fixed AM/PM bug, other changes

* Converted to python script

* Updated with instructions on how to convert to .py

* Tested updated schedule

* Added logic for cross listed courses

* Added cross listing logic

* Added requirements.txt

* Fixed bug with \ entries

* Updated the .py to match

* remove idea folder

* dont redefine format

---------

Co-authored-by: dorian451 <[email protected]>
  • Loading branch information
becausej and dorian451 authored Nov 1, 2024
1 parent 9ea3e4c commit ffaac6f
Show file tree
Hide file tree
Showing 14 changed files with 2,069 additions and 1 deletion.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ node_modules/
.vscode
.DS_Store
.python-version
*.ipynb*
courses20.xml
.coverage
compose-dev.yaml
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6b64fad0-14d1-4489-937b-857e5ecd8ec2",
"metadata": {},
"source": [
"ASSUMPTIONS/DOCUMENTATION:\n",
"\n",
" - A null section value indicates that all sections share that exam date and time.\n",
" - Some ARCH courses list section as \"80\". No clue what this means.\n",
" - The process uses the FinalsBySubject.pdf document from the RPI website.\n",
" - This pdf should have columns Department, Course, Location, Date, and Grades Due (although the first and last don't matter)\n",
" - Each page should begin with a title line of the form \"Season Year\" (e.g. \"Fall 2024\"); any text after that is ignored\n",
" - If the above are not true, small modifications must be made to the process\n",
" - To handle inconsistent AM/PM labeling we assume that all exams begin at or after 8 AM and we assume all exams end at or before 10 PM\n",
" - The current process assumes the finals document is named finals_schedule.pdf and is in the same folder as this notebook\n",
" - The output is a csv file with format: ['Season', 'Year', 'Major', 'Course', 'Section', 'Start', 'End', 'Building', 'Room_Number']\n",
" - Use \"jupyter nbconvert --to script FinalsParser.ipynb\" to convert the .ipynb file to a .py\n",
"\n",
"TODO:\n",
"\n",
" - Make the Grades Due column not break the program - this can't be fixed without a more completely filled-out version of the exam schedule\n",
" - It is possible that this column is never filled in the publicly available version - meaning this isn't an issue."
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "c5584e3c-8c4d-4656-8699-383826a60509",
"metadata": {},
"outputs": [],
"source": [
"from pypdf import PdfReader\n",
"import os\n",
"from datetime import datetime\n",
"import pandas as pd\n",
"import re\n",
"import calendar\n",
"debug_mode = False"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "b89afdbc-6199-4ad8-bc03-467ea9a495ce",
"metadata": {},
"outputs": [],
"source": [
"# Construct a list of lowercase month names so a month's number can be looked up by its index (index 0 is an empty string)\n",
"months = list(calendar.month_name)\n",
"months = [x.lower() for x in months]\n",
"\n",
"# Turns a time, day of month, month, and year into one datetime object for the table\n",
"# Does this for both the start and end time for an exam\n",
"# This is used to get the start and end times for an exam\n",
"def handle_times(start_text, end_text, day, month, year):\n",
" # Regex to get the hour and minute as separate values from a string of the format HH:MM AM or HH:MM PM\n",
" start_nums = re.findall(r'\\d+', start_text)\n",
" start_nums = [int(x) for x in start_nums]\n",
" end_nums = re.findall(r'\\d+', end_text)\n",
" end_nums = [int(x) for x in end_nums]\n",
" # Instead of trying to track AM/PM we instead use the logic that exams only happen between 8AM - 9:30 PM and convert to military time\n",
" # This is done because RPI likes to have typos such as 8:00 M instead of 8:00 PM making the AM/PM values unreliable\n",
" if end_nums[0] <= 10:\n",
" end_nums[0] += 12\n",
" if start_nums[0] < 8:\n",
" start_nums[0] += 12\n",
" month_num = months.index(month.lower())\n",
" # Construct and return the datetime object\n",
" start_text = year + str(month_num) + day + str(start_nums[0]) + \":\" + str(start_nums[1])\n",
" end_text = year + str(month_num) + day + str(end_nums[0]) + \":\" + str(end_nums[1])\n",
" format = '%Y%m%d%H:%M'\n",
" start_time = datetime.strptime(start_text, format)\n",
" end_time = datetime.strptime(end_text, format)\n",
" return start_time, end_time"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "201b53c9-4435-4371-8463-cc06ec5a19dc",
"metadata": {},
"outputs": [],
"source": [
"def parser():\n",
" files = [f for f in os.listdir('.') if os.path.isfile(f)]\n",
" \n",
" reader = PdfReader(\"finals_schedule.pdf\")\n",
" number_of_pages = len(reader.pages)\n",
" \n",
" db_lines = []\n",
"\n",
" # Process the document page by page\n",
" for page in reader.pages:\n",
" text = page.extract_text(extraction_mode=\"layout\")\n",
" # Process the text to handle the following cases:\n",
" # - If the location is \"TBA\" we replace this with \"TBA TBA\" since location is Room RoomNumber\n",
" # - If the location is \"ONLINE\" we replace this with \"ONLINE NA\" for the same reason (Room=ONLINE,RoomNumber=NA) \n",
" # - If there are \"/\" in the text we remove surrounding spaces so they don't cause issues (multiple different issues)\n",
" # - Remove the word \"SECTIONS\" as it's superfluous and inconsistent\n",
" text = text.replace(\" / \", \"/\").replace(\"TBA\", \"TBA TBA\").replace(\"ONLINE\", \"ONLINE NA\").replace(\"(ALL \", \"(ALL\").replace(\"SECTIONS \", \"\")\n",
" # Split text into lines, get the season (Fall,Summer,Spring) and year (20__) from the first line, then remove the first three lines since they are header/blank\n",
" text = text.split('\\n')\n",
" for_year = text[0].split(\" \")\n",
" for_year = [x for x in for_year if x != '']\n",
" season = for_year[0]\n",
" year = for_year[1]\n",
" text.pop(0)\n",
" text.pop(0)\n",
" text.pop(0)\n",
" # Remove a fourth line for the first page only since it has the column headers\n",
" if \"DEPARTMENT\" in text[0] and \"COURSE\" in text[0]:\n",
" text.pop(0)\n",
" \n",
" # Now, parse the lines\n",
" for line in text:\n",
" \n",
" # Remove (in SQL syntax) anything like \"(NEEDS%)\" because a few random courses say (NEEDS 6 HR BLOCK) or something along those lines\n",
" while \"(NEEDS\" in line:\n",
" tmp = line[line.index(\"(NEEDS\"):]\n",
" line = line[0:line.index(\"(NEEDS\")] + line[line.index(\"(NEEDS\") + tmp.index(\")\") + 1:]\n",
" \n",
" # Clean up the line and remove department\n",
" line = line.strip()\n",
" line = line.split(\" \")\n",
" line = [x for x in line if x != '']\n",
" line.pop(0)\n",
" \n",
" # Look for the first number in the line - this will be the course code\n",
" first_num = -1\n",
" for i in range(len(line)):\n",
" if any(char.isdigit() for char in line[i]):\n",
" first_num = i\n",
" break\n",
" # Remove everything before the school code (ARCH, CSCI, etc)\n",
" for i in range(first_num - 1):\n",
" line.pop(0)\n",
" # Get major\n",
" major = line[0]\n",
" line.pop(0)\n",
" # Get the course codes\n",
" course_string = line[0]\n",
" courses = []\n",
" # If there are multiple course codes, separate them out\n",
" while \"/\" in course_string:\n",
" i = course_string.index(\"/\")\n",
" courses.append(course_string[0:i])\n",
" course_string = course_string[i + 1:len(course_string)]\n",
" courses.append(course_string)\n",
" line.pop(0)\n",
"\n",
" # Now the line is of the format:\n",
" # [SECTIONS IN VARIOUS FORMATS, BUILDING, ROOM, DAY OF WEEK, MONTH, DAY OF MONTH, '@', START TIME, '-', END TIME, GRADES DUE]\n",
"\n",
" # Start at the end of the line - this is because we don't know how many entries the SECTIONS will be in since doc is formatted inconsistently\n",
"\n",
" # End time\n",
" time2 = line[len(line) - 1]\n",
" line.pop(len(line) - 1)\n",
" line.pop(len(line) - 1)\n",
"\n",
" # Start time\n",
" time1 = line[len(line) - 1]\n",
" line.pop(len(line) - 1)\n",
" line.pop(len(line) - 1)\n",
"\n",
" # Day of month\n",
" day = line[len(line) - 1]\n",
" line.pop(len(line) - 1)\n",
"\n",
" # Month\n",
" month = line[len(line) - 1]\n",
" line.pop(len(line) - 1)\n",
"\n",
" # Day of week\n",
" weekday = line[len(line) - 1].replace(\",\", '')\n",
" line.pop(len(line) - 1)\n",
"\n",
" # Room\n",
" room = line[len(line) - 1]\n",
" line.pop(len(line) - 1)\n",
"\n",
" # Building\n",
" building = line[len(line) - 1]\n",
" line.pop(len(line) - 1)\n",
"\n",
"\n",
" # Split the major up if it is MATH/CSCI for example\n",
" majors = []\n",
" if '/' in major:\n",
" while '/' in major:\n",
" index = major.index('/')\n",
" majors.append(major[:index])\n",
" major = major[index+1:]\n",
" else:\n",
" majors.append(major)\n",
"\n",
" # Everything left is the sections\n",
" # Get the sections from the remainder and fix some formatting (take out of parens and remove commas and ampersands)\n",
" sections = [x.replace(\",\", \"\").replace(\"(\", \"\").replace(\")\", \"\") for x in line if x != ',' and x != '&']\n",
" # If an entry is info for all sections of a class, write that and skip the rest\n",
" all = False\n",
" done = False\n",
" for tmp_major in majors:\n",
" for section in sections:\n",
" if \"ALL\" in section:\n",
" start_time, end_time = handle_times(time1, time2, day, month, year)\n",
" db_lines.append([season, year, tmp_major, course, None, start_time, end_time, building, room])\n",
" all = True\n",
" if all:\n",
" done = True\n",
" continue\n",
" if done:\n",
" continue\n",
" \n",
" fixed_sections = []\n",
" # Create separate section entries for all sections within a range ([01-05] becomes [01,02,03,04,05])\n",
" for section in sections:\n",
" if '-' in section:\n",
" num1 = int(section[:section.index(\"-\")])\n",
" num2 = int(section[section.index(\"-\") + 1:])\n",
" sections.remove(section)\n",
" for i in range(num1, num2 + 1):\n",
" fixed_sections.append(i)\n",
" else:\n",
" fixed_sections.append(int(section))\n",
" sections = fixed_sections\n",
" # Adds all the entries into the array\n",
" for tmp_major in majors:\n",
" for section in sections:\n",
" for course in courses:\n",
" start_time, end_time = handle_times(time1, time2, day, month, year)\n",
" db_lines.append([season, year, tmp_major, course, int(section), start_time, end_time, building, room])\n",
" return db_lines"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "c100e052-1d8a-432c-8fc2-0086cfd5d334",
"metadata": {},
"outputs": [],
"source": [
"def display_and_write_csv(db_lines):\n",
" # Place into a pandas DataFrame (not needed but useful for testing & makes writing to csv easier)\n",
" df = pd.DataFrame(columns=('Season', 'Year', 'Major', 'Course', 'Section', 'Start', 'End', 'Building', 'Room_Number'))\n",
" for i in range(len(db_lines)):\n",
" df.loc[i] = db_lines[i]\n",
" # standardize datetimes\n",
" df['Start'] = pd.to_datetime(df['Start'])\n",
" df['End'] = pd.to_datetime(df['End']) \n",
" if debug_mode:\n",
" pd.set_option('display.max_rows', 500)\n",
" display(df)\n",
" # write to output csv\n",
" df.to_csv('out.csv')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "e54bdf1b-ba2d-42dc-afbf-39efdcbc02c0",
"metadata": {},
"outputs": [],
"source": [
"db_lines = parser()\n",
"display_and_write_csv(db_lines)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit ffaac6f

Please sign in to comment.