Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Webscape update #865

Merged
merged 10 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rpi_data/modules/course.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, info):
self.section = info[3]
self.credits = info[4]
self.name = info[5]
self.days = info[6]
self.days = info[6].strip()
self.stime = info[7]
self.etime = info[8]
self.max = info[9]
Expand Down
17 changes: 11 additions & 6 deletions rpi_data/modules/headless_login.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,6 @@
submit.click()
while len(driver.find_elements(By.XPATH, '/html/body/div/div/div[1]/div/div[2]/div[7]/a'))==0:
time.sleep(.1)
options = driver.find_element(By.XPATH, '/html/body/div/div/div[1]/div/div[2]/div[7]/a')
options.click()
while len(driver.find_elements(By.XPATH, '/html/body/div/div/div[1]/div/div[1]/ul/li[1]/a')) == 0:
time.sleep(.1)
duo_option = driver.find_element(By.XPATH, '/html/body/div/div/div[1]/div/div[1]/ul/li[1]/a')
duo_option.click()
while len(driver.find_elements(By.XPATH, '/html/body/div/div/div[1]/div/div[2]/div[3]')) == 0:
time.sleep(.1)
print("Your DUO code: "+ driver.find_element(by= By.XPATH, value = "/html/body/div/div/div[1]/div/div[2]/div[3]").text) # print the duo code
Expand All @@ -65,8 +59,19 @@
trust_button = driver.find_element(By.XPATH, '//*[@id="trust-browser-button"]') #find and click it
trust_button.click()
time.sleep(3)
while ("https://shib.auth.rpi.edu" in driver.current_url):
driver.get("https://sis.rpi.edu/rss/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu")
if (driver.current_url == "https://sis.rpi.edu/rss/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu"): # check if we're in the right place

Check notice on line 64 in rpi_data/modules/headless_login.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/headless_login.py#L64

Unnecessary "else" after "return", remove the "else" and de-indent the code inside it (no-else-return)
return "Success"
else:
print("login failed")
return "Failure"


if __name__ == "__main__":
options = Options()
options.add_argument('--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1')
driver = webdriver.Firefox(options=options)
driver.implicitly_wait(2)
login(driver)

Check notice on line 77 in rpi_data/modules/headless_login.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/headless_login.py#L77

Trailing newlines (trailing-newlines)
141 changes: 96 additions & 45 deletions rpi_data/modules/new_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import Select
import time
from bs4 import BeautifulSoup as bs
Expand Down Expand Up @@ -46,9 +46,9 @@
basevalue += year * 100 #this makes the basevalue show our year
return basevalue

def sisCourseSearch(driver, term): #main loop of the parser, goes to the course search, selects the desired term, and then loops through each subject to grab the course tables
def sisCourseSearch(driver, term, course_codes_dict): #main loop of the parser, goes to the course search, selects the desired term, and then loops through each subject to grab the course tables
info = list()
course_codes_dict = findAllSubjectCodes(driver)

url = "https://sis.rpi.edu/rss/bwskfcls.p_sel_crse_search"
driver.get(url)
select = Select(driver.find_element(by=By.ID, value = "term_input_id")) # term selection dropdown
Expand All @@ -70,8 +70,8 @@
print("Getting course info")
courses = getCourseInfo(driver, key, course_codes_dict) # creates a list of course objects
with ThreadPoolExecutor(max_workers=50) as pool:
pool.map(getReqForClass, courses)
pool.map(getReqForClass, courses, course_codes_dict.keys())
[info.append(i) for i in courses] # appends each course to our final list

Check notice on line 74 in rpi_data/modules/new_parse.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/new_parse.py#L74

Expression "[info.append(i) for i in courses]" is assigned to nothing (expression-not-assigned)
subject = info[len(info)-1].major # gets the subject we just parsed
driver.get(url) # goes back to the start
end = time.time()
Expand All @@ -94,7 +94,7 @@
soup = bs(html, 'html.parser')
ptag = soup.find_all('p') # Entire text of page basically
look_at = []
for all in ptag: # finds all things that are important

Check notice on line 97 in rpi_data/modules/new_parse.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/new_parse.py#L97

Redefining built-in 'all' (redefined-builtin)
if all.find('strong'):
look_at.append(all)
for all in look_at: # in every important part
Expand Down Expand Up @@ -201,7 +201,7 @@
for i in range(1, len(data) - 1, 1):
#Edge case where the registrar decides to make a column an inconsistent width.
#See MGMT 2940 - Readings in MGMT in spring 2024.
#TODO: Accommodate for colspans different than 2.

Check notice on line 204 in rpi_data/modules/new_parse.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/new_parse.py#L204

TODO: Accomodate for colspans different than 2. (fixme)
#See https://stackoverflow.com/questions/13263373/beautifulsoup-parsing-tag-table-html-especially-colspan-and-rowspan to start
if(data[i].has_attr("colspan")):
info.append("TBA")
Expand Down Expand Up @@ -231,6 +231,7 @@
return info
#Some admin and grad courses won't have days of the week
#Also the backend doesn't like the days of the week being TBA
info[6] = info[6].strip('\xa0')
if (info[6] == '\xa0' or info[6] == "TBA"):
info[6] = ""
#Generally speaking methods that affect info should come in the order that the affect elements, ie
Expand Down Expand Up @@ -284,6 +285,18 @@
c.addSchool("Interdisciplinary and Other")
courses.append(c)
return courses
# takes a raw phrase and returns a list of all of the course codes included, with repeats
def findCourseCodes(raw, subject_codes) -> list:
    """Extract every course code mentioned in a raw requirement phrase.

    Scans *raw* for each subject code (e.g. "CSCI") and slices out the
    9-character span "SUBJ NNNN" that should follow it. Matches that do not
    look like a real course code (no space after the subject, or no digit
    where the course number starts) are discarded. Repeats are kept.

    Args:
        raw: Free-form requirement text scraped from the catalog page.
        subject_codes: Iterable of subject codes; assumed to be 4 characters
            each (the fixed 9-char slice depends on this) -- TODO confirm.

    Returns:
        List of matched course-code strings, possibly with duplicates.
    """
    course_codes = []
    for subject in subject_codes:
        # Consume one occurrence per pass so repeated mentions are all found.
        while subject in raw:
            start = raw.find(subject)
            candidate = raw[start:start + 9]
            raw = raw[:start] + raw[start + 9:]
            # Length guard: a match near the end of the string yields a
            # truncated slice; indexing it blindly raised IndexError before.
            if len(candidate) < 6:
                continue
            if candidate[4] != " " or not candidate[5].isdigit():
                continue
            course_codes.append(candidate)
    return course_codes
#Given a url for a course, as well as the course code and major, return a list of prereqs, coreqs, and description of the course
#Eg. ITWS 2110 - https://sis.rpi.edu/rss/bwckctlg.p_disp_course_detail?cat_term_in=202401&subj_code_in=ITWS&crse_numb_in=2110
# Prereqs - ITWS 1100
Expand All @@ -293,54 +306,84 @@
# The course uses a hands-on approach in which students actively develop Web-based software systems.
# Additional topics include installation, configuration, and management of Web servers.
# Students are required to have access to a PC on which they can install software such as a Web server and various programming environments.
def getReqFromLink(webres, courseCode, major) -> list:

def getReqFromLink(webres, subject_codes) -> list:
page = webres.content
soup = bs(page, "html.parser")
body = soup.find('td', class_='ntdefault')
#The page is full of \n\n's for some reason, and this nicely splits it into sections
classInfo = body.text.strip('\n\n').split('\n\n')
for i in range(0,len(classInfo),1):
while '\n' in classInfo[i]:
#Some \n's can make it into the parsed data, so we need to get rid of them.
classInfo[i] = classInfo[i].replace('\n','')
key = "Prerequisites/Corequisites"
key = "Prerequisites/Corequisites: "
preKey = "Prerequisite"
prereqs = ""
coreqs = ""
coKey = "Corequisite"
extraKey = "Co-listed"
creditKey = "Credit Hours"
prereqs = []
coreqs = []
raw = ""
desc = classInfo[0]
# uses full so that we can just get all info
full = "".join(classInfo).strip()
# look for starting key
if (key in full):
raw = full.split(key)[1].split(creditKey)[0]
else:
raw = full
if (key not in raw and coKey not in raw and preKey not in raw):
return [str([]), str([]), "", desc]
#If the course does not have a description, usually this means that classInfo[0] will be the credit value.
if desc.strip()[0].isdigit():
desc = ""
for i in range(1, len(classInfo)):
if key in classInfo[i].strip():
combo = classInfo[i].strip()
combo = combo[len(key):]
coKey = "Corequisite"
if coKey in combo and preKey in combo:
coreqs = combo[combo.find(coKey) + len(coKey):]
prereqs = combo[len(preKey): combo.find(coKey)]
elif coKey in combo:
coreqs = combo[combo.find(coKey) + len(coKey):]
elif preKey in combo:
prereqs = combo[len(preKey):]
else:
#Default case where someone forgets the words we're looking for
#Note that there are still more edge cases(looking at you csci 6560 and 2110 in spring 2024)
prereqs = combo
prereqs = prereqs[prereqs.find(' '):255].strip()
coreqs = coreqs[coreqs.find(' '):255].strip()
if classInfo[i].strip() == (preKey + "s:"):
raw = classInfo[i+1].strip()
retList = [prereqs, coreqs, raw, desc]
#removes Prereq/Coreq starting keyphrase so we can focus on just coreqs, just prereqs, or both if it isn't distinguished
raw = raw.replace(key, "")
raw_prereqs = ""
raw_coreqs = ""
# checks if courses are prereqs, coreqs or both
if (preKey in raw and coKey in raw):
if (raw.find(coKey) < raw.find(preKey)):
raw_coreqs = raw.split(coKey)[1].split(preKey)[0]
raw_prereqs = raw.split(preKey)[1]
else:
raw_prereqs = raw.split(preKey)[1].split(coKey)[0]
raw_coreqs = raw.split(coKey)[1]
elif (preKey in raw):
raw_prereqs = raw
elif (coKey in raw):
raw_coreqs = raw
else:
raw_prereqs = raw
raw_coreqs = raw
#checks for co-listed courses to not include
if (extraKey in raw_prereqs):
raw_prereqs = raw_prereqs.split(extraKey)[0]

if (extraKey in raw_coreqs):
raw_prereqs = raw_coreqs.split(extraKey)[0]
# look for course codes
prereqs = findCourseCodes(raw_prereqs, subject_codes)
coreqs = findCourseCodes(raw_coreqs, subject_codes)
# take out repeats
prereqs = list(set(prereqs))
coreqs = list(set(coreqs))
# makes raw both prereqs and coreqs if they are different
if (raw_prereqs != raw_coreqs):
raw = raw_prereqs + " " + raw_coreqs
else:
if (extraKey in raw):
raw = raw.split(extraKey)
retList = [str(prereqs), str(coreqs), raw, desc]
return retList

Check notice on line 379 in rpi_data/modules/new_parse.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/new_parse.py#L310-L379

Complex Method
#Add the prereqs for a course to that course
def getReqForClass(course: Course) -> None:
def getReqForClass(course: Course, course_codes: list) -> None:
semester = getSemester(course)
url = "https://sis.rpi.edu/rss/bwckctlg.p_disp_course_detail?cat_term_in={}&subj_code_in={}&crse_numb_in={}".format(semester, course.major, course.code)
session = requests.session()
webres = session.get(url)
course.addReqsFromList(getReqFromLink(webres, course.code, course.major))
course.addReqsFromList(getReqFromLink(webres, course_codes))
#Given a course, return the basevalue of that course, eg 2024-01 is returned as 202401
def getSemester(course: Course) -> int:
dates = course.sdate.split("-")
Expand Down Expand Up @@ -369,20 +412,28 @@

# This main function is helpful for running the full parser standalone, without needing environmental variables.

def main():
if __name__ == "__main__":
options = Options()
#options.add_argument("--no-sandbox")
#options.add_argument("--disable-dev-shm-usage")
#options.add_argument("--headless")
#options.add_argument("--remote-debugging-port=9222")
driver = webdriver.Firefox()
driver.implicitly_wait(2)
login.login(driver)
start = time.time()
final = sisCourseSearch(driver, "spring2024")
end = time.time()
writeCSV(final, "test.csv")
print("Total Elapsed: " + str(end - start))
fp = webdriver.FirefoxProfile()
# fp.set_preference("network.cookie.cookieBehavior", 2)
fp.set_preference(
"general.useragent.override",
"Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0",
)
options.profile = fp
driver = webdriver.Firefox(options)
driver.delete_all_cookies()
try:
driver.implicitly_wait(2)
course_codes_dict = findAllSubjectCodes(driver)
login.login(driver)
start = time.time()
final = sisCourseSearch(driver, "spring2024", course_codes_dict)
end = time.time()

Check notice on line 432 in rpi_data/modules/new_parse.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/new_parse.py#L432

No exception type(s) specified (bare-except)
writeCSV(final, "test.csv")
print("Total Elapsed: " + str(end - start))
driver.quit()

Check notice on line 435 in rpi_data/modules/new_parse.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/new_parse.py#L435

Trailing newlines (trailing-newlines)
except:
driver.quit()

#main()

2 changes: 2 additions & 0 deletions src/web/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ node_modules
/dist
docs

.venv/

# local env files
.env.local
.env.*.local
Expand Down