Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Webscape update #865

Merged
merged 10 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rpi_data/modules/course.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, info):
self.section = info[3]
self.credits = info[4]
self.name = info[5]
self.days = info[6]
self.days = info[6].strip()
self.stime = info[7]
self.etime = info[8]
self.max = info[9]
Expand Down
129 changes: 89 additions & 40 deletions rpi_data/modules/new_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@
basevalue += year * 100 #this makes the basevalue show our year
return basevalue

def sisCourseSearch(driver, term): #main loop of the parser, goes to the course search, selects the desired term, and then loops through each subject to grab the course tables
def sisCourseSearch(driver, term, course_codes_dict): #main loop of the parser, goes to the course search, selects the desired term, and then loops through each subject to grab the course tables
info = list()
course_codes_dict = findAllSubjectCodes(driver)

url = "https://sis.rpi.edu/rss/bwskfcls.p_sel_crse_search"
driver.get(url)
select = Select(driver.find_element(by=By.ID, value = "term_input_id")) # term selection dropdown
Expand All @@ -70,7 +70,7 @@
print("Getting course info")
courses = getCourseInfo(driver, key, course_codes_dict) # creates a list of course objects
with ThreadPoolExecutor(max_workers=50) as pool:
pool.map(getReqForClass, courses)
pool.map(getReqForClass, courses, course_codes_dict.keys())
[info.append(i) for i in courses] # appends each course to our final list
subject = info[len(info)-1].major # gets the subject we just parsed
driver.get(url) # goes back to the start
Expand Down Expand Up @@ -231,6 +231,7 @@
return info
#Some admin and grad courses won't have days of the week
#Also the backend doesn't like the days of the week being TBA
info[6] = info[6].strip('\xa0')
if (info[6] == '\xa0' or info[6] == "TBA"):
info[6] = ""
#Generally speaking, methods that affect info should come in the order that they affect elements, i.e.
Expand Down Expand Up @@ -284,6 +285,18 @@
c.addSchool("Interdisciplinary and Other")
courses.append(c)
return courses
# takes a raw phrase and returns a list of all of the course codes included, with repeats
def findCourseCodes(raw, subject_codes) -> list:
    """Return every course code (e.g. ``"CSCI 1100"``) found in ``raw``.

    A course code is a subject prefix from ``subject_codes`` followed by a
    single space and text starting with a digit. The result keeps repeats and
    is grouped by the iteration order of ``subject_codes``, then by position
    in ``raw``.

    Args:
        raw: Free-form requisite text to scan.
        subject_codes: Iterable of subject prefixes (e.g. ``"CSCI"``).

    Returns:
        List of matched course-code strings; empty if nothing matches.
    """
    course_codes = []
    for subject in subject_codes:
        # An empty prefix matches at every position and can never form a code.
        if not subject:
            continue
        width = len(subject)
        start = raw.find(subject)
        while start != -1:
            # Prefix + space + up to four characters of course number.
            text = raw[start:start + width + 5]
            # Slices (not indexes) so a prefix at the very end of ``raw``
            # is rejected instead of raising IndexError.
            if text[width:width + 1] == " " and text[width + 1:width + 2].isdigit():
                course_codes.append(text)
                start = raw.find(subject, start + len(text))
            else:
                # Skip only the prefix itself so a rejected match cannot
                # swallow a valid code that follows it.
                start = raw.find(subject, start + width)
    return course_codes
#Given a url for a course, as well as the course code and major, return a list of prereqs, coreqs, and description of the course
#Eg. ITWS 2110 - https://sis.rpi.edu/rss/bwckctlg.p_disp_course_detail?cat_term_in=202401&subj_code_in=ITWS&crse_numb_in=2110
# Prereqs - ITWS 1100
Expand All @@ -293,54 +306,84 @@
# The course uses a hands-on approach in which students actively develop Web-based software systems.
# Additional topics include installation, configuration, and management of Web servers.
# Students are required to have access to a PC on which they can install software such as a Web server and various programming environments.
def getReqFromLink(webres, subject_codes) -> list:
    """Extract requisites and the description from a course-detail response.

    Args:
        webres: Response for the course-detail page; only ``webres.content``
            is read.
        subject_codes: Iterable of subject prefixes, forwarded to
            ``findCourseCodes`` to recognise course codes.

    Returns:
        ``[prereqs, coreqs, raw, desc]`` where ``prereqs`` and ``coreqs`` are
        the ``str()`` of a sorted, de-duplicated list of course codes, ``raw``
        is the requisite text they were taken from, and ``desc`` is the course
        description ("" when the page has none).
    """
    empty = str([])
    page = webres.content
    soup = bs(page, "html.parser")
    body = soup.find('td', class_='ntdefault')
    if body is None:
        # No detail cell on the page: nothing to parse.
        return [empty, empty, "", ""]
    #The page is full of \n\n's for some reason, and this nicely splits it into sections.
    #Some \n's can make it into the parsed data, so we need to get rid of them.
    classInfo = [section.replace('\n', '') for section in body.text.strip('\n\n').split('\n\n')]
    key = "Prerequisites/Corequisites: "
    preKey = "Prerequisite"
    coKey = "Corequisite"
    extraKey = "Co-listed"
    creditKey = "Credit Hours"
    desc = classInfo[0]
    #If the course does not have a description, usually this means that classInfo[0] will be the credit value.
    #Done before any return so every path reports the description the same way;
    #the [:1] slice keeps an empty first section from raising.
    if desc.strip()[:1].isdigit():
        desc = ""
    # uses full so that we can just get all info
    full = "".join(classInfo).strip()
    # look for starting key
    if key in full:
        raw = full.split(key)[1].split(creditKey)[0]
    else:
        raw = full
    # NOTE(review): once narrowed, text that lists courses without the words
    # "Prerequisite"/"Corequisite" returns here, which makes the final else
    # branch below unreachable - confirm whether this check was meant to run
    # on `full` instead.
    if key not in raw and coKey not in raw and preKey not in raw:
        return [empty, empty, "", desc]
    #removes Prereq/Coreq starting keyphrase so we can focus on just coreqs, just prereqs, or both if it isn't distinguished
    raw = raw.replace(key, "")
    raw_prereqs = ""
    raw_coreqs = ""
    # checks if courses are prereqs, coreqs or both
    if preKey in raw and coKey in raw:
        if raw.find(coKey) < raw.find(preKey):
            raw_coreqs = raw.split(coKey)[1].split(preKey)[0]
            raw_prereqs = raw.split(preKey)[1]
        else:
            raw_prereqs = raw.split(preKey)[1].split(coKey)[0]
            raw_coreqs = raw.split(coKey)[1]
    elif preKey in raw:
        raw_prereqs = raw
    elif coKey in raw:
        raw_coreqs = raw
    else:
        raw_prereqs = raw
        raw_coreqs = raw
    #checks for co-listed courses to not include; split() returns the whole
    #string when the marker is absent, so no membership test is needed
    raw_prereqs = raw_prereqs.split(extraKey)[0]
    raw_coreqs = raw_coreqs.split(extraKey)[0]
    # look for course codes, take out repeats; sorted so the output is stable between runs
    prereqs = sorted(set(findCourseCodes(raw_prereqs, subject_codes)))
    coreqs = sorted(set(findCourseCodes(raw_coreqs, subject_codes)))
    # makes raw both prereqs and coreqs if they are different
    if raw_prereqs != raw_coreqs:
        raw = raw_prereqs + " " + raw_coreqs
    else:
        # keep raw a string: take the text before the co-listed marker
        raw = raw.split(extraKey)[0]
    retList = [str(prereqs), str(coreqs), raw, desc]
    return retList

Check notice on line 379 in rpi_data/modules/new_parse.py

View check run for this annotation

codefactor.io / CodeFactor

rpi_data/modules/new_parse.py#L310-L379

Complex Method
#Add the prereqs for a course to that course
def getReqForClass(course: Course, course_codes: list) -> None:
    """Fetch the catalog page for ``course`` and attach its requisites.

    Args:
        course: Course to update in place via ``course.addReqsFromList``.
        course_codes: The full collection of subject prefixes, forwarded to
            ``getReqFromLink``; it must be the whole collection, not a single
            prefix.
    """
    semester = getSemester(course)
    url = "https://sis.rpi.edu/rss/bwckctlg.p_disp_course_detail?cat_term_in={}&subj_code_in={}&crse_numb_in={}".format(semester, course.major, course.code)
    # Close the session when done so its connections are released; this runs
    # once per course from a thread pool.
    with requests.session() as session:
        webres = session.get(url)
    course.addReqsFromList(getReqFromLink(webres, course_codes))
#Given a course, return the basevalue of that course, eg 2024-01 is returned as 202401
def getSemester(course: Course) -> int:
dates = course.sdate.split("-")
Expand Down Expand Up @@ -369,20 +412,26 @@

# This main function is helpful for running the full parser standalone, without needing environmental variables.

if __name__ == "__main__":
    # Standalone run: parse one term and write the result to test.csv.
    options = Options()
    #options.add_argument("--no-sandbox")
    #options.add_argument("--disable-dev-shm-usage")
    #options.add_argument("--headless")
    #options.add_argument("--remote-debugging-port=9222")
    # NOTE(review): the previous call was `webdriver.Firefox(options==fp)`, a
    # comparison that passed a boolean, so the cookie preference below was
    # never applied. It is left disabled to keep the behavior that was
    # actually running; enable it only after confirming login still works
    # with cookies blocked.
    #options.set_preference("network.cookie.cookieBehavior", 2)
    driver = webdriver.Firefox(options=options)
    try:
        driver.implicitly_wait(1)
        course_codes_dict = findAllSubjectCodes(driver)
        login.login(driver)
        start = time.time()
        final = sisCourseSearch(driver, "spring2024", course_codes_dict)
        end = time.time()
        writeCSV(final, "test.csv")
        print("Total Elapsed: " + str(end - start))
    finally:
        # Always close the browser, and let any error propagate instead of
        # being swallowed by a bare except.
        driver.quit()

#main()

2 changes: 2 additions & 0 deletions src/web/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ node_modules
/dist
docs

.venv/

# local env files
.env.local
.env.*.local
Expand Down