Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Max mohammadi #18

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions src/Cal Poly Graduate Program Scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import urllib3
import requests
import re
import pandas as pd
from tabulate import tabulate


def get_graduate_programs(url, timeout=30):
    """Scrape the course table of every graduate program linked from *url*.

    The page is searched for ``<div id="graduatetextcontainer">``. Every
    anchor that is a grandchild of that div is treated as a program link,
    followed with ``scrape_program_courses``, and its result is stored under
    the anchor's text.

    Args:
        url: Address of the catalog index page that lists the programs.
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        A CSV string (no index column) with one column per program name.
        Each cell holds what ``scrape_program_courses`` returned for one of
        that program's links.

    Raises:
        ValueError: If the page has no ``graduatetextcontainer`` div.
        requests.RequestException: If the index page cannot be fetched.
    """
    from urllib.parse import urljoin

    # These anchors live next to the program links but are section headings
    # or page navigation. Skipping them up front avoids three pointless
    # requests and the KeyError a later ``del`` would raise if one is absent.
    non_program_links = {'Masters Degrees', 'Graduate Certificates', 'Back to Top'}

    html_page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(html_page.text, "html.parser")

    # Graduate program names and links are found within a single div (by id)
    container = soup.find('div', {'id': 'graduatetextcontainer'})
    if container is None:
        raise ValueError(
            "No <div id='graduatetextcontainer'> found at " + str(url)
        )

    graduate_programs = {}
    for a in container.find_all('a', href=True):
        # Only anchors exactly two levels below the container are programs.
        if a.parent.parent is not container:
            continue

        # ``a.string`` is None when the anchor wraps nested tags; fall back
        # to the concatenated text so the program is not filed under None.
        if a.string is not None:
            name = str(a.string)
        else:
            name = a.get_text(strip=True)
        if not name or name in non_program_links:
            continue

        # urljoin resolves root-relative, relative and absolute hrefs alike.
        program_url = urljoin(url, a['href'])
        graduate_programs.setdefault(name, []).append(
            scrape_program_courses(program_url)
        )
        # TODO: also collect each program's general information via
        # get_general_information once that helper is implemented.

    # Wrapping each list in a Series pads shorter columns with NaN, so a
    # program name that appears more than once no longer breaks DataFrame
    # construction with a length-mismatch error.
    columns = {
        name: pd.Series(results)
        for name, results in graduate_programs.items()
    }
    return pd.DataFrame(columns).to_csv(None, index=False)

def scrape_program_courses(url, timeout=30):
    """Return the first table on a program page as a psql-style text grid.

    Args:
        url: Address of the program page to scrape.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        The first table that pandas can parse from the page, rendered by
        ``tabulate`` with ``tablefmt='psql'``; or the string
        ``"No table on this page"`` when the page cannot be fetched or
        yields no parsable table.
    """
    from io import StringIO

    try:
        html_page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(html_page.content, 'lxml')
        tables = soup.find_all('table')
        if not tables:
            return "No table on this page"

        # Join the tags' own markup instead of str()-ing the result list,
        # which would inject "[", ", " and "]" into the HTML. pandas has
        # deprecated raw markup strings, so pass a file-like object.
        markup = "".join(str(table) for table in tables)
        dataframes = pd.read_html(StringIO(markup))
        return tabulate(dataframes[0], headers='keys', tablefmt='psql')
    except Exception:
        # Best effort per page: any fetch or parse failure is reported as
        # "no table" so one bad page does not abort the whole scrape.
        # ``Exception`` (not a bare except) lets Ctrl-C and SystemExit
        # propagate.
        return "No table on this page"




def get_general_information(url):
    """Placeholder for scraping a graduate program's general information.

    Not implemented yet: *url* is ignored and ``None`` is returned.
    """
    return None


if __name__ == "__main__":
    # Script entry point: scrape the catalog index page and print the
    # resulting CSV text. The result is a string, so it is bound to a
    # descriptive name rather than shadowing the builtin ``dict``.
    programs_csv = get_graduate_programs("http://catalog.calpoly.edu/programsaz/#graduatetext")
    print(programs_csv)
60 changes: 60 additions & 0 deletions src/graduate_program_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tabulate import tabulate


def get_graduate_programs(url, timeout=30):
    """Scrape the course table of every graduate program linked from *url*.

    The page is searched for ``<div id="graduatetextcontainer">``. Every
    anchor that is a grandchild of that div is treated as a program link,
    followed with ``scrape_program_courses``, and its result is stored under
    the anchor's text.

    Args:
        url: Address of the catalog index page that lists the programs.
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        A CSV string (no index column) with one column per program name.
        Each cell holds what ``scrape_program_courses`` returned for one of
        that program's links.

    Raises:
        ValueError: If the page has no ``graduatetextcontainer`` div.
        requests.RequestException: If the index page cannot be fetched.
    """
    from urllib.parse import urljoin

    # These anchors live next to the program links but are section headings
    # or page navigation. Skipping them up front avoids three pointless
    # requests and the KeyError a later ``del`` would raise if one is absent.
    non_program_links = {'Masters Degrees', 'Graduate Certificates', 'Back to Top'}

    html_page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(html_page.text, "html.parser")

    # Graduate program names and links are found within a single div (by id)
    container = soup.find('div', {'id': 'graduatetextcontainer'})
    if container is None:
        raise ValueError(
            "No <div id='graduatetextcontainer'> found at " + str(url)
        )

    graduate_programs = {}
    for a in container.find_all('a', href=True):
        # Only anchors exactly two levels below the container are programs.
        if a.parent.parent is not container:
            continue

        # ``a.string`` is None when the anchor wraps nested tags; fall back
        # to the concatenated text so the program is not filed under None.
        if a.string is not None:
            name = str(a.string)
        else:
            name = a.get_text(strip=True)
        if not name or name in non_program_links:
            continue

        # urljoin resolves root-relative, relative and absolute hrefs alike.
        program_url = urljoin(url, a['href'])
        graduate_programs.setdefault(name, []).append(
            scrape_program_courses(program_url)
        )

    # Wrapping each list in a Series pads shorter columns with NaN, so a
    # program name that appears more than once no longer breaks DataFrame
    # construction with a length-mismatch error.
    columns = {
        name: pd.Series(results)
        for name, results in graduate_programs.items()
    }
    program_dataframe = pd.DataFrame(columns)

    return program_dataframe.to_csv(None, index=False)


def scrape_program_courses(url, timeout=30):
    """Return the first table on a program page as a psql-style text grid.

    Args:
        url: Address of the program page to scrape.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        The first table that pandas can parse from the page, rendered by
        ``tabulate`` with ``tablefmt='psql'``; or the string
        ``"No table on this page"`` when the page cannot be fetched or
        yields no parsable table.
    """
    from io import StringIO

    try:
        html_page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(html_page.content, 'lxml')
        tables = soup.find_all('table')
        if not tables:
            return "No table on this page"

        # Join the tags' own markup instead of str()-ing the result list,
        # which would inject "[", ", " and "]" into the HTML. pandas has
        # deprecated raw markup strings, so pass a file-like object.
        markup = "".join(str(table) for table in tables)
        dataframes = pd.read_html(StringIO(markup))
        return tabulate(dataframes[0], headers='keys', tablefmt='psql')
    except Exception:
        # Best effort per page: any fetch or parse failure is reported as
        # "no table" so one bad page does not abort the whole scrape.
        # ``Exception`` (not a bare except) lets Ctrl-C and SystemExit
        # propagate.
        return "No table on this page"


def get_general_information(url):
    """Stub helper for a graduate program's general information.

    Currently does nothing with *url* and returns ``None``.
    """
    return None


if __name__ == "__main__":
    # Script entry point: scrape the catalog index page and print the
    # resulting CSV text. The result is a string, so it is bound to a
    # descriptive name rather than shadowing the builtin ``dict``.
    programs_csv = get_graduate_programs("http://catalog.calpoly.edu/programsaz/#graduatetext")
    print(programs_csv)