-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
117 lines (103 loc) · 4.34 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
from bs4 import BeautifulSoup
from time import sleep
from requests import session
import csv
def _clean_name(text):
    """Return *text* with every "/", "." and " " removed.

    Used to turn job titles and company names into file-name fragments.
    """
    return text.translate(str.maketrans("", "", "/. "))


def get_proformas(c):
    """Download each proforma linked from the JAF list page into placements20-21/.

    Fetches the JAF list, walks every table row after the first, and saves
    the ``div.text-center`` element of each linked page (prefixed with the
    Bootstrap stylesheet link) as ``placements20-21/<company>--<job>.html``.
    Files that already exist on disk are not downloaded again.

    Args:
        c: Object exposing ``get(url, headers=...)`` whose result has
            ``content`` and ``url`` attributes — presumably a logged-in
            ``requests`` session; confirm against the caller.

    Returns:
        None. Progress is reported on stdout.
    """
    print("getting proformas")
    sleep(1)
    res = c.get('http://placement.iitk.ac.in/jaf_list/', headers=base_headers)
    print(res)
    print(res.url)
    soup = BeautifulSoup(res.content, 'html5lib')
    # The first <tr> is dropped (presumably the table header row).
    profiles = soup.findAll('tr')[1:]
    count = len(profiles)
    print(f"total no. of profiles: {count}")

    last_path = None
    cnt = 1
    for i, profile in enumerate(profiles):
        cells = profile.findAll('td')
        job = _clean_name(cells[0].a.text)
        com_name = _clean_name(cells[1].text)

        # Consecutive rows that map to the same name get a numeric suffix
        # (1, 2, ...) so they do not overwrite each other.
        org_path = 'placements20-21/' + com_name + '--' + job + '.html'
        if org_path == last_path:
            file_path = ('placements20-21/' + com_name
                         + '--' + job + str(cnt) + '.html')
            cnt += 1
        else:
            file_path = org_path
            cnt = 1
        last_path = org_path

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        if os.path.exists(file_path):
            continue

        com_link = 'https://placement.iitk.ac.in' + cells[0].a['href']
        res = c.get(com_link, headers=profile_headers)
        soup = BeautifulSoup(res.content, 'html5lib')
        data = soup.find('div', attrs={'class': 'text-center'})
        if data is None:
            # find() returns None when the element is absent; skip this
            # profile instead of aborting the whole run on .prettify().
            print(f"\nno proforma content found at {com_link}, skipping")
            sleep(0.5)
            continue
        data = bootstrap_cdn + '\n' + data.prettify()

        # Overwrite the same terminal line with the running progress.
        print(f"\rprofile no. {i+1}/{count}", end='')
        # The context manager closes the file even if the write fails.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(data)
        sleep(0.5)
    print("\ndone")
def get_stats(c):
    """Download the stats table and write it to placements20-21_stats/stats.csv.

    Fetches the stats page, takes every ``<tr>`` of the first ``<tbody>``,
    and writes the text of each row's ``<td>`` cells as one CSV row beneath
    a fixed header line. An existing CSV file is overwritten.

    Args:
        c: Object exposing ``get(url, headers=...)`` whose result has
            ``content`` and ``url`` attributes — presumably a logged-in
            ``requests`` session; confirm against the caller.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        AttributeError: If the fetched page contains no ``<tbody>``
            (``find`` then returns ``None``).
    """
    print("downloading/updating stats")
    sleep(1)
    res = c.get('http://placement.iitk.ac.in/stats/', headers=base_headers)
    print(res)
    print(res.url)
    stats = BeautifulSoup(res.content, 'html5lib').find('tbody').findAll('tr')
    print(len(stats))

    file_path = 'placements20-21_stats/'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open('placements20-21_stats/stats.csv', 'w',
              newline='', encoding='utf-8') as stats_file:
        writer = csv.writer(stats_file)
        writer.writerow(["Name", "Roll no.", "Company Name",
                         "Designation", "Program", "Department"])
        writer.writerows(
            [entry.text for entry in stat.findAll('td')] for stat in stats
        )
    print("done")
# Browser-like request headers sent by get_proformas (JAF list page) and
# get_stats (stats page). Key order is kept as originally written.
# NOTE(review): "Accept-Encoding" advertises "br"; requests only decodes
# Brotli when the brotli/brotlicffi package is installed — confirm.
base_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "DNT": "1",
    "Host": "placement.iitk.ac.in",
    "Pragma": "no-cache",
    "Referer": "https://placement.iitk.ac.in/dashboard/",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
}
# Browser-like request headers sent by get_proformas when fetching an
# individual proforma page; the Referer points at the JAF list.
# Key order is kept as originally written.
# NOTE(review): "Accept-Encoding" advertises "br"; requests only decodes
# Brotli when the brotli/brotlicffi package is installed — confirm.
profile_headers = {
    "Host": "placement.iitk.ac.in",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Dest": "document",
    "Referer": "https://placement.iitk.ac.in/jaf_list/",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
}
# <link> tag for the Bootstrap 4.3.1 stylesheet; get_proformas prepends it
# to every saved proforma page. The pieces below concatenate to one string.
bootstrap_cdn = (
    '<link href = "https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"'
    ' rel = "stylesheet"'
    ' integrity = "sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T"'
    ' crossorigin = "anonymous" >'
)