JobScraper.py

import csv
import time

import requests
from bs4 import BeautifulSoup
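
# NOTE: requests and beautifulsoup4 are third-party packages; if they are
# missing, a typical install would be: pip install requests beautifulsoup4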


def write_csv(loc, info):
    """
    Write the collected job openings to a .csv file.
    """
    headers = ['Title', 'Company Name', 'Location', 'Date', 'Summary', 'Url']
    # Write the header row followed by one row per opening; 'w' mode avoids
    # duplicated headers on repeated runs, and newline='' prevents blank
    # rows on Windows
    with open(loc + '_openings.csv', 'w', newline='', encoding='utf-8') as csv_f:
        csv_p = csv.writer(csv_f, delimiter=',')
        csv_p.writerow(headers)
        csv_p.writerows(info)
    print(f'\n{loc}_openings.csv has been saved to your directory!\n')
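
# A minimal sketch of calling write_csv directly, with made-up values (the
# row layout must match the headers above):
#
#     write_csv('Bangalore', [
#         ['SDE', 'Acme Corp', 'Bangalore', '3 days ago',
#          'Build and ship things.', 'https://in.indeed.com/viewjob?jk=...'],
#     ])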


def job_scraper():
    """
    Scrape the required number of job openings posted for a given job title
    and location, and store all the associated information in a .csv file.
    """
    title = input("\nEnter job title: ").replace(" ", "+")
    loc = input("Enter job location: ").replace(" ", "+")
    num = int(input("Enter the number of job openings to obtain: "))
    url = f'https://in.indeed.com/jobs?q={title}&l={loc}'
    req_page = requests.get(url)
    job_array = []
    if req_page.status_code == 200:
        soup = BeautifulSoup(req_page.text, "html.parser")
        job_table = soup.find("td", id="resultsCol")
        count = 0
        flag = 1
        while flag:
            if job_table is None:
                # Results column not found (layout change?); stop gracefully
                break
            for job_card in job_table.find_all("div", class_="jobsearch-SerpJobCard"):
                # Getting the job title
                title_elem = job_card.find('a', class_='jobtitle turnstileLink')
                title = title_elem.text.strip()
                # Getting the company name
                company_details = job_card.find('div', class_='sjcl')
                company_name = company_details.find('span', class_='company')
                company_name = company_name.text.strip()
                # Getting the company location
                company_loc = company_details.find('span', class_='location')
                if company_loc is not None:
                    company_loc = company_loc.text.strip()
                else:
                    company_loc = loc
                # Getting the URL of the post
                link = job_card.find('a')['href']
                link = 'https://in.indeed.com' + link
                # Getting the date of the post
                date_elem = job_card.find('span', class_='date')
                date = date_elem.text.strip()
                # Getting the job summary (absent on some cards)
                summary_elem = job_card.find('div', class_='summary')
                summary = summary_elem.text.strip() if summary_elem else ''
                count += 1
                job_array.append([title, company_name, company_loc, date, summary, link])
                if count == num:
                    flag = 0
                    break
            # Stop before fetching another page once enough openings are collected
            if not flag:
                break
            # To go to the next page of results, if there is one
            pagination = soup.find("ul", class_="pagination-list")
            next_link = None
            if pagination is not None:
                for anchor in pagination.find_all('a'):
                    if anchor.get('aria-label') == 'Next':
                        next_link = anchor
                        break
            if next_link is not None:
                next_page_link = 'https://in.indeed.com' + next_link.attrs['href']
                time.sleep(2)  # brief pause between page requests
                req_page = requests.get(next_page_link)
                soup = BeautifulSoup(req_page.text, "html.parser")
                job_table = soup.find("td", id="resultsCol")
            else:
                flag = 0
        write_csv(loc, job_array)
    else:
        print('There seems to be a problem fetching the results. '
              'Check your inputs and connection, and try again.')


if __name__ == '__main__':
    job_scraper()
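
# An illustrative run (values are examples only):
#
#     Enter job title: data scientist
#     Enter job location: Bangalore
#     Enter the number of job openings to obtain: 25
#
#     Bangalore_openings.csv has been saved to your directory!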