Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactored out to function to allow multipages (#4) #5

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 71 additions & 17 deletions job-search-web-scraping.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,90 @@
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

INDEED_URL = "https://www.indeed.com/worldwide"
PATH_TO_DRIVER = "./geckodriver/geckodriver.exe"
WAIT_TIME = 5
PAGES = 10

def indeed_job_search():

PATH_TO_DRIVER = './geckodriver'

browser = webdriver.Firefox(executable_path=PATH_TO_DRIVER)
def initial_search(search_term, driver_path):
browser = webdriver.Firefox(executable_path=driver_path)

browser.get('https://www.indeed.com/worldwide')
browser.get(INDEED_URL)

browser.implicitly_wait(5)
browser.implicitly_wait(WAIT_TIME)

search_bar = browser.find_element_by_name('q')
search_bar.send_keys('machine learning')
search_bar = browser.find_element_by_name("q")
search_bar.send_keys(search_term)
search_bar.send_keys(Keys.ENTER)

browser.implicitly_wait(5)
browser.implicitly_wait(WAIT_TIME)
return browser

search_results = browser.find_elements_by_xpath('//h2/a')

file = open("job_search.txt", 'a')
def initialize_file(search_term):
file = open(f"job_search_{search_term.replace(' ','_')}.txt", "a")
file.write("\n")
return file

for job_element in search_results:

job_title = job_element.text
job_link = job_element.get_attribute('href')
def write_job(job_element, file):
job_title = job_element.text
job_link = job_element.get_attribute("href")

file.write("%s | link: %s \n" %(job_title, job_link))
file.write("%s | link: %s \n" % (job_title, job_link))


def get_jobs(browser):
return browser.find_elements_by_xpath("//h2/a")


def close_popup_if_present(browser):
try:

popup_cross = browser.find_elements_by_xpath(
"//button[contains(@class,'popover-x-button-close')]"
)
popup_cross[0].click()
return True
except:
return False


def clean_up(browser, file):
try:
browser.close()
except:
pass
file.close()


def indeed_job_search(search_term, pages=10, driver_path="./geckodriver"):
popup_encountered = False
browser = initial_search(search_term, driver_path)
file = initialize_file(search_term)
page_number = 1
while True:
if not popup_encountered:
time.sleep(WAIT_TIME)
popup_encountered = close_popup_if_present(browser)
search_results = get_jobs(browser)
[write_job(job_element, file) for job_element in search_results]
try:
browser.implicitly_wait(WAIT_TIME)
next_button = browser.find_element_by_xpath("//a[@aria-label='Next']")
if page_number == pages:
clean_up(browser, file)
next_button.click()
page_number += 1
except Exception:
clean_up(browser, file)
exit

browser.close()

if __name__ == "__main__":
indeed_job_search()
indeed_job_search("machine learning", PAGES, PATH_TO_DRIVER)