#importing python modules and libraries (this script targets Python 2: urllib2, reload/setdefaultencoding)
import urllib2
import re
import csv
from bs4 import BeautifulSoup
import sys
reload(sys)
#force utf-8 as the default string encoding so scraped text can be written without UnicodeEncodeError
sys.setdefaultencoding('utf8')
#url of the desired webpage
page_url = "http://www.healthcaremagic.com/premiumquestions"
#this code creates a file premiumQuestions.csv and stores all the information retrieved
file = open('premiumQuestions.csv','wb')
write = csv.writer(file,delimiter=",")
#helper that converts a BeautifulSoup string into a trimmed ascii string
def to_string(variable):
    variable = variable.encode('ascii','ignore')
    variable = variable.lstrip()
    variable = variable.rstrip()
    return variable
#this function retrieves the data from the given page and returns the url of the next page
def get_next_page(page_link):
    page = urllib2.urlopen(page_link)
    soup = BeautifulSoup(page,'lxml')
    content_div = soup.find("div",class_="mainContent")
    div = content_div.find("div", style="width:63%; float:left;")
    page.close()
    get_values(div)
    #this segment of code retrieves the url of the next page from the pagination links
    pagination = div.find("div",id="paginationDiv")
    page_li = pagination.find_all("a",class_="box")
    next_page_url = None
    for li in page_li:
        page_li_value = li.string
        page_li_value = to_string(page_li_value)
        page_li_url = li.get('href')
        page_li_url = to_string(page_li_url)
        #only the link labelled "Next" leads to the following page of results
        match = re.search(r'Next',page_li_value)
        if match:
            next_page_url = "http://www.healthcaremagic.com" + page_li_url
    return next_page_url
#this function retrieves the title, url and number for each category on the page and writes a row to the csv
def get_values(content):
    all_div = content.find_all("div",class_="queriesBox")
    for div in all_div:
        inner_div = div.find("div",class_="smallPQIcon")
        title = inner_div.a.string
        if title:
            title = to_string(title)
        url = "http://www.healthcaremagic.com" + inner_div.a.get('href')
        url = to_string(url)
        #the number is embedded in the category url as the first run of digits
        match = re.search(r'/([\d]+)',url)
        number = int(match.group(1))
        d = [title,url,number]
        write.writerow(d)
#this loop follows the "Next" links and extracts the required information from every page
continue_extraction = True
while continue_extraction:
    next_page = get_next_page(page_url)
    if next_page is None:
        continue_extraction = False
    else:
        page_url = next_page
#close the csv file so all buffered rows are flushed to disk
file.close()
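
#optional sanity check, a minimal sketch assuming the scrape above completed and
#wrote [title, url, number] rows; the helper name count_rows is illustrative and
#not part of the original script
def count_rows(csv_path):
    with open(csv_path,'rb') as f:
        return sum(1 for row in csv.reader(f,delimiter=","))

print "rows written to premiumQuestions.csv:", count_rows('premiumQuestions.csv')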