crawler.py
'''
Web Crawler for Shopping.com
'''
import sys

from lxml import html
import requests


class featureCollect:
    '''Accumulates the number of products found across crawled result pages.'''

    def __init__(self):
        self.productTree = dict()
        self.count = 0

    def collect(self, url):
        '''Fetch one result page and count the products it lists.

        Returns 1 if the page contained products, 0 otherwise (including on
        request failure), so callers can use the return value to stop paging.
        '''
        headers = {'user-agent': 'Mozilla/5.0 (compatible; CrawlBot/2.1; +http://www.google.com/bot.html)'}
        try:
            page = requests.get(url, headers=headers)
        except requests.RequestException:
            # Treat a failed request like an empty page so the caller stops paging.
            return 0
        tree = html.fromstring(page.text)
        productList = tree.find_class("productName")
        count = len(productList)
        if count > 0:
            self.count += count
            return 1
        return 0


def main():
    url = "http://www.shopping.com/products"
    collector = featureCollect()
    args = len(sys.argv) - 1
    if args == 1:
        # One argument: the search keyword. Crawl every result page until a
        # page comes back with no products.
        url1 = url + '?KW=' + sys.argv[1]
        collector.collect(url1)
        next_page = 1
        x = 2
        while next_page == 1:
            # print('page ::: ', x - 1)
            url1 = url + '~PG-' + str(x) + '?KW=' + sys.argv[1]
            next_page = collector.collect(url1)
            x += 1
    elif args == 2:
        # Two arguments: a page number followed by the search keyword; crawl
        # only that single page.
        url1 = url + '~PG-' + sys.argv[1] + '?KW=' + sys.argv[2]
        collector.collect(url1)
    else:
        print('provide one article name as argument, or a page number followed by an article name, e.g.: 2 deo')
    print(':::::: Total product count ::::::')
    print(collector.count)


if __name__ == "__main__":
    main()
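
Beyond the command-line entry point, the collector can also be driven programmatically. The following is a minimal sketch, assuming the module is importable as crawler and that shopping.com still serves the same "~PG-" URL scheme and "productName" markup; the keyword 'deo' is only an illustrative query.

# Hypothetical usage sketch; 'deo' and the URL pattern are assumptions, not
# part of the original script's guarantees.
from crawler import featureCollect

collector = featureCollect()
collector.collect('http://www.shopping.com/products?KW=deo')  # first result page
page = 2
# collect() returns 1 while pages keep listing products, 0 once they stop.
while collector.collect('http://www.shopping.com/products~PG-%d?KW=deo' % page):
    page += 1
print('products found:', collector.count)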