#! /usr/bin/env python
# scrape_and_extract_apps
import sys
sys.path.append('../')
from common import utilities
reload(utilities)
import urllib2, BeautifulSoup, re, urlparse, datetime, traceback, time, os
ANDROID_MARKET_SCRAPE_CACHE_FOR_HTML='../cache/dummy_html/'
ANDROID_MARKET_SCRAPE_CACHE_FOR_RESOLVED_URLS='../cache/dummy_resolved_urls/'
download_from_web=True
read_from_cache=False
write_to_cache=False
downloaded_urls=[]
CATEGORY_L=[u'GAME', u'ARCADE', u'BRAIN', u'CARDS', u'CASUAL', u'GAME_WALLPAPER', u'RACING', u'SPORTS_GAMES', u'GAME_WIDGETS', u'APPLICATION', u'BOOKS_AND_REFERENCE', u'BUSINESS', u'COMICS', u'COMMUNICATION', u'EDUCATION', u'ENTERTAINMENT', u'FINANCE', u'HEALTH_AND_FITNESS', u'LIBRARIES_AND_DEMO', u'LIFESTYLE', u'APP_WALLPAPER', u'MEDIA_AND_VIDEO', u'MEDICAL', u'MUSIC_AND_AUDIO', u'NEWS_AND_MAGAZINES', u'PERSONALIZATION', u'PHOTOGRAPHY', u'PRODUCTIVITY', u'SHOPPING', u'SOCIAL', u'SPORTS', u'TOOLS', u'TRANSPORTATION', u'TRAVEL_AND_LOCAL', u'WEATHER', u'APP_WIDGETS']
def get_categories_old(u):
print 'get_categories: ' + u
print 'BeautifulSoup.__version__ ' + BeautifulSoup.__version__
page, _ = get(u)
soup = BeautifulSoup.BeautifulSoup(page, fromEncoding='utf-8')
rez=soup.findAll(attrs={"class":"top-nav-sub-item"}) #main-menu-sub-item-last
categories=[]
for r in rez:
cat_url=(r.findAll('a'))[0]['href']
m=re.match("/apps/(.*)", cat_url)
if m:
cat=m.group(1)
categories+=[cat]
return categories
def get_categories():
return CATEGORY_L
# caches results and returns cached values, depending on the read_from_cache/write_to_cache globals.
# returns a 2-tuple of the html behind this url (after any redirection) and whatever the url ultimately
# resolved to (if there was no redirection, this is just the original url, of course).
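# illustrative usage (the app id below is a placeholder, not a real package):
#   html, resolved_url = get('https://market.android.com/details?id=com.example.app')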
def get(url):
global download_from_web, read_from_cache, write_to_cache, downloaded_urls
cleaned_url=utilities._clean_for_path(url)
cleaned_url_minus_goog_redirect_prefix=re.compile(r'^http-www.google.comurlq').sub('',cleaned_url) # if link is a Google redirect link, remove the Google prefix part. note that the '&usg=' part remains.
if read_from_cache or write_to_cache:
html_cache_path=ANDROID_MARKET_SCRAPE_CACHE_FOR_HTML+cleaned_url_minus_goog_redirect_prefix
resolved_url_cache_path=ANDROID_MARKET_SCRAPE_CACHE_FOR_RESOLVED_URLS+cleaned_url_minus_goog_redirect_prefix
if read_from_cache and os.path.isfile(resolved_url_cache_path) and os.path.isfile(html_cache_path):
print 'cache hit - ' + (' previously cached.' if not (url in downloaded_urls) else '') + ' '+url+' file: '+cleaned_url_minus_goog_redirect_prefix
html=open(html_cache_path).read()
resolved_url=open(resolved_url_cache_path).read()
else:
if not download_from_web:
raise Exception('Cached file not present', 'We are doing an offline, cache-only processing job. If there is no file here it probably just means in the original scrape this request 404\'ed or something. Expected file in '+html_cache_path+' and '+ resolved_url_cache_path)
if read_from_cache:
print 'cache miss '+url
f = urllib2.urlopen(url,timeout=10)
html=f.read()
resolved_url = f.geturl()
# the call to got_url is after urlopen() and f.read() so that it won't be called if there is a 404 or other error on reading.
# print "hello philipp!"
# print html
got_url(url)
if write_to_cache:
html_cache=open(html_cache_path,'w')
html_cache.write(html)
resolved_url_cache=open(resolved_url_cache_path,'w')
resolved_url_cache.write(resolved_url)
return (html,resolved_url)
#todo: maybe move the per-summary processing into scrape_top (below)... but then it would all have to
#happen at once? well, it would only be one page at a time, which is a good thing.
#todo: the recursive 'try again' logic below really belongs in get() instead, with a default of
#attempts_remaining=0. the catch is that get() would then raise once the attempts run out, which this
#caller would have to catch and ignore; but honestly, a retrying get() would be really valuable in
#other places in the code too.
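# fetches one page of "top" results for a category and returns the list of per-app summary blocks
# (tags with class "snippet snippet-medium"); retries up to attempts_remaining more times on failure,
# sleeping seconds_between_attempts between tries.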
def scrape_top(url, attempts_remaining=2, seconds_between_attempts=5):
print 'scrape top ' + url
rez=[]
try:
page, _ = get(url)
soup = BeautifulSoup.BeautifulSoup(page, fromEncoding='utf-8')
rez=soup.findAll(attrs={"class":"snippet snippet-medium"})
except Exception as e:
print 'Exception when getting page of summaries. url='+url
print e
print e.args
if attempts_remaining>0:
print 'waiting '+str(seconds_between_attempts)+' seconds to try again. there are '+str(attempts_remaining)+' attempts remaining (including this next one.)'
time.sleep(seconds_between_attempts)
print 'trying again. '
rez=scrape_top(url,attempts_remaining=attempts_remaining-1,seconds_between_attempts=seconds_between_attempts) #recursive retry: the retry's result is assigned to rez here and handed back by the 'return rez' below
return rez
def got_url(url):
if url in downloaded_urls:
print 'url was already fetched this run, yet it was not served from the cache; this is not normal. '+url
downloaded_urls.append(url)
def _get_detail_url(summary):
soup = BeautifulSoup.BeautifulSoup(str(summary), fromEncoding='utf-8')
rez=soup.find(attrs={"class":"thumbnail"})
return rez['href']
def get_detail_url_absolute(summary):
return "http://market.android.com"+_get_detail_url(summary)
def get_standard_metadatum(parent_tag,heading_text):
heading_tag=parent_tag.find((lambda tag: tag.string == heading_text))
value_tag = heading_tag.nextSibling
return value_tag
def get_itemprop_soup(soup,itemprop_name,recur=False):
return soup.find(None,itemprop=itemprop_name,recursive=recur)
#by default these do not recur: with recur=False only one layer (direct children) is searched.
def get_itemprop_soup_l(soup,recur=False):
return soup.findAll(None,itemprop=re.compile('.+'),recursive=recur)
def get_itemprop_content(soup,itemprop_name,recur=False):
return get_itemprop_soup(soup,itemprop_name,recur)['content']
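# illustrative example of the itemprop helpers above (the markup here is made up, but mirrors the
# schema.org microdata the Market detail pages use):
#   snippet = BeautifulSoup.BeautifulSoup('<div itemprop="name" content="Example App"></div>')
#   get_itemprop_content(snippet, 'name')   # -> u'Example App'
# scrape_detail: pulls the app's metadata (name, logo, content rating, size, version, developer,
# rating, price, category, description, screenshots, install counts, ...) off its Market detail page
# and returns it as a dict.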
def scrape_detail(url):
app={}
app['email_contacts']=[]
app['logos']=[]
page, resolved_url = get(url)
soup = BeautifulSoup.BeautifulSoup(page, fromEncoding='utf-8')
app_info_list_entity=soup.find(None,itemtype="http://schema.org/MobileSoftwareApplication")
application_name=get_itemprop_content(app_info_list_entity,'name')
app['application_name']=application_name
image=get_itemprop_content(app_info_list_entity,'image')
app['logos'].append(image)
content_rating=get_itemprop_soup(app_info_list_entity,'contentRating', True)
app['content_rating']=content_rating.text
size=get_itemprop_soup(app_info_list_entity,'fileSize',True)
app['install_size']=size.text
version=get_itemprop_soup(app_info_list_entity,'softwareVersion',True)
app['version']=version.text
author=get_itemprop_soup(app_info_list_entity,'author',True)
author_name=get_itemprop_content(author,'name',True)
app['developer_name']=author_name
developer_page_url=get_itemprop_content(author,'url',True)
app['developer_page_url']='http://market.android.com'+developer_page_url
rating=get_itemprop_content(app_info_list_entity,'ratingValue',True)
app['rating']= rating
try:
rating_count=get_itemprop_soup(app_info_list_entity,'ratingCount',True)
app['rating_count']= rating_count.text
except:
app['rating_count']=0
price=get_itemprop_soup(app_info_list_entity,'price',True)
app['price']= price.text
updated=get_itemprop_soup(app_info_list_entity,'datePublished',True)
app['application_updated']= updated.text
# for a lot of these values, get_standard_metadatum is still useful! :)
requires_android=get_standard_metadatum(app_info_list_entity,'Requires Android:').string
app['requires_android']= requires_android
category_tag = get_standard_metadatum(app_info_list_entity,'Category:')
category_a=category_tag.find('a')
app['category']=category_a.string
app['description']=soup.find('div',attrs={"id":"doc-original-text"}).text
emails_in_description=get_emails(app['description'])
app['email_contacts']+=emails_in_description
# screenshots
ss_section=soup.find('div',attrs={"class": re.compile(".*doc-screenshot-section.*")})
try:
img_l=ss_section.findAll('img')
img_src_l=map(lambda img: img['src'], img_l)
app['screenshots']=img_src_l
except:
app['screenshots']=[]
# logos
logos_section=soup.find('div',attrs={"class":"doc-banner-icon"})
try:
img_l=logos_section.findAll('img')
img_src_l=map(lambda img: img['src'], img_l)
app['logos']=img_src_l
except:
app['logos']=[]
downloads_soup=get_itemprop_soup(soup,'numDownloads', True)
app['installs']=downloads_soup.contents[0].string
chart=downloads_soup.find(None, attrs={'class':'normalized-daily-installs-chart'})
try:
chart_img_src=chart.find('img')['src']
app['normalized_daily_installs_chart']=chart_img_src
except:
pass
return app
def get_emails(txt):
email_re='[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'
return re.findall(email_re, txt,re.IGNORECASE)
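# illustrative example:
#   get_emails('questions? email support@example.com')  # -> ['support@example.com']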
def get_emails_in_page(url):
html, _ = get(url)
#print _
emails=get_emails(html)
return list(set(emails))
def get_links_to_contact_page(url):
html,redirected_url = get(url)
contact_links_soup = BeautifulSoup.BeautifulSoup(html, fromEncoding='utf-8')
tags=contact_links_soup.findAll('a', href=re.compile('contact',re.IGNORECASE))
contact_links_with_desired_hrefs=[urlparse.urljoin(redirected_url,anchor['href']) for anchor in tags]
contact_text_links_soup = BeautifulSoup.BeautifulSoup(html, fromEncoding='utf-8')
tags=contact_links_soup.findAll('a', text=re.compile('contact',re.IGNORECASE))
contact_links_with_desired_anchor_text=[urlparse.urljoin(redirected_url,anchor_text.parent['href']) for anchor_text in tags]
return list(set(contact_links_with_desired_hrefs+contact_links_with_desired_anchor_text))
def get_twitter_handles(url):
html,_ = get(url)
twitter_handle_re='(^|\s)(@[A-Z0-9_]+)' # underscores are valid in Twitter handles
handles=re.findall(twitter_handle_re, html,re.IGNORECASE)
if handles:
handles=map(lambda x: x[1],handles)
handles=filter(lambda x: x not in ['@import','@param','@page'],handles)
#twitter_link_re='(http://twitter.com/[A-Z0-9]+)'
return list(set(handles))
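# illustrative: a page whose html contains ' follow us at @Example_Dev ' would yield ['@Example_Dev']
# (the regex captures the handle in its second group, and the map above keeps just that group).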
def get_dev_homesite_url(dev_page_url):
#<a href="/developer?pub=Glu+Mobile" class="doc-header-link">Glu Mobile</a>
html,_ =get(dev_page_url)
soup = BeautifulSoup.BeautifulSoup(html, fromEncoding="utf-8")
#<a href="http://www.google.com/url?q=http://www.glu.com/&usg=AFQjCNHZ-ipDlTho6lCB5-SdepXgLSDmjw" target="_blank">Visit Website for Glu Mobile</a>
rez=soup.find("a",href=re.compile("^http://www.google.com/url\?q"))
#print "rez="+str(rez)
return rez["href"]
#if this were Clojure I'd make a macro like 'with-try-catch(print-to-output)' that wraps its body in a
#try/except block like the ones below (e.g. the one that prints "error in getting contact links from the
#developer's site: "+u), except that in the exception block it would just print the values of args-to-output.
#args-to-output would of course be a list of symbols. or, better, a version that takes a list of such body
#blocks. anyway, the code below is fine for now. in Python I could probably build something similar with
#http://pypi.python.org/pypi/SymbolType
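#a minimal sketch of that idea in plain Python (illustrative only; _call_and_log_errors is a hypothetical
#helper and is not used elsewhere in this module):
def _call_and_log_errors(label, fn, *args, **kwargs):
    default = kwargs.pop('default', None)
    try:
        return fn(*args, **kwargs)
    except Exception as e:
        # print the label and the exception, swallow it, and fall back to the default,
        # mirroring the try/except blocks in scrape_dev_homesite below
        print "error in " + label
        print e
        return default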
def scrape_dev_homesite(u,allowable_recursion_depth=1):
#print "scrape_dev_homesite for " + u
app={}
if re.compile('.*twitter.com.*',re.IGNORECASE).match(u):
return {'twitter_contacts': [u]} # if the resolved_dev_homesite_u is a "twitter.com" profile, just use that url as the lone contact
if re.compile('mailto:.*',re.IGNORECASE).match(u):
return {'email_contacts': [u]} # if the resolved_dev_homesite_u is a "mailto" link, just use that url as the lone contact
if re.compile('.*facebook.com.*',re.IGNORECASE).match(u):
return {'contact links': [u]}
try:
app['email_contacts']=get_emails_in_page(u)
except Exception as e:
print "error in getting emails from the developer's site: "+u
print e
try:
app['contact links']=get_links_to_contact_page(u)
except Exception as e:
print "error in getting contact links from the developer's site: "+u
print e
try:
app['twitter_contacts'] = get_twitter_handles(u)
except Exception as e:
print "error in getting twitters from the developer's site: "+u
print e
if allowable_recursion_depth>0 and 'contact links' in app:
for contact_link in app['contact links']:
app_from_contact_page=scrape_dev_homesite(contact_link,allowable_recursion_depth=allowable_recursion_depth-1)
utilities.merge_dicts_add_values(app,app_from_contact_page)
return app
#normally the use of the cache is controlled by the 'offline' flag passed to inhale_market_data (see initialize_globals)
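# extract_app: scrape the detail page for one app, then follow its developer page to the developer's
# own site and merge in whatever contact info (emails, contact links, twitter handles) can be found there.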
def extract_app(detail_u):
app={}
app['unique_package']=get_unique_package(detail_u)
try:
utilities.merge_dicts_add_values(app,scrape_detail(detail_u))
except Exception as e:
print >> sys.stderr, "! ERROR IN GETTING DATA FROM THE APPLICATION'S DETAIL PAGE ON THE MARKET SITE: %s" % detail_u
print >> sys.stderr, e
traceback.print_exc()
app['developer_page_url']=detail_u
#print "app"
#print app # at this point there may be nothing on this dict except 'developer_page_url' (if this errored out above)
dev_page_u=app['developer_page_url']
try:
dev_homesite_u = get_dev_homesite_url(dev_page_u)
#print "dev_homesite_u= "+dev_homesite_u
resolved_dev_homesite_u=get(dev_homesite_u)[1]
app['developer_homepage_url']=resolved_dev_homesite_u
homesite_data=scrape_dev_homesite(resolved_dev_homesite_u)
utilities.merge_dicts_add_values(app,homesite_data)
except Exception as e:
print "exception part 2 on "+ dev_page_u + " . This may **or may not!** be the url that is having a problem. It could be that the dev homepage is the problem." #it's possible that dev_homesite_u won't be defined yet, tho unlikely. so i need to print this instead. todo: note that this continues on and returns app, which will have a few values. it might be worth looking to see . but if this broke when trying to get the dev_homesite_u, then that page won't be cached, thus a second run will probably get the page successfully, assuming it's an temporary server issue (or a connectivity issue on our side)
print e
print e.args
print "those are the e details"
return app
def get_unique_package(detail_u):
return utilities.url_param(detail_u,'id')
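# illustrative (assuming utilities.url_param pulls the named query parameter out of the url):
#   get_unique_package('https://market.android.com/details?id=com.example.app')  # -> 'com.example.app'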
def extract_and_populate_app(detail_u,scrape_timestamp,extraction_timestamp,category_scraped):
app=extract_app(detail_u)
app['scrape_timestamp']=scrape_timestamp
app['extraction_timestamp']=extraction_timestamp
app['category_scraped']=category_scraped
#app['paid']=is_paid
return app
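# fetches pages of 24 top-selling ('paid' or 'free') apps for the given category, starting at
# starting_page and stopping at max_ending_page or when a page comes back with no summaries,
# and extracts each app's detail page into a dict.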
def scrape_category_top_ranked(category_scraped,is_paid,scrape_timestamp,extraction_timestamp,starting_page,max_ending_page):
scraped_app_l=[]
count=None
i=starting_page
while i<max_ending_page and (count is None or count>0):
print "page #"+str(i)
print len(scraped_app_l)
start_time=datetime.datetime.now()
summaries=scrape_top('https://market.android.com/details?id=apps_topselling_'+ ('paid' if is_paid else 'free') +'&cat='+category_scraped+'&start='+str(i*24)+'&num=24')
print "and now summaries:"
for summary in summaries:
detail_u=get_detail_url_absolute(summary)
app=extract_and_populate_app(detail_u,scrape_timestamp,extraction_timestamp,category_scraped)
scraped_app_l.append(app)
i+=1
count=len(summaries)
end_time=datetime.datetime.now()
print 'elapsed time for page of data: '+str(end_time-start_time)
return scraped_app_l
# this scrapes, ranks, and inserts into the database
# note that category can also be a subcategory / subgenre. In fact I'm not positive that the Android market has any sort of hierarchical idea of categories. That may just be tacked onto the web UI.
# example: inhale_market_data('GAME',True,'/Users/herdrick/Dropbox/python/appbackr/cache/html_new/','/Users/herdrick/Dropbox/python/appbackr/cache/resolved_urls_new/')
# example: inhale_market_data('PUZZLE',False,'/Users/herdrick/Dropbox/python/appbackr/cache/html_new/','/Users/herdrick/Dropbox/python/appbackr/cache/resolved_urls_new/')
def inhale_market_data(category,paid,html_cache_path, resolved_urls_cache_path, scrape_date, extraction_date, database, offline=False,starting_page=0,max_ending_page=sys.maxint):
start_time=datetime.datetime.now()
print 'scrape start time:'+str(start_time)
if offline:
_download_from_web=False
_read_from_cache=True
_write_to_cache=False
else:
_download_from_web=True
_read_from_cache=True
_write_to_cache=True
initialize_globals(html_cache_path,resolved_urls_cache_path,_download_from_web=_download_from_web,_read_from_cache=_read_from_cache,_write_to_cache=_write_to_cache)
scrape_timestamp=int(time.mktime(scrape_date.timetuple()) * 1000)
extraction_timestamp=int(time.mktime(extraction_date.timetuple()) * 1000)
if not offline and (scrape_date != extraction_date):
raise Exception('this is supposed to be an online scrape, yet the extraction date differs from the scrape date.')
print 'scrape_date:' + str(scrape_date)
print 'scrape_timestamp:' + str(scrape_timestamp)
print 'extraction_date:' + str(extraction_date)
print 'extraction_timestamp:' + str(extraction_timestamp)
print 'html cache dir: '+html_cache_path
print 'resolved_urls_cache dir: '+resolved_urls_cache_path
app_l=scrape_category_top_ranked(category,paid,scrape_timestamp,extraction_timestamp,starting_page,max_ending_page)
# assign Android Market ranks; the apps were scraped in rank order, so just use that order.
for n in range(len(app_l)):
app_l[n]['market_rank']=n
# do something with the list of apps here. for example, persist them.
from android import db
reload(db)
print 'persisting apps. db=',database
print 'app_l',app_l
db.persist_apps(app_l,database=database)
print '<here is where apps would be persisted>' # comment this line out if the db.persist_apps call above is actually persisting your apps
if len(app_l)>0:
print 'scrape timestamp: '+str(app_l[0]['scrape_timestamp'])
print 'count of applications extracted '+str(len(app_l))
else:
print 'ERROR (or unexpected): Results count is zero!'
end_time=datetime.datetime.now()
print 'category scrape start time: '+str(start_time)
print 'category scrape end time: '+str(end_time)
print 'elapsed time: '+str(end_time-start_time)
def initialize_globals(_html_cache_path, _resolved_urls_cache_path, _download_from_web, _read_from_cache, _write_to_cache ):
global ANDROID_MARKET_SCRAPE_CACHE_FOR_HTML, ANDROID_MARKET_SCRAPE_CACHE_FOR_RESOLVED_URLS, downloaded_urls, download_from_web, read_from_cache, write_to_cache
ANDROID_MARKET_SCRAPE_CACHE_FOR_HTML=_html_cache_path
ANDROID_MARKET_SCRAPE_CACHE_FOR_RESOLVED_URLS=_resolved_urls_cache_path
download_from_web=_download_from_web
read_from_cache=_read_from_cache
write_to_cache=_write_to_cache
downloaded_urls=[]
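# illustrative invocation (the cache paths, dates, and database value below are placeholders, not values
# from the original project; database is passed straight through to android.db.persist_apps):
#   today = datetime.date.today()
#   inhale_market_data('GAME', True, '../cache/html/', '../cache/resolved_urls/',
#                      today, today, database='dummy_db')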