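"""
instagramcrawler.py

Selenium-based Instagram crawler. Depending on the crawl type, it scrapes
photo links (and optionally captions) from a user or hashtag page, or the
followers/following list of a user, and saves the results to disk.
"""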
from __future__ import division
import argparse
import codecs
from collections import defaultdict
import json
import os
import re
import sys
import time
try:
    # Python 2
    from urlparse import urljoin
    from urllib import urlretrieve
except ImportError:
    # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlretrieve
import requests
import selenium
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# HOST
HOST = 'http://www.instagram.com'
# SELENIUM CSS SELECTOR
CSS_LOAD_MORE = "a._1cr2e._epyes"
CSS_RIGHT_ARROW = "a[class='_de018 coreSpriteRightPaginationArrow']"
FIREFOX_FIRST_POST_PATH = "//div[contains(@class, '_8mlbc _vbtk2 _t5r8b')]"
TIME_TO_CAPTION_PATH = "../../../div/ul/li/span"
# FOLLOWERS/FOLLOWING RELATED
CSS_EXPLORE = "a[href='/explore/']"
CSS_LOGIN = "a[href='/accounts/login/']"
CSS_FOLLOWERS = "a[href='/{}/followers/']"
CSS_FOLLOWING = "a[href='/{}/following/']"
FOLLOWER_PATH = "//div[contains(text(), 'Followers')]"
FOLLOWING_PATH = "//div[contains(text(), 'Following')]"
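# NOTE: the selectors and XPaths above are tied to Instagram's page markup at
# the time this script was written; if the site's class names change, they
# will need to be updated accordingly.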
# JAVASCRIPT COMMANDS
SCROLL_UP = "window.scrollTo(0, 0);"
SCROLL_DOWN = "window.scrollTo(0, document.body.scrollHeight);"


class url_change(object):
    """
    Custom expected condition used for caption scraping: true once the
    driver's current URL differs from the URL recorded at construction time.
    """
    def __init__(self, prev_url):
        self.prev_url = prev_url

    def __call__(self, driver):
        return self.prev_url != driver.current_url


class InstagramCrawler(object):
    """
    Crawler class
    """
    def __init__(self, headless=True, firefox_path=None):
        if headless:
            print("headless mode on")
            self._driver = webdriver.PhantomJS()
        else:
            # Credit: https://github.com/SeleniumHQ/selenium/issues/3884#issuecomment-296990844
            binary = FirefoxBinary(firefox_path)
            self._driver = webdriver.Firefox(firefox_binary=binary)
        self._driver.implicitly_wait(10)
        self.data = defaultdict(list)

    def login(self, authentication=None):
        """
        authentication: path to a JSON file containing 'username' and 'password'
        """
        self._driver.get(urljoin(HOST, "accounts/login/"))

        if authentication:
            print("Username and password loaded from {}".format(authentication))
            with open(authentication, 'r') as fin:
                auth_dict = json.loads(fin.read())
            # Input username
            username_input = WebDriverWait(self._driver, 5).until(
                EC.presence_of_element_located((By.NAME, 'username'))
            )
            username_input.send_keys(auth_dict['username'])
            # Input password
            password_input = WebDriverWait(self._driver, 5).until(
                EC.presence_of_element_located((By.NAME, 'password'))
            )
            password_input.send_keys(auth_dict['password'])
            # Submit
            password_input.submit()
        else:
            print("Type your username and password by hand to login!")
            print("You have a minute to do so!")
            print("")

        # Wait until the Explore link appears, which indicates a successful login
        WebDriverWait(self._driver, 60).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, CSS_EXPLORE))
        )

    def quit(self):
        self._driver.quit()

    def crawl(self, dir_prefix, query, crawl_type, number, caption, authentication):
        print("dir_prefix: {}, query: {}, crawl_type: {}, number: {}, caption: {}, authentication: {}"
              .format(dir_prefix, query, crawl_type, number, caption, authentication))

        if crawl_type == "photos":
            # Browse target page
            self.browse_target_page(query)
            # Scroll down until the target number of photos is reached
            self.scroll_to_num_of_posts(number)
            # Scrape photo links
            self.scrape_photo_links(number, is_hashtag=query.startswith("#"))
            # Scrape captions if specified
            if caption is True:
                self.click_and_scrape_captions(number)

        elif crawl_type in ["followers", "following"]:
            # Need to login first before crawling followers/following
            print("You will need to login to crawl {}".format(crawl_type))
            self.login(authentication)
            # Then browse target page
            assert not query.startswith(
                '#'), "Hashtag does not have followers/following!"
            self.browse_target_page(query)
            # Scrape the followers/following list
            self.scrape_followers_or_following(crawl_type, query, number)
        else:
            print("Unknown crawl type: {}".format(crawl_type))
            self.quit()
            return

        # Save to directory
        print("Saving...")
        self.download_and_save(dir_prefix, query, crawl_type)

        # Quit driver
        print("Quitting driver...")
        self.quit()

    def browse_target_page(self, query):
        # Browse hashtag page
        if query.startswith('#'):
            relative_url = urljoin('explore/tags/', query.strip('#'))
        else:  # Browse user page
            relative_url = query

        target_url = urljoin(HOST, relative_url)
        self._driver.get(target_url)

    def scroll_to_num_of_posts(self, number):
        # Get the total number of posts from the page source
        num_info = re.search(r'\], "count": \d+',
                             self._driver.page_source).group()
        num_of_posts = int(re.findall(r'\d+', num_info)[0])
        print("posts: {}, number: {}".format(num_of_posts, number))
        number = number if number < num_of_posts else num_of_posts

        # Click "Load more", then scroll the page until enough posts are loaded
        loadmore = WebDriverWait(self._driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, CSS_LOAD_MORE))
        )
        loadmore.click()

        # The page initially shows 12 posts; estimate how many scrolls are needed
        num_to_scroll = int((number - 12) / 12) + 1
        for _ in range(num_to_scroll):
            self._driver.execute_script(SCROLL_DOWN)
            time.sleep(0.2)
            self._driver.execute_script(SCROLL_UP)
            time.sleep(0.2)

    def scrape_photo_links(self, number, is_hashtag=False):
        print("Scraping photo links...")
        encased_photo_links = re.finditer(r'src="([https]+:...[\/\w \.-]*..[\/\w \.-]*'
                                          r'..[\/\w \.-]*..[\/\w \.-].jpg)',
                                          self._driver.page_source)
        photo_links = [m.group(1) for m in encased_photo_links]
        print("Number of photo_links: {}".format(len(photo_links)))

        # On a user page, skip the first link (typically the profile picture)
        begin = 0 if is_hashtag else 1
        self.data['photo_links'] = photo_links[begin:number + begin]

    def click_and_scrape_captions(self, number):
        print("Scraping captions...")
        captions = []

        for post_num in range(number):
            sys.stdout.write("\033[F")
            print("Scraping captions {} / {}".format(post_num + 1, number))

            if post_num == 0:  # Click on the first post
                # Chrome
                # self._driver.find_element_by_class_name('_ovg3g').click()
                self._driver.find_element_by_xpath(
                    FIREFOX_FIRST_POST_PATH).click()

                if number != 1:
                    WebDriverWait(self._driver, 5).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, CSS_RIGHT_ARROW)
                        )
                    )

            elif number != 1:  # Click the right arrow to move to the next post
                url_before = self._driver.current_url
                self._driver.find_element_by_css_selector(
                    CSS_RIGHT_ARROW).click()

                # Wait until the next post has loaded (the URL changes)
                try:
                    WebDriverWait(self._driver, 10).until(
                        url_change(url_before))
                except TimeoutException:
                    print("Time out in caption scraping at number {}".format(post_num))
                    break

            # Parse caption
            try:
                time_element = WebDriverWait(self._driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "time"))
                )
                caption = time_element.find_element_by_xpath(
                    TIME_TO_CAPTION_PATH).text
            except NoSuchElementException:  # Post has no caption
                print("Caption not found in photo {}".format(post_num))
                caption = ""

            captions.append(caption)

        self.data['captions'] = captions

    def scrape_followers_or_following(self, crawl_type, query, number):
        print("Scraping {}...".format(crawl_type))
        if crawl_type == "followers":
            FOLLOW_ELE = CSS_FOLLOWERS
            FOLLOW_PATH = FOLLOWER_PATH
        elif crawl_type == "following":
            FOLLOW_ELE = CSS_FOLLOWING
            FOLLOW_PATH = FOLLOWING_PATH

        # Locate the followers/following link on the profile page
        follow_ele = WebDriverWait(self._driver, 5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, FOLLOW_ELE.format(query)))
        )

        # When no number is given, scrape the whole list
        if number == 0:
            number = int(''.join(filter(str.isdigit, str(follow_ele.text))))
            print("getting all {} items".format(number))

        # Open the desired list
        follow_ele.click()

        title_ele = WebDriverWait(self._driver, 5).until(
            EC.presence_of_element_located(
                (By.XPATH, FOLLOW_PATH))
        )
        List = title_ele.find_element_by_xpath(
            '..').find_element_by_tag_name('ul')
        List.click()

        # Scroll through the list until the target number of items is shown
        while len(List.find_elements_by_xpath('*')) < number:
            element = List.find_elements_by_xpath('*')[-1]
            # Workaround for now => should use Selenium's expected conditions!
            try:
                element.send_keys(Keys.PAGE_DOWN)
            except Exception:
                time.sleep(0.1)

        follow_items = []
        for ele in List.find_elements_by_xpath('*')[:number]:
            follow_items.append(ele.text.split('\n')[0])

        self.data[crawl_type] = follow_items

    def download_and_save(self, dir_prefix, query, crawl_type):
        # Name the directory after the query; mark hashtag queries explicitly
        dir_name = query.lstrip('#') + '.hashtag' if query.startswith('#') else query

        dir_path = os.path.join(dir_prefix, dir_name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        print("Saving to directory: {}".format(dir_path))

        # Save photos
        for idx, photo_link in enumerate(self.data['photo_links'], 0):
            sys.stdout.write("\033[F")
            print("Downloading {} images to {}".format(idx + 1, dir_path))
            # Filename
            _, ext = os.path.splitext(photo_link)
            filename = str(idx) + ext
            filepath = os.path.join(dir_path, filename)
            # Send image request
            urlretrieve(photo_link, filepath)

        # Save captions
        for idx, caption in enumerate(self.data['captions'], 0):
            filename = str(idx) + '.txt'
            filepath = os.path.join(dir_path, filename)
            with codecs.open(filepath, 'w', encoding='utf-8') as fout:
                fout.write(caption + '\n')

        # Save followers/following
        filename = crawl_type + '.txt'
        filepath = os.path.join(dir_path, filename)
        if len(self.data[crawl_type]):
            with codecs.open(filepath, 'w', encoding='utf-8') as fout:
                for fol in self.data[crawl_type]:
                    fout.write(fol + '\n')


def main():
    #  Arguments  #
    parser = argparse.ArgumentParser(description='Instagram Crawler')
    parser.add_argument('-d', '--dir_prefix', type=str,
                        default='./data/', help='directory to save results')
    parser.add_argument('-q', '--query', type=str, default='instagram',
                        help="target to crawl, add '#' for hashtags")
    parser.add_argument('-t', '--crawl_type', type=str,
                        default='photos', help="Options: 'photos' | 'followers' | 'following'")
    parser.add_argument('-n', '--number', type=int, default=0,
                        help='number of posts to download: integer')
    parser.add_argument('-c', '--caption', action='store_true',
                        help='add this flag to download captions along with photos')
    parser.add_argument('-l', '--headless', action='store_true',
                        help='if set, use the PhantomJS driver to run the script headless')
    parser.add_argument('-a', '--authentication', type=str, default=None,
                        help='path to authentication json file')
    parser.add_argument('-f', '--firefox_path', type=str, default=None,
                        help='path to Firefox installation')
    args = parser.parse_args()
    #  End Argparse #

    crawler = InstagramCrawler(headless=args.headless,
                               firefox_path=args.firefox_path)
    crawler.crawl(dir_prefix=args.dir_prefix,
                  query=args.query,
                  crawl_type=args.crawl_type,
                  number=args.number,
                  caption=args.caption,
                  authentication=args.authentication)


if __name__ == "__main__":
    main()
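
# Example invocations (a sketch; assumes PhantomJS is installed for headless
# mode, or Firefox plus a matching driver otherwise; 'some_user' and
# 'auth.json' are placeholders, with auth.json being a JSON file of the form
# {"username": "...", "password": "..."}):
#
#   python instagramcrawler.py -q '#food' -n 50 -c -l
#   python instagramcrawler.py -q some_user -t followers -n 100 -a auth.json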