-
Notifications
You must be signed in to change notification settings - Fork 0
/
script_recursive_following.py
270 lines (228 loc) · 10.6 KB
/
script_recursive_following.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
import os
import os.path
import shutil
import sys
import time

import config
from script_base import ScriptBase
"""
usage:
-----
ScriptRecursiveFollowing(username=config.username,
password=config.password,
target_username='account-to-scrape',
root_folder='path/to/{(.*)_visited.csv files}',
following_folder='path/to/{(.*)_following.csv files}',
MAX_DEPTH=1).start()
working:
-------
1. logs in using self.username, and self.password
2. goes to target_username
3. creates {self.username}_{target_username}_visited.csv file at self.root_folder if it doesn't exist
4. starts collecting the links to the profiles being followed
5. stores the links in the file {target_username}_following.csv at self.following_folder
6. reads the {target_username}_following.csv and goes to each of those profiles
7. collects the links to their followings into respective (.*)_following.csv files
...
notes:
-----
- it's basically BFS
- the {self.username}_{target_username}_visited.csv stores the state of the BFS
- if the corresponding (.*)_following.csv file exists, then it skips scraping and goes to the next link
- MAX_DEPTH:
- 0 : collects only the followings of the target_username
- 1 : collects the followings of target_username, and the followings' followings.
...
"""
class ScriptRecursiveFollowing(ScriptBase):
    """
    Breadth-first scraper of Instagram "following" lists.

    script to:
    1. go to a target account and collect the links to accounts it is following
    2. filter out inaccessible accounts
    3. repeat (1) and (2) for every account in the filtered list, up to MAX_DEPTH levels

    State is persisted in ``{username}_{target_username}_visited.csv`` so an
    interrupted run can be resumed: ``__init__`` replays the file to rebuild
    the BFS queue/visited set, and ``start()`` continues from there.
    """

    def __init__(self, username, password, target_username, root_folder, following_folder=None, MAX_DEPTH=1):
        """
        Prepare (or restore) the BFS state.

        :param username: login account.
        :param password: login password.
        :param target_username: root profile of the BFS.
        :param root_folder: existing folder holding the visited-state file.
        :param following_folder: folder for the per-profile ``*_following.csv``
            files; defaults to ``root_folder/following`` (created if missing).
        :param MAX_DEPTH: how many levels beyond the target to expand
            (0 = only the target's followings).
        :raises Exception: if a required folder is missing or the visited
            file looks corrupted.
        """
        if not os.path.exists(root_folder):
            raise Exception(f'{root_folder} not found')
        ScriptBase.__init__(self)
        self.username = username
        self.password = password
        root_folder = os.path.normpath(root_folder)
        self.visited_file = os.path.join(root_folder, f'{username}_{target_username}_visited.csv')
        if following_folder is None:
            self.following_folder = os.path.join(root_folder, 'following')
            self.mkdir(self.following_folder)
        else:
            if os.path.exists(following_folder):
                self.following_folder = following_folder
            else:
                raise Exception('following_folder doesn\'t exist.')
        self.visited = set()            # usernames already enqueued/processed
        self.queue = []                 # BFS frontier (FIFO of usernames)
        self.profile = target_username  # username, as opposed to urls
        self.accessible = None          # tri-state: None = not yet determined
        self.scraped = None             # tri-state: None = not yet determined
        self.process_incomplete = True  # if the recursive scraping is incomplete
        self.depth = 0
        self.MAX_DEPTH = MAX_DEPTH
        # visited_file stores the state of BFS;
        # when read line by line, BFS needs to be simulated.
        #                        0        1      2          3          4           5
        visited_file_header = 'profile, posts, followers, following, accessible, scraped\n'
        if not os.path.exists(self.visited_file):
            # fresh run: write the header only
            with open(self.visited_file, 'w+') as fout:
                fout.write(visited_file_header)
        else:
            # Resume: replay the visited file through the same BFS the
            # scraper performs, so queue/visited/depth end up exactly where
            # the previous run stopped.
            #
            # BFS
            # ---
            # queue.push(start)
            # while queue is not empty:
            #     top = queue.pop()
            #     for child in top.children:
            #         if child not in visited:
            #             visited.push(child)
            #             queue.push(child)
            #     visited.push(top)
            visited_file = open(self.visited_file, 'r')
            if visited_file.readline() != visited_file_header:
                raise Exception('visited_file has header missing, possibly corrupted.')
            line = visited_file.readline()
            if line:
                self.break_line_up(line)  # fills self.profile, self.accessible, self.scraped
                terminate = False
                if self.scraped == True:
                    self.queue.insert(0, self.profile)
                    self.visited.add(self.profile)
                    while len(self.queue) > 0 and not terminate:
                        level_size = len(self.queue)
                        while level_size > 0 and not terminate:
                            level_size -= 1
                            top = self.queue.pop(0)
                            # each file line must correspond to the next BFS pop
                            if top != self.profile:
                                raise Exception('top not matching self.profile, visited_file possibly corrupted.')
                            if self.accessible and os.path.exists(self.get_following_file(top)):
                                with open(self.get_following_file(top), 'r') as fin:
                                    for line in fin:
                                        if self.depth < self.MAX_DEPTH:
                                            child = self.get_username(line.replace('\n', ''))
                                            if child not in self.visited:
                                                self.queue.append(child)
                                                self.visited.add(child)
                            line = visited_file.readline()
                            if not line:
                                terminate = True  # previous run stopped here
                            else:
                                self.break_line_up(line)  # fills self.profile, self.accessible, self.scraped
                                if self.scraped is False:
                                    terminate = True
                        if not terminate:
                            self.depth += 1  # finished a whole BFS level
                    if len(self.queue) == 0:
                        if terminate:
                            self.process_incomplete = False  # ie, the recursive script has been completed.
                        else:
                            raise Exception('queue empty without visited_file read completely, visited_file possibly corrupted.')
            visited_file.close()

    def start(self):
        """
        Log in and run (or resume) the BFS scrape, appending progress to
        ``self.visited_file`` and writing one ``*_following.csv`` per profile.
        """
        if not self.process_incomplete:
            print('Process was already completed.')
            # sys.exit instead of the site-module exit(): works under
            # `python -S` and in frozen builds.
            sys.exit(1)
        self.perform_login(self.username, self.password)
        if not self.scraped:
            # last logged profile was not finished — reprocess it
            self.queue.insert(0, self.profile)
            self.visited.add(self.profile)
        else:
            self.accessible = None
        while len(self.queue) > 0:
            level_size = len(self.queue)
            while level_size > 0:
                level_size -= 1
                self.profile = self.queue.pop(0)
                file_already_exists = False
                self.ii.go_to_account(self.profile)
                if self.ii.page_removed():
                    # deleted/banned account: log sentinel counts of -1
                    profile_header = {
                        'username' : '',
                        'posts' : -1,
                        'followers' : -1,
                        'following' : -1,
                        'given name' : '',
                        'bio' : '',
                        'website' : ''
                    }
                else:
                    profile_header = self.ii.get_header_info()
                if self.accessible is None:
                    # accessible not yet logged for this profile — write the
                    # first five CSV columns now
                    self.log(f'{self.profile}, {profile_header["posts"]}, {profile_header["followers"]}, {profile_header["following"]}')
                    if profile_header['following'] == -1:
                        self.accessible = False
                    else:
                        self.accessible = self.ii.is_following_list_accessible()
                    self.log(f', {self.accessible}')
                if self.accessible:
                    if not os.path.exists(self.get_following_file(self.profile)):
                        if profile_header['following'] > 0:
                            following_list = self.ii.get_following()
                            with open(self.get_following_file(self.profile), 'w+') as fout:
                                for following in following_list:
                                    fout.write(f'{following}\n')
                    else:
                        file_already_exists = True
                    # expand children from the (new or pre-existing) file
                    if profile_header['following'] > 0:
                        with open(self.get_following_file(self.profile), 'r') as fin:
                            for line in fin:
                                if self.depth < self.MAX_DEPTH:
                                    child = self.get_username(line.replace('\n', ''))
                                    if child not in self.visited:
                                        self.queue.append(child)
                                        self.visited.add(child)
                self.scraped = True
                self.log(f', {self.scraped}')
                self.log('\n')
                # reset the tri-state flags for the next profile
                self.accessible = None
                self.scraped = None
                if file_already_exists:
                    # the file came from the replayed state — back off a bit
                    # to avoid hammering the site while fast-forwarding
                    wait_for_secs = 30
                    print(f'{self.get_following_file(self.profile)} already exists.')
                    print(f'waiting {wait_for_secs} secs')
                    time.sleep(wait_for_secs)
            self.depth += 1

    def log(self, message):
        """Append *message* verbatim to the visited-state CSV."""
        with open(self.visited_file, 'a') as fout:
            fout.write(message)

    @staticmethod
    def mkdir(folderpath):
        """
        Create *folderpath* (with parents) if missing.

        Uses os.makedirs instead of ``os.system('mkdir ...')`` — portable and
        immune to shell injection via crafted path components.
        :return: True if the folder was created, False if it already existed.
        """
        if not os.path.exists(folderpath):
            os.makedirs(folderpath)
            return True
        return False

    @staticmethod
    def mv(from_p, to_p):
        """Move/rename a file. shutil.move replaces the injectable ``os.system('mv ...')``."""
        shutil.move(from_p, to_p)

    @staticmethod
    def get_profile_url(username):
        """Return the canonical profile URL for *username*."""
        return f'https://www.instagram.com/{username}/'

    @staticmethod
    def get_username(profile_url):
        """Inverse of get_profile_url: strip prefix and the trailing slash."""
        return profile_url[len('https://www.instagram.com/'):-1]

    def get_following_file(self, profile):
        """Path of the per-profile followings CSV inside following_folder."""
        return os.path.join(self.following_folder, f'{profile}_following.csv')

    def break_line_up(self, line):
        """
        Parse one visited-file row; fills self.profile, self.accessible, self.scraped.

        6 fields: complete row (accessible and scraped both logged).
        5 fields: partial row — the run died after logging accessible.
        :raises Exception: for any other field count.
        """
        splits = line.replace('\n', '').split(', ')
        self.profile = splits[0]
        if len(splits) == 6:
            # FIX: parse the accessible column instead of assuming True —
            # start() also writes scraped=True rows for inaccessible profiles.
            self.accessible = splits[4] == 'True'
            self.scraped = splits[5] == 'True'
        elif len(splits) == 5:
            self.accessible = splits[4] == 'True'
            self.scraped = None
        else:
            raise Exception('line incomplete, visited_file possibly corrupted.')