# groupme-fetch.py
from datetime import datetime
import argparse
import json
import os
import sys

# Python 2 hack: default string encoding to utf-8 so message bodies with
# non-ASCII text can be written out without explicit encoding everywhere.
reload(sys)
sys.setdefaultencoding("utf-8")

import requests


def onRequestError(response):
    """Dump the failed HTTP response and exit."""
    print(response.status_code)
    print(response.headers)
    print(response.text)
    sys.exit(2)


def main():
    """
    Writes out "transcript-groupId.json" with the history of the group
    in chronological order.

    If a file by that name is found, we'll go ahead and update that
    scrape depending on the options you provide. It is assumed that the
    file is in the correct format *and its messages are in chronological
    order*.

    The easiest way to continue a scrape is to pass --resumePrevious or
    --resumeNext, which will continue the scrape moving backwards/forwards.

    For more fine-grained control you can provide the --oldest or --newest
    flags with specific message IDs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('group')
    parser.add_argument('accessToken')
    parser.add_argument("--resumePrevious", action='store_true', default=False,
                        help="Resume based on the last found files and get previous messages.")
    parser.add_argument("--resumeNext", action='store_true', default=False,
                        help="Resume based on the last found files and get next messages.")
    parser.add_argument("--oldest",
                        help="The ID of the oldest (topmost) message in the existing transcript file")
    parser.add_argument("--newest",
                        help="The ID of the newest (bottom-most) message in the existing transcript file")
    parser.add_argument("--pages", type=int,
                        help="The number of pages to pull down (defaults to as many as the conversation has)")
    args = parser.parse_args()

    group = args.group
    accessToken = args.accessToken
    beforeId = args.oldest
    stopId = args.newest
    pages = args.pages

    transcriptFileName = 'transcript-{0}.json'.format(group)
    transcript = loadTranscript(transcriptFileName)

    if args.resumePrevious or args.resumeNext:
        tempFileName = getTempFileName(group)
        tempTranscript = loadTempTranscript(tempFileName)
        transcript = sorted(reconcileTranscripts(transcript, tempTranscript),
                            key=lambda k: k['created_at'])
        if transcript:
            if args.resumePrevious:
                beforeId = transcript[0]['id']
            else:
                stopId = transcript[-1]['id']

    transcript = populateTranscript(group, accessToken, transcript, beforeId, stopId, pages)

    # sort transcript in chronological order
    transcript = sorted(transcript, key=lambda k: k[u'created_at'])

    # guard against an empty transcript before indexing into it
    if transcript:
        print('Transcript contains {0} messages from {1} to {2}'.format(
            len(transcript),
            datetime.fromtimestamp(transcript[0]['created_at']),
            datetime.fromtimestamp(transcript[-1]['created_at']),
        ))

    with open(transcriptFileName, 'w+') as transcriptFile:
        json.dump(transcript, transcriptFile, ensure_ascii=False)


def reconcileTranscripts(*transcripts):
    """
    Given multiple transcripts, returns a generator that yields each message
    exactly once, removing duplicates across transcripts.
    """
    seenIds = set()
    for transcript in transcripts:
        for message in transcript:
            if message['id'] not in seenIds:
                seenIds.add(message['id'])
                yield message
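# A quick illustration with hypothetical message dicts: the duplicate 'a'
# is dropped, so the generator yields 'a' and 'b' once each:
#   list(reconcileTranscripts([{'id': 'a'}], [{'id': 'a'}, {'id': 'b'}]))
#   => [{'id': 'a'}, {'id': 'b'}]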


def loadTranscript(transcriptFileName):
    """
    Load a transcript file by name.
    """
    if os.path.exists(transcriptFileName):
        with open(transcriptFileName, 'rb') as transcriptFile:
            try:
                return json.loads(transcriptFile.read())
            except ValueError:
                print('transcript file had bad json! ignoring')
                return []
    return []


def loadTempTranscript(tempFileName):
    """
    Load a temp transcript file by name.
    """
    # TODO: lot of copy/paste from loadTranscript above
    if os.path.exists(tempFileName):
        with open(tempFileName, 'rb') as tempFile:
            try:
                return [m for line in tempFile.readlines() for m in json.loads(line)]
            except ValueError:
                print('temp file had bad json! ignoring')
                return []
    return []
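# Note on the temp-file format: populateTranscript() below writes one JSON
# array of messages per line (one line per fetched page), which is why the
# loader above flattens each line's array into a single message list.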


def populateTranscript(group, accessToken, transcript, beforeId, stopId, pageLimit=None):
    complete = False
    pageCount = 0
    endpoint = 'https://v2.groupme.com/groups/' + group + '/messages'
    headers = {
        'Accept': 'application/json, text/javascript',
        'Accept-Charset': 'ISO-8859-1,utf-8',
        'Accept-Language': 'en-US',
        'Content-Type': 'application/json',
        'Origin': 'https://web.groupme.com',
        'Referer': 'https://web.groupme.com/groups/' + group,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.45 Safari/537.22',
        'X-Access-Token': accessToken
    }
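    # These headers mirror what the GroupMe web client sends, so the request
    # passes for browser traffic; X-Access-Token carries the credential.
    # Which of them the v2 endpoint actually requires is an assumption based
    # on observed browser requests, not a documented contract.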
    tempFileName = getTempFileName(group)
    with open(tempFileName, 'wb') as tempFile:
        while not complete:
            pageCount = pageCount + 1
            if pageLimit and pageCount > pageLimit:
                break
            print('starting on page ' + str(pageCount))

            if beforeId is not None:
                params = {'before_id': beforeId}
            else:
                params = {}
            r = requests.get(endpoint, params=params, headers=headers)
            # "is not" compares identity, not value; use != for the status code
            if r.status_code != 200:
                onRequestError(r)
            response = r.json()
            messages = response[u'response'][u'messages']

            if stopId is not None:
                messages = sorted(messages, key=lambda k: k[u'created_at'], reverse=True)
                for message in messages:
                    if message[u'id'] == stopId:
                        complete = True
                        print('Reached ID ' + stopId + '; stopping!')
                        break
                    else:
                        transcript.append(message)
            else:
                transcript.extend(messages)

            tempFile.write(json.dumps(messages))
            tempFile.write('\n')

            # a short page means we've hit the start of the conversation;
            # break before indexing into a possibly empty page
            if len(messages) != 20:
                complete = True
                print('Reached the end/beginning!')
                break
            # keep working back in time
            beforeId = messages[-1][u'id']
    return transcript


def getTempFileName(group):
    return 'temp-transcript-{0}.json'.format(group)
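# A minimal sketch for consuming the finished transcript; "12345678" is a
# placeholder group ID, and only fields this script itself relies on
# ('created_at' and 'id') are shown:
#   with open('transcript-12345678.json') as f:
#       transcript = json.load(f)
#   for message in transcript:
#       print(datetime.fromtimestamp(message['created_at']), message['id'])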


if __name__ == '__main__':
    main()
    sys.exit(0)