################################################
# COVID Rumors in Historical Context
# School of Advanced Study, University of London
# Marty Steer and Kunika Kono, 2022
################################################
# Using GNU Make helps with reproducibility
# of the data pipeline.
#
# `make` this datapackage to build the local
# source data into a variety of more useful
# datasets for downstream use.
#
# For multithreaded data processing use:
# $ make -j 8
#
# Drop these lines into a recipe to debug
# sources & targets:
# @echo T-source $^
# @echo T-target $@
# @echo T-replace $(^:.csv=.txt)
# @echo T-file-within $(@F)
# @echo
#
################################################
# Variables
# Source data and build data directories (in and out)
DATA_DIR = ./data
BUILD_DIR = ./build
SCRIPT_DIR = ./scripts
# Source file list
SOURCE_JSONL = $(wildcard $(DATA_DIR)/*.jsonl.gz)
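# Each source chunk $(DATA_DIR)/NAME.jsonl.gz is built into per-kind
# chunk files such as $(BUILD_DIR)/tweets/NAME.csv by the pattern
# rules below (NAME stands in for whatever the source files are called).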
# ---
# MAKEFLAGS (e.g. -j) is passed to sub-makes automatically,
# so it must not be spliced into the command line again.
all:
	$(MAKE) tids users hashtags urls
	$(MAKE) languages sensitive
	$(MAKE) texts
	$(MAKE) users_daily tweets_daily
# ---
# Other targets to run separately if you need them:
# requirements
# noretweets, noretweets-en, noretweets-ja
# media-urls, media-urls-files
# ---
# tweets
# useful metadata fields from all tweets
T_CHUNKS_DIR = tweets
$(BUILD_DIR)/$(T_CHUNKS_DIR)/%.csv: $(DATA_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | jq -r '[.id_str, .created_at, .user.name, .user.id_str, .user.created_at, .lang, .possibly_sensitive, .quote_count, .reply_count, .retweet_count, .favorite_count] | @csv' > $@
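# Resulting CSV columns (1-indexed, as consumed by csvcut below):
#   1 id_str, 2 created_at, 3 user.name, 4 user.id_str,
#   5 user.created_at, 6 lang, 7 possibly_sensitive,
#   8 quote_count, 9 reply_count, 10 retweet_count, 11 favorite_count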
# ---
# hashtags
# hashtags: use jq to convert to a long/narrow data stream of [tid, single hashtag]
H_CHUNKS_DIR = hashtags
$(BUILD_DIR)/$(H_CHUNKS_DIR)/%.csv: $(DATA_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | jq -r '{id_str: .id_str, hashtags: .entities.hashtags} | {id_str: .id_str, hashtag: .hashtags[].text} | [.id_str, .hashtag] | @csv' > $@
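# Output is one row per hashtag per tweet, e.g. (values illustrative):
#   "1245000000000000000","covid19"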
# ---
# urls
# long/narrow data stream of [tid, single url]
URL_CHUNKS_DIR = urls
$(BUILD_DIR)/$(URL_CHUNKS_DIR)/%.csv: $(DATA_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | jq -r '{id_str: .id_str, urls: .entities.urls} | {id_str: .id_str, url: .urls[].expanded_url} | [.id_str, .url] | @csv' > $@
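# One row per URL per tweet; expanded_url is the destination behind
# Twitter's t.co shortener rather than the shortened form.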
# ---
# target: tids
# The full list of tweet IDs is also the primary publishable dataset!
T_CSV = $(patsubst $(DATA_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(T_CHUNKS_DIR)/%.csv, $(SOURCE_JSONL))
tids: $(BUILD_DIR)/tids.csv.gz
$(BUILD_DIR)/tids.csv.gz: $(T_CSV)
	cat $^ | csvcut -c 1,2 | pigz > $@ && \
	gzcat $@ | wc -l > $(@:.csv.gz=-count.txt)
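# tids.csv.gz holds [id_str, created_at] for every tweet; the
# tids-count.txt sidecar records the total line (tweet) count.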
# ---
# target: users
# NB: personal data alert!
# Extract and summarise username frequencies (how often they tweeted)
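# The sort | uniq -c | sort -rn idiom below yields "count value" rows,
# most frequent first; the same pattern is reused for the languages,
# sensitive, hashtags, urls and media-urls counts.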
users: $(BUILD_DIR)/users-count.txt $(BUILD_DIR)/users-created_at.csv.gz
$(BUILD_DIR)/users-count.txt: $(T_CSV)
	cat $^ | csvcut -c 3 | sort | uniq -c | sort -rn > $@
# Extract account created dates. We are looking for recently created accounts.
# i.e. "Cyber Fleets" of social media accounts.
# Get all unique [userid, create date]
$(BUILD_DIR)/users-created_at.csv.gz: $(T_CSV)
	cat $^ | csvcut -c 4,5 | pigz > $@
# ---
# target: users_daily
# Make an anonymised publishable user daily summary file for visualisation
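# Date handling: dates arrive in Twitter's "Wed Apr 01 12:34:56 +0000 2020"
# form, so $2=month, $3=day, $4=time, $6=year. The awk match() lookup
# turns the month abbreviation into its number, e.g. "Apr" is found at
# position 10 in the JAN..DEC string and (10+2)/3 = 4, printed as "04".
# (BEGIN rather than gawk's BEGINFILE keeps the header portable to BSD awk.)
# The same idiom is used by tweets_daily and tweets_hourly below.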
users_daily: $(BUILD_DIR)/users-daily.csv
$(BUILD_DIR)/users-daily.csv: $(BUILD_DIR)/users-created_at.csv.gz
	gzcat $^ | xsv select 2 | \
	awk -F' ' '{mth=sprintf("%02i", (match("JANFEBMARAPRMAYJUNJULAUGSEPOCTNOVDEC",toupper($$2))+2)/3); print $$6 "-" mth "-" $$3 " 00:00:00" }' | \
	sort | uniq -c | \
	awk 'BEGIN{print "created_at,user_count"}{print $$2,$$3","$$1}' > $@
# ---
# target: tweets_daily
# Make an anonymised publishable daily tweets summary file for visualisation
tweets_daily: $(BUILD_DIR)/tweets-daily.csv
$(BUILD_DIR)/tweets-daily.csv: $(BUILD_DIR)/tids.csv.gz
	gzcat $^ | xsv select 2 | \
	awk -F' ' '{mth=sprintf("%02i", (match("JANFEBMARAPRMAYJUNJULAUGSEPOCTNOVDEC",toupper($$2))+2)/3); print $$6 "-" mth "-" $$3 " 00:00:00" }' | \
	sort | uniq -c | \
	awk 'BEGIN{print "created_at,tweet_count"}{print $$2,$$3","$$1}' > $@
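# ---
# target: tweets_hourly
# As tweets_daily but binned by hour: substr($4,1,2) keeps the HH of
# the HH:MM:SS time field.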
tweets_hourly: $(BUILD_DIR)/tweets-hourly.csv
$(BUILD_DIR)/tweets-hourly.csv: $(BUILD_DIR)/tids.csv.gz
	gzcat $^ | xsv select 2 | \
	awk -F' ' '{mth=sprintf("%02i", (match("JANFEBMARAPRMAYJUNJULAUGSEPOCTNOVDEC",toupper($$2))+2)/3); print $$6 "-" mth "-" $$3 " " substr($$4,1,2)":00:00" }' | \
	sort | uniq -c | \
	awk 'BEGIN{print "created_at,tweet_count"}{print $$2,$$3","$$1}' > $@
# ---
# target: languages
languages: $(BUILD_DIR)/languages-count.txt
$(BUILD_DIR)/languages-count.txt: $(T_CSV)
	cat $^ | csvcut -c 6 | sort | uniq -c | sort -rn > $@
# ---
# target: sensitive
sensitive: $(BUILD_DIR)/sensitive-count.txt
$(BUILD_DIR)/sensitive-count.txt: $(T_CSV)
	cat $^ | csvcut -c 7 | sort | uniq -c | sort -rn > $@
# ---
# target: hashtags
H_CSV = $(patsubst $(DATA_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(H_CHUNKS_DIR)/%.csv, $(SOURCE_JSONL))
hashtags: $(BUILD_DIR)/hashtags-count.txt
$(BUILD_DIR)/hashtags-count.txt: $(H_CSV)
	cat $^ | csvcut -c 2 | sort | uniq -c | sort -rn > $@
# ---
# target: urls
URL_CSV = $(patsubst $(DATA_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(URL_CHUNKS_DIR)/%.csv, $(SOURCE_JSONL))
urls: $(BUILD_DIR)/urls-count.txt
$(BUILD_DIR)/urls-count.txt: $(URL_CSV)
	cat $^ | csvcut -c 2 | sort | uniq -c | sort -rn > $@
# ---
# target: tweets-retweeted
# retweets: long/narrow data stream of [id_str, created_at, quote_count, reply_count, retweet_count, favorite_count, full_text]
RETWEET_DIR = tweets-retweeted
RETWEET_CSV = $(patsubst $(DATA_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(RETWEET_DIR)/%.csv.gz, $(SOURCE_JSONL))
tweets-retweeted: $(BUILD_DIR)/tweets-retweeted-daily.csv
$(BUILD_DIR)/tweets-retweeted-daily.csv: $(RETWEET_CSV)
	python $(SCRIPT_DIR)/resample-retweets-data.py --inputdir $(<D) --output $@
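# Keep only tweets that are themselves retweets (select on
# .retweeted_status.id_str) and record the engagement counts of the
# original retweeted tweet.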
$(BUILD_DIR)/$(RETWEET_DIR)/%.csv.gz: $(DATA_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | jq -r 'select(.retweeted_status.id_str) | [.retweeted_status.id_str, .retweeted_status.created_at, .retweeted_status.quote_count, .retweeted_status.reply_count, .retweeted_status.retweet_count, .retweeted_status.favorite_count, .retweeted_status.full_text] | @csv' | pigz > $@
# ---
# target: media-urls
MEDIA_CHUNKS_DIR = media-urls
$(BUILD_DIR)/$(MEDIA_CHUNKS_DIR)/%.csv: $(DATA_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | python ./bin/twarc/utils/media_urls.py > $@
# media urls: long/narrow data stream of [tid, single url]
MEDIA_CSV = $(patsubst $(DATA_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(MEDIA_CHUNKS_DIR)/%.csv, $(SOURCE_JSONL))
media-urls: $(BUILD_DIR)/media-urls-count.txt
$(BUILD_DIR)/media-urls-count.txt: $(MEDIA_CSV)
	cat $^ | cut -d' ' -f2 | sort | uniq -c | sort -rn > $@
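# NB: assumes media_urls.py emits space-separated "tid url" lines,
# hence plain cut -d' ' rather than csvcut.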
# ---
# target: media-urls-files
# Uses sed to insert a header on the first line, then awk to take the
# last column from each line and feed it to minet, which handles
# downloading the media. Minet will resume/restart and capture metadata.
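# NB: the '\n' in the sed replacement requires GNU sed; stock
# macOS/BSD sed would treat it as a literal 'n'.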
media-urls-files: minet $(BUILD_DIR)/media-urls-files-complete.txt
$(BUILD_DIR)/media-urls-files-complete.txt: $(BUILD_DIR)/media-urls-count.txt
	sed '1 s/^/tid url\n/' $(BUILD_DIR)/media-urls-count.txt | awk '{print $$(NF)}' | \
	minet fetch url -o $(BUILD_DIR)/media-urls-files.csv -d $(BUILD_DIR)/$(MEDIA_CHUNKS_DIR)-files --resume --folder-strategy prefix-2 && \
	touch $@
# ---
# target: noretweets
# Remove retweets from the entire corpus.
NORETWEET_DIR = noretweets
NORETWEET_JSONLS = $(patsubst $(DATA_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(NORETWEET_DIR)/%.jsonl.gz, $(SOURCE_JSONL))
$(BUILD_DIR)/$(NORETWEET_DIR)/%.jsonl.gz: $(DATA_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | python ./bin/twarc/utils/noretweets.py | pigz > $@
noretweets: $(BUILD_DIR)/noretweets-done.txt
$(BUILD_DIR)/noretweets-done.txt: $(NORETWEET_JSONLS)
	touch $@
# ---
# target: noretweets-en
# English/undetermined noretweets (lang code == en or und)
EN_NORETWEET_DIR = noretweets-en
EN_NORETWEET_JSONL = $(patsubst $(BUILD_DIR)/$(NORETWEET_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(EN_NORETWEET_DIR)/%.jsonl.gz, $(NORETWEET_JSONLS))
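# NB: a cheap textual filter on the raw JSON. It relies on a space
# after the colon ('"lang": "en"'), which assumes the upstream
# noretweets.py re-serialises tweets with json.dumps' default separators.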
$(BUILD_DIR)/$(EN_NORETWEET_DIR)/%.jsonl.gz: $(BUILD_DIR)/$(NORETWEET_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | grep -e '"lang": "en"' -e '"lang": "und"' | pigz > $@
noretweets-en: $(BUILD_DIR)/noretweets-en-done.txt
$(BUILD_DIR)/noretweets-en-done.txt: $(EN_NORETWEET_JSONL)
	touch $@
# ---
# target: noretweets-ja
# Japanese noretweets (lang code == ja)
JA_NORETWEET_DIR = noretweets-ja
JA_NORETWEET_JSONL = $(patsubst $(BUILD_DIR)/$(NORETWEET_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(JA_NORETWEET_DIR)/%.jsonl.gz, $(NORETWEET_JSONLS))
$(BUILD_DIR)/$(JA_NORETWEET_DIR)/%.jsonl.gz: $(BUILD_DIR)/$(NORETWEET_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | grep '"lang": "ja"' | pigz > $@
noretweets-ja: $(BUILD_DIR)/noretweets-ja-done.txt
$(BUILD_DIR)/noretweets-ja-done.txt: $(JA_NORETWEET_JSONL)
	touch $@
# ---
# target: texts
# Uses noretweets-en
# The full text of tweets as {id, text} JSON lines (for downstream linguistic processing)
TXT_CHUNKS_DIR = texts
TXT_JSONL = $(patsubst $(BUILD_DIR)/$(EN_NORETWEET_DIR)/%.jsonl.gz, $(BUILD_DIR)/$(TXT_CHUNKS_DIR)/%.jsonl, $(EN_NORETWEET_JSONL))
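# Prefer the fullest text available: extended_tweet.full_text, then
# full_text, then text (the nested if/else below is equivalent to
# jq's .extended_tweet.full_text // .full_text // .text).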
$(BUILD_DIR)/$(TXT_CHUNKS_DIR)/%.jsonl: $(BUILD_DIR)/$(EN_NORETWEET_DIR)/%.jsonl.gz
	mkdir -p $(@D)
	gzcat $^ | jq -c '{id: .id_str, text: (if .extended_tweet.full_text then .extended_tweet.full_text else (if .full_text then .full_text else .text end) end)}' > $@
texts: $(BUILD_DIR)/texts-done.txt
$(BUILD_DIR)/texts-done.txt: $(TXT_JSONL)
	touch $@
# ---
# target: requirements
# NB: installation assumes macOS (Homebrew for jq and csvkit)
.PHONY: requirements twarc jq csvkit minet
requirements: twarc jq csvkit minet
twarc: ./bin/twarc
./bin/twarc:
	git clone https://github.com/DocNow/twarc.git bin/twarc
jq: /usr/local/bin/jq
/usr/local/bin/jq:
	brew install jq
csvkit: /usr/local/bin/csvcut
/usr/local/bin/csvcut:
	brew install csvkit
minet: /usr/local/bin/minet
/usr/local/bin/minet:
	curl -sSL https://raw.githubusercontent.com/medialab/minet/master/scripts/install.sh | bash
# ---
.PHONY: clean
clean:
@echo "Removing build directory..."
rm -r $(BUILD_DIR)