-
Notifications
You must be signed in to change notification settings - Fork 1
/
Makefile
112 lines (99 loc) · 4.48 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Base directory for the hadoop jobs' input/output paths; the recipes below also
# inspect the same paths through the /hdfs prefix.
hdfs_dir := /user/halfak/streaming
# Client options shared by every mysql/mysqlimport call: defaults file, host, user.
dbstore := --defaults-file=~/.my.research.cnf -h analytics-store.eqiad.wmnet -u research
# Enwiki revdocs unsorted (local)
# Feeds the 20150901 pages-meta-history dump files to `mwxml dump2revdocs`,
# with the output written to the path named by the target.
datasets/enwiki-20150901/revdocs_unsorted-bz2:
	mwxml dump2revdocs \
		/mnt/data/xmldatadumps/public/enwiki/20150901/*pages-meta-history*.xml*.bz2 \
		--output $@
# Enwiki revdocs (hadoop)
# Runs ./hadoop/dump2revdocs.hadoop from the 20150901 xml-bz2 path into revdocs-bz2
# (2000 is passed through as the script's first argument; its meaning is not
# visible here -- TODO confirm). On success, the size and listing of the output
# path are recorded in the target file.
# The summary is written to $@ so that the file this rule produces is the file
# make checks for; otherwise the target would never exist and the job would be
# re-run on every invocation.
datasets/enwiki-20150901.revdocs:
	./hadoop/dump2revdocs.hadoop \
		2000 \
		$(hdfs_dir)/enwiki-20150901/xml-bz2 \
		$(hdfs_dir)/enwiki-20150901/revdocs-bz2 && \
	(du -hs /hdfs$(hdfs_dir)/enwiki-20150901/revdocs-bz2; \
	 ls -al --color=never /hdfs$(hdfs_dir)/enwiki-20150901/revdocs-bz2) > \
	$@
# Enwiki diffs (hadoop)
# Runs ./hadoop/revdocs2diffs.hadoop with western.diffs.yaml, reading the 20150602
# revdocs-bz2 path and writing diffs-bz2. Once the job succeeds, the size and
# listing of diffs-bz2 are captured in the target file.
datasets/enwiki-20150602/diffs.info: datasets/enwiki-20150602/revdocs.info
	./hadoop/revdocs2diffs.hadoop \
		enwiki-20150602.diffs \
		western.diffs.yaml \
		$(hdfs_dir)/enwiki-20150602/revdocs-bz2 \
		$(hdfs_dir)/enwiki-20150602/diffs-bz2 && \
	{ du -hs /hdfs$(hdfs_dir)/enwiki-20150602/diffs-bz2; \
	  ls -al --color=never /hdfs$(hdfs_dir)/enwiki-20150602/diffs-bz2; } > $@
# Runs ./hadoop/json2diffs.hadoop with western.diffs.yaml, reading the 20150901
# json-snappy path and writing diffs-bz2. Once the job succeeds, the size and
# listing of diffs-bz2 are captured in the target file.
# NOTE(review): no prerequisite is declared, so this only runs while the target
# file is missing.
datasets/enwiki-20150901/diffs.info:
	./hadoop/json2diffs.hadoop \
		enwiki-20150901.diffs \
		western.diffs.yaml \
		$(hdfs_dir)/enwiki-20150901/json-snappy \
		$(hdfs_dir)/enwiki-20150901/diffs-bz2 && \
	{ du -hs /hdfs$(hdfs_dir)/enwiki-20150901/diffs-bz2; \
	  ls -al --color=never /hdfs$(hdfs_dir)/enwiki-20150901/diffs-bz2; } > $@
# Enwiki token persistence (local)
# Runs `mwpersistence diffs2persistence` over the local diffs-bz2/*.bz2 files next
# to the target, writing persistence-bz2 in the same directory. Once that
# succeeds, the size and listing of persistence-bz2 are captured in the target.
# $(@D) is the target's directory, datasets/enwiki-20150602.
datasets/enwiki-20150602/persistence.info: datasets/enwiki-20150602/diffs.info
	mwpersistence diffs2persistence \
		$(@D)/diffs-bz2/*.bz2 \
		--sunset 20150602000000 \
		--window 50 \
		--revert-radius 15 \
		--output $(@D)/persistence-bz2 && \
	{ du -hs $(@D)/persistence-bz2; \
	  ls -al --color=never $(@D)/persistence-bz2; } > $@
# Hadoop variant of the persistence step: runs ./hadoop/diffs2persistence.hadoop
# from the 20150602 diffs-bz2 path into persistence-bz2 (20150602000000 is passed
# as the second argument). Once the job succeeds, the size and listing of
# persistence-bz2 are captured in the target file.
datasets/enwiki-20150602/persistence-hadoop.info: datasets/enwiki-20150602/diffs.info
	./hadoop/diffs2persistence.hadoop \
		enwiki-20150602.persistence \
		20150602000000 \
		$(hdfs_dir)/enwiki-20150602/diffs-bz2 \
		$(hdfs_dir)/enwiki-20150602/persistence-bz2 && \
	{ du -hs /hdfs$(hdfs_dir)/enwiki-20150602/persistence-bz2; \
	  ls -al --color=never /hdfs$(hdfs_dir)/enwiki-20150602/persistence-bz2; } > $@
# Enwiki revision stats for words (local)
# Runs `mwpersistence persistence2stats` over the local persistence-bz2/*.bz2
# files next to the target, writing word-stats-bz2 in the same directory. Once
# that succeeds, the size and listing of word-stats-bz2 are captured in the target.
# $(@D) is the target's directory, datasets/enwiki-20150602.
datasets/enwiki-20150602/word-stats.info: datasets/enwiki-20150602/persistence.info
	mwpersistence persistence2stats \
		$(@D)/persistence-bz2/*.bz2 \
		--min-persisted 5 \
		--min-visible 48 \
		--exclude '\\s+' \
		--output $(@D)/word-stats-bz2 && \
	{ du -hs $(@D)/word-stats-bz2; \
	  ls -al --color=never $(@D)/word-stats-bz2; } > $@
# Flattens the word-stats documents into a TSV: decompresses every
# word-stats-bz2/*.bz2 file, extracts the listed fields with json2tsv, and pipes
# the result through pv into the target.
# NOTE(review): unless the shell runs with pipefail, the recipe's exit status is
# that of the final command (pv), so a bzcat or json2tsv failure would go unnoticed.
datasets/enwiki-20150602/word_persistence.tsv: datasets/enwiki-20150602/word-stats.info
	bzcat $(@D)/word-stats-bz2/*.bz2 | \
	json2tsv \
		id timestamp page.id page.namespace page.title user.id user.text comment minor sha1 \
		persistence.revisions_processed \
		persistence.non_self_processed \
		persistence.seconds_possible \
		persistence.tokens_added \
		persistence.persistent_tokens \
		persistence.non_self_persistent_tokens \
		persistence.censored \
		persistence.non_self_censored \
		persistence.sum_log_persisted \
		persistence.sum_log_non_self_persisted \
		persistence.sum_log_seconds_visible \
	| pv > $@
# Stamp file: sends the CREATE script to mysql and, if that succeeds, writes the
# current date into the target to mark the table as created.
datasets/word_persistence.created.table: sql/word_persistence.create.sql
	cat $< | mysql $(dbstore) && \
	date > $@
# Stamp file: bulk-loads word_persistence.tsv into the `staging` database and,
# if that succeeds, writes the current date into the target.
# The TSV is imported through a temporary symlink named mep_word_persistence,
# because mysqlimport takes the table name from the file name.
# The TSV is listed as a prerequisite because the recipe reads it; without that,
# make could run the import before the file exists or skip a reload after it
# changes.
# `ln -sf` replaces a symlink left behind by an earlier failed import, where a
# plain `ln -s` would stop with "File exists".
datasets/word_persistence.loaded.table: \
		datasets/word_persistence.created.table \
		datasets/enwiki-20150602/word_persistence.tsv
	ln -sf datasets/enwiki-20150602/word_persistence.tsv mep_word_persistence && \
	mysqlimport $(dbstore) --local staging mep_word_persistence && \
	rm -f mep_word_persistence && \
	date > $@
# Runs sql/monthly_persistence_stats.sql against the `staging` database and
# stores mysql's output as the target.
# The result goes to $@.tmp first and is renamed only when mysql succeeds.
# Redirecting straight into the target would leave an empty or partial file after
# a failed query, and make would then treat it as up to date.
# NOTE(review): the query presumably reads the table loaded by
# datasets/word_persistence.loaded.table; if so, that stamp should be a
# prerequisite -- confirm against the SQL file.
datasets/enwiki-20150602/monthly_persistence_stats.tsv: sql/monthly_persistence_stats.sql
	mysql $(dbstore) staging < $< > $@.tmp && \
	mv -f $@.tmp $@
# Runs sql/per_user_persistence_stats.sql against the `staging` database and
# stores mysql's output as the target.
# The result goes to $@.tmp first and is renamed only when mysql succeeds.
# Redirecting straight into the target would leave an empty or partial file after
# a failed query, and make would then treat it as up to date.
# NOTE(review): the query presumably reads the table loaded by
# datasets/word_persistence.loaded.table; if so, that stamp should be a
# prerequisite -- confirm against the SQL file.
datasets/enwiki-20150602/per_user_persistence_stats.tsv: sql/per_user_persistence_stats.sql
	mysql $(dbstore) staging < $< > $@.tmp && \
	mv -f $@.tmp $@