This repository has been archived by the owner on Oct 31, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 145
/
Makefile
236 lines (199 loc) · 8.05 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# Makefile to install CC-Net and train the LMs.
# `make` or `make help` to get some help.
# User-tunable arguments (override on the command line, e.g. `make lang=it lm`):
#   lang    - language code used by the single-language targets (dl_lm, lm, sp)
#   process - value handed to `xargs -P` when jobs run locally
#   servers - value handed to `xargs -P` when jobs go through srun; 0 keeps everything local
lang ?= en
process ?= 8
servers ?= 0
# Comma-separated languages processed by all_lms / dl_all_lms.
langs ?= af,ar,az,be,bg,bn,ca,cs,da,de,el,en,es,et,fa,fi,fr,gu,he,hi,hr,hu,hy,id,\
is,it,ja,ka,kk,km,kn,ko,lt,lv,mk,ml,mn,mr,my,ne,nl,no,pl,pt,ro,ru,uk,zh
# Experiment configuration.
#   NDOC_FOR_LM        - passed as --n_docs to get_wiki_cirrus.py when extracting text
#   NDOC_FOR_SENTPIECE - not referenced by any rule in the visible part of this file
#   VOCAB_SIZE         - SentencePiece --vocab_size, KenLM --vocab_estimate and
#                        the number of lines kept by the %.vocab.txt rule
NDOC_FOR_LM = 1_000_000
NDOC_FOR_SENTPIECE = 400000
VOCAB_SIZE = 65536
# Locations of the locally built helper binaries.
# KenLM: n-gram estimation and ARPA -> binary conversion.
KENLM = ./bin/lmplz
KENLM_BUILD_BINARY = ./bin/build_binary
# SentencePiece: model training and text encoding.
SPM_TRAIN = ./bin/spm_train
SPM_ENCODE = ./bin/spm_encode
# DISTRIBUTE is the command prefix that fans out one job per input line.
# With servers=0 the jobs run on this machine ($(process) at a time);
# any other value sends them through slurm's srun ($(servers) at a time).
ifeq ($(servers), 0)
DISTRIBUTE = xargs -L1 -P $(process)
else
DISTRIBUTE = xargs -L1 -P $(servers) srun -t 240 --mem 5000
endif
# PRIVATE
# Dump/segment identifier used to name the regroup_tr test artefact.
_SEGMENT = 2019-09/CC-MAIN-20190215183319-20190215205319-00000
# First target of the file, hence the default goal.
# Prints every line that starts with a plain target name plus the line after it
# (the first recipe line of such targets is a `#` description).
# Reads the makefile actually in use rather than a hard-coded `Makefile`, so
# `make -f <path> help` works; declared phony so a file named `help` cannot shadow it.
.PHONY: help
help:
	# Show help
	grep -i -A1 '^[a-z0-9_]*:' $(firstword $(MAKEFILE_LIST))
# Delete a target whose recipe failed, so a partially written output
# (useful when piping output) is never mistaken for an up-to-date file.
.DELETE_ON_ERROR:
# Recipes run under bash, and a pipeline fails as soon as any of its stages fails.
SHELL = /bin/bash
.SHELLFLAGS = -o pipefail -c
# Builds/fetches the prerequisite binaries, then checks the local setup:
#   - warns when no 'data' directory (or symlink to one) exists; the original
#     test `-f data` was inverted and only fired when 'data' was a regular file;
#   - pip-installs this package when `cc_net` cannot be imported.
# Each continuation keeps a space before the backslash so the shell never glues
# `then` to the following word, whatever the indentation of the next line.
.PHONY: install
install: bin/lid.bin $(KENLM) $(SPM_TRAIN)
	# Installs dependencies.
	@if [ ! -d "data" ]; then \
		echo "Please create/simlink a 'data' directory."; \
	fi
	@if ! python -c "from cc_net import __main__" 2> /dev/null; then \
		pip install . ; \
	fi
	echo " --> All dependencies looks good !"
# Fetches the pretrained $(lang).arpa.bin and $(lang).sp.model into data/lm_sp.
# `wget -c` resumes a partial download instead of restarting it.
# Uses https, like the other download from dl.fbaipublicfiles.com in this file.
.PHONY: dl_lm
dl_lm:
	# Download a pretrained language model
	mkdir -p data/lm_sp
	wget -c -P data/lm_sp https://dl.fbaipublicfiles.com/cc_net/lm/$(lang).arpa.bin
	wget -c -P data/lm_sp https://dl.fbaipublicfiles.com/cc_net/lm/$(lang).sp.model
# Alias target: requests the SentencePiece model and the binary LM for $(lang).
# The recipe only holds the description lines shown by `make help`.
# Declared phony because `lm` is a command name, not a file to build.
.PHONY: lm
lm: data/lm_sp/$(lang).sp.model data/lm_sp/$(lang).arpa.bin
	# Computes a 5-gram LM for the given language -> make lang=it lm
	# Restricted to the first NDOC_FOR_LM documents
# Alias target: requests only the SentencePiece model for $(lang).
# Declared phony because `sp` is a command name, not a file to build.
.PHONY: sp
sp: data/lm_sp/$(lang).sp.model
	# Train a sentence piece model on Wikipedia -> make lang=it sp
# $(call get_lang,<name>) -> first '.'-separated component of <name>
# (e.g. en.json.gz -> en).
get_lang = $(firstword $(subst ., ,$(1)))
# Turns $(langs) into one `data/lm_sp/<code>.arpa.bin` goal per line and hands
# each line to a sub-make through $(DISTRIBUTE).
# `tr` also squeezes the space that the line continuation in `langs` introduces.
# The sub-make is invoked as $(MAKE) so the same make binary is used and
# -n / jobserver handling follows GNU make's rules for recursive invocations.
.PHONY: all_lms
all_lms:
	# Train a list of language models -> make process=10 langs=en,it,fr all_lms
	# Defaults to the LMs trained in the paper.
	echo "$(langs)" \
		| tr -s ', ' '\n' | awk '{print "data/lm_sp/" $$0 ".arpa.bin"}' \
		| $(DISTRIBUTE) $(MAKE)
# Runs `make lang=<code> dl_lm` once per entry of $(langs), dispatched through
# $(DISTRIBUTE). Uses $(MAKE) rather than a literal `make` for the sub-make.
.PHONY: dl_all_lms
dl_all_lms:
	# Download all pretrained language models
	echo "$(langs)" \
		| tr -s ', ' '\n' | awk '{print "lang=" $$0 " dl_lm"}' \
		| $(DISTRIBUTE) $(MAKE)
# Pattern rule: <name>.arpa -> <name>.arpa.bin via $(KENLM_BUILD_BINARY).
# The converter is not listed as a prerequisite; it is produced by the bin/lmplz rule.
%.arpa.bin: %.arpa
	# Compress a learned LM to a binary format.
	$(KENLM_BUILD_BINARY) "$<" "$@"
# Pattern rule: <name>.txt -> <name>.vocab.txt, the $(VOCAB_SIZE) most frequent
# space-separated tokens of the corpus, one per line, most frequent first.
# The frequency table goes to a per-target scratch file ($@.tmp_sort) so that
# parallel runs do not overwrite each other's table.
# NOTE(review): the table is presumably kept in a file rather than piped into
# `head` because an early-exiting `head` can fail the pipeline under pipefail — confirm.
%.vocab.txt: %.txt
	# Extracts the vocabulary of a corpus.
	# Restricted to the first NDOC_FOR_LM documents and VOCAB_SIZE top words.
	tr ' ' '\n' < $< | sort | uniq -c | sort -rn > $@.tmp_sort
	head -n $(VOCAB_SIZE) $@.tmp_sort | sed "s/^ *//" | cut -d' ' -f2 > $@
	echo Extracted `wc -l $@` words
# Estimates a 5-gram KenLM model (ARPA text format) from the SentencePiece-encoded
# corpus read on stdin; the model is written to the target through stdout.
data/lm_sp/%.arpa: data/cirrus/sp/%.opening.txt
	mkdir -p $(@D)
	$(KENLM) -o 5 -S 8G -T /tmp --vocab_estimate $(VOCAB_SIZE) --discount_fallback < $< > $@
# Trains a unigram SentencePiece model on the plain-text corpus.
# First attempt uses $(VOCAB_SIZE) pieces with --hard_vocab_limit; only if that
# command fails is a second model with 40000 pieces trained instead.
# The fallback is grouped in { ...; } on purpose: in the shell `A || B && C`
# parses as `(A || B) && C`, which made the original retrain the smaller model
# (and overwrite the first one) even when the first training succeeded.
# $(basename $@) strips `.model`, giving the prefix spm_train appends
# `.model` / `.vocab` to.
data/lm_sp/%.sp.model: data/cirrus/txt/%.opening.txt
	mkdir -p $(@D)
	$(SPM_TRAIN) --input=$< \
		--vocab_size=$(VOCAB_SIZE) --hard_vocab_limit \
		--character_coverage=0.9995 \
		--model_type=unigram \
		--model_prefix=$(basename $@) \
	|| { echo "WARNING: Corpus is too small, will train smaller model" && \
		$(SPM_TRAIN) --input=$< \
			--vocab_size=40000 \
			--character_coverage=0.9995 \
			--model_type=unigram \
			--model_prefix=$(basename $@); }
	echo "Trained SentencePiece model with `wc -l $(basename $@).vocab` pieces"
# Encodes the text extracted from the Cirrus dump ($<, first prerequisite) into
# SentencePiece pieces using the language's model ($(word 2,$^)).
# The output directory is created first, as the sibling data/lm_sp rules do;
# otherwise the `> $@` redirection fails when data/cirrus/sp does not exist yet.
data/cirrus/sp/%.opening.txt: data/cirrus/gz/%.json.gz data/lm_sp/%.sp.model
	mkdir -p $(@D)
	$(SPM_ENCODE) \
		--model=$(word 2,$^) \
		--output_format=piece \
		< <(python get_wiki_cirrus.py opening --file $< --n_docs $(NDOC_FOR_LM)) \
		> $@
# Extracts plain text from the Cirrus dump via `get_wiki_cirrus.py opening`,
# limited to $(NDOC_FOR_LM) documents.
# The output directory is created up front for consistency with the other rules.
# NOTE(review): whether get_wiki_cirrus.py creates it itself is not visible here.
data/cirrus/txt/%.opening.txt: data/cirrus/gz/%.json.gz
	mkdir -p $(@D)
	python get_wiki_cirrus.py opening \
		--n_docs $(NDOC_FOR_LM) \
		--file $< --output $@
# Downloads the Cirrus dump for one language into the target's directory.
# The language code is the part of the file name before the first '.'.
# `mkdir -p` is required: a plain `mkdir` fails once the directory exists
# (i.e. from the second language on) and when data/cirrus is missing.
data/cirrus/gz/%.json.gz:
	mkdir -p $(@D)
	python get_wiki_cirrus.py dl --lang $(call get_lang,$(@F)) --output_dir $(@D)
# Removes downloaded/intermediate data and the third-party source trees.
# `rm -rf` keeps the target usable when some of the paths are already gone;
# with `rm -r` the first missing path aborted the recipe and left the rest behind.
# Declared phony so a file named `clean` cannot disable it.
.PHONY: clean
clean:
	# Remove intemediary files, dataset, third_party sources
	# We don't need the vocab files nor the text version of the LM.
	rm -rf data/cirrus
	rm -rf data/lm_sp/*.arpa data/lm_sp/*.vocab
	rm -rf third_party
# Installation
# Fetches the fastText language-identification model and stores it as the target.
bin/lid.bin:
	# DL languages id from Fasttext releases.
	mkdir -p $(@D)
	wget --output-document=$@ https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
# Streams the KenLM source tarball from the network straight into tar, which
# unpacks it under the parent directory of the target.
third_party/kenlm:
	# Download kenlm sources: https://kheafield.com/code/kenlm/"
	mkdir -p $(@D)
	wget --output-document=- https://kheafield.com/code/kenlm.tar.gz | tar --extract --gzip --directory=$(@D)
# Configures KenLM with cmake in <sources>/build, compiles it and moves both
# `lmplz` and `build_binary` next to the target (so this rule also provides
# $(KENLM_BUILD_BINARY)).
# The nested build is started with $(MAKE) instead of a literal `make`, as GNU
# make recommends for recursive invocations.
bin/lmplz: third_party/kenlm
	# Compiles kenlm binaries
	mkdir -p $(@D)
	mkdir -p $</build
	(cd $</build && cmake ..)
	$(MAKE) -C $</build -j2
	mv $</build/bin/lmplz $</build/bin/build_binary $(@D)
# Downloads the SentencePiece v0.1.83 source archive, unpacks it beside the
# target, deletes the archive and renames the versioned folder to the target name.
third_party/sentencepiece:
	# Download sentencepiece sources: https://github.com/google/sentencepiece
	mkdir -p $(@D)
	wget --continue --output-document=$(@D)/sentencepiece.zip https://github.com/google/sentencepiece/archive/v0.1.83.zip
	unzip -o -d $(@D) $(@D)/sentencepiece.zip
	rm $(@D)/sentencepiece.zip
	# remove the version id from the folder name
	mv $(@D)/sentencepiece-* $@
# Configures SentencePiece with cmake in <sources>/build, compiles it and moves
# both `spm_train` and `spm_encode` next to the target (so this rule also
# provides $(SPM_ENCODE)).
# The nested build is started with $(MAKE) instead of a literal `make`, as GNU
# make recommends for recursive invocations.
bin/spm_train: third_party/sentencepiece
	# Compiles sentencepiece binaries
	mkdir -p $(@D)
	mkdir -p $</build
	(cd $</build && cmake ..)
	$(MAKE) -C $</build -j2
	mv $</build/src/spm_train $</build/src/spm_encode $(@D)
	# Installed SentencePiece locally to install globally do:
	# $ cd $</build
	# $ sudo make install
	# $ sudo ldconfig -v
# Round-trip check: mine with the `test` config, minify the mined output,
# un-minify it again, then diff the raw_content of the de_head_0000 shard
# between the mined and the reproduced data.
# Declared phony so an existing file or directory named `test` cannot make
# make report the target as up to date.
.PHONY: test
test:
	python -m cc_net mine --config test
	mkdir -p test_data/mini
	python -m cc_net.minify minify -f test_data/mined/2019-09 -o test_data/mini/2019-09
	mkdir -p test_data/reproduce
	python cc_net/minify.py unminify -f test_data/mini/2019-09 -o test_data/reproduce/2019-09
	diff \
		<(zcat test_data/mined/2019-09/de_head_0000.json.gz | sort | jq -r .raw_content) \
		<(zcat test_data/reproduce/2019-09/de_head_0000.json.gz | sort | jq -r .raw_content)
# Runs the test_segment and test_reproduce configs, then compares
#   1. url/perplexity of the French "head" documents of one segment between the
#      regular mined output and the by-segment output, and
#   2. raw_content of the fr_head_0000 shard between mined and reproduced data.
# Only the first lines of each diff are shown (`| head`).
# Declared phony because `test2` is a command name, not a file to build.
.PHONY: test2
test2:
	python -m cc_net --config config/test_segment.json
	python -m cc_net --config config/test_reproduce.json
	diff \
		<(zcat test_data/mined/2019-09/fr_head_0000.json.gz | jq -c 'select(.cc_segment == "crawl-data/CC-MAIN-2019-09/segments/1550247479101.30/wet/CC-MAIN-20190215183319-20190215205319-00000.warc.wet.gz") | {url, perplexity}' | sort) \
		<(zcat test_data2/mined_by_segment/2019-09/CC-MAIN-20190215183319-20190215205319-00000.json.gz | jq -c 'select(.bucket == "head" and .language == "fr") | {url, perplexity}' | sort) \
		| head
	diff \
		<(zcat test_data/mined/2019-09/fr_head_0000.json.gz | sort | jq -r .raw_content ) \
		<(zcat test_data2/reproduce/2019-09/fr_head_0000.json.gz | sort | jq -r .raw_content ) \
		| head
# Builds the regrouped-transpose test artefact in three steps, each preceded by
# the creation of its output directory: transpose the mined data, regroup the
# transposed data, then un-minify the regrouped data into test_data/reproduce_tr.
test_data/regroup_tr/$(_SEGMENT).json.gz:
	mkdir -p test_data/transpose
	python cc_net/transpose.py transpose -f test_data/mined/2019-09 -o test_data/transpose/2019-09 --ex debug
	mkdir -p test_data/regroup_tr
	python cc_net/transpose.py regroup_tr -i test_data/transpose/2019-09 -o test_data/regroup_tr/2019-09 --ex local --conf test
	mkdir -p test_data/reproduce_tr
	python cc_net/transpose.py unminify -f test_data/regroup_tr/2019-09 -o test_data/reproduce_tr/2019-09 --ex debug --conf test
# Compares the mined data with the data reproduced through the transpose path:
# per-language document counts, `wc` totals of raw_content, and the sorted URLs.
# The second diff uses -W60 (side-by-side width) like the first one; the
# original `-w60` meant "ignore all white space" and left the width at its default.
# Declared phony because `test_transpose` is a command name, not a file to build.
.PHONY: test_transpose
test_transpose: test_data/regroup_tr/$(_SEGMENT).json.gz
	diff -y -W60 \
		<(zcat test_data/mined/2019-09/*.json.gz | jq -r .language | sort | uniq -c ) \
		<(zcat test_data/reproduce_tr/2019-09/*.json.gz | jq -r .language | sort | uniq -c )
	diff -y -W60 \
		<(zcat test_data/mined/2019-09/*.json.gz | jq -r .raw_content | wc) \
		<(zcat test_data/reproduce_tr/2019-09/*.json.gz | jq -r .raw_content | wc)
	diff \
		<(zcat test_data/mined/2019-09/*.json.gz | jq -r .url | sort) \
		<(zcat test_data/reproduce_tr/2019-09/*.json.gz | jq -r .url | sort) \
		| head
	# zcat test_data/reproduce_tr/2019-09/*.json.gz | sort | head -2 | jq -r .raw_content
# Re-runs mining with `--metadata test_data/regroup_tr`, then compares
# test_data/mined with test_data/reproduce: per-language document counts,
# `wc` totals of raw_content, and the sorted URLs.
# The second diff uses -W60 (side-by-side width) like the first one; the
# original `-w60` meant "ignore all white space" and left the width at its default.
# Declared phony because `test_with_metadata` is a command name, not a file to build.
.PHONY: test_with_metadata
test_with_metadata: test_data/regroup_tr/$(_SEGMENT).json.gz
	python -m cc_net mine --config test --metadata test_data/regroup_tr
	diff -y -W60 \
		<(zcat test_data/mined/2019-09/*.json.gz | jq -r .language | sort | uniq -c ) \
		<(zcat test_data/reproduce/2019-09/*.json.gz | jq -r .language | sort | uniq -c )
	diff -y -W60 \
		<(zcat test_data/mined/2019-09/*.json.gz | jq -r .raw_content | wc) \
		<(zcat test_data/reproduce/2019-09/*.json.gz | jq -r .raw_content | wc)
	diff \
		<(zcat test_data/mined/2019-09/*.json.gz | jq -r .url | sort) \
		<(zcat test_data/reproduce/2019-09/*.json.gz | jq -r .url | sort) \
		| head