-
Notifications
You must be signed in to change notification settings - Fork 0
/
makefile.coref_ml
75 lines (58 loc) · 2.44 KB
/
makefile.coref_ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Every recipe in this file is run with bash.
SHELL := /bin/bash

# ---- data selection ----
# Alternative data set, currently disabled:
#DATA_SET = train
DATA_SET := train_00-18
DATA_ID := pcedt
# The data version is the content of the last_id file of the selected data set;
# when that file cannot be read, the version falls back to 0000.
DATA_VERSION := $(shell cat data/analysed/$(DATA_ID)/$(DATA_SET)/last_id 2> /dev/null || echo 0000)
DATA_DIR = data/analysed/$(DATA_ID)/$(DATA_SET)/$(DATA_VERSION)

# ---- cluster settings ----
# LRC_FLAGS is inserted into the treex command lines of the rules below.
# It is only defined when LRC equals 1; otherwise it expands to nothing.
LRC := 1
JOBS := 50
ifeq ($(LRC),1)
  LRC_FLAGS = -p --qsub '-hard -l mem_free=8G -l act_mem_free=8G -l h_vmem=8G' --jobs $(JOBS)
endif
#==================================== ML =========================================
# ALIGNED_LANGUAGE is derived from LANGUAGE at parse time:
# it is cs when LANGUAGE is en, and en for any other value of LANGUAGE.
LANGUAGE := en
ifneq ($(LANGUAGE),en)
  ALIGNED_LANGUAGE := en
else
  ALIGNED_LANGUAGE := cs
endif
#----------------------------- data extraction -----------------------------------
# Directory whose makefile provides the train_table target invoked by extract_data.
DATA_EXTRACT_DIR=/home/mnovak/projects/czeng_coref
# The extraction makefile is given pcedt_bi as data source when DATA_ID is pcedt;
# any other DATA_ID is passed through unchanged.
ifeq ($(DATA_ID),pcedt)
DATA_ID_FOR_EXTRACT = pcedt_bi
else
DATA_ID_FOR_EXTRACT := $(DATA_ID)
endif
COREF_PRINTER_PARAMS=aligned_feats=1
# D is forwarded to the extraction makefile as DESC. It is not set anywhere in
# this file, so it is presumably meant to be given on the command line
# (make extract_data D="...") -- TODO confirm. Defaults to empty.
D ?=

# extract_data delegates to the train_table target of $(DATA_EXTRACT_DIR).
# It names an action rather than a file, hence .PHONY: a stray file called
# extract_data must not stop the rule from running.
# COREF_PRINTER_PARAMS is quoted so that a value containing spaces still reaches
# the sub-make as a single variable assignment.
.PHONY: extract_data
extract_data :
	$(MAKE) -C $(DATA_EXTRACT_DIR) train_table \
		DATA_SOURCE=$(DATA_ID_FOR_EXTRACT) \
		DATA_SET=$(DATA_SET) \
		ANOT=analysed \
		LANGUAGE=$(LANGUAGE) \
		DATA_TABLE_SCENARIO=/home/mnovak/projects/coref_bitext/scenarios/before_data_table.scen \
		COREF_PRINTER_PARAMS="$(COREF_PRINTER_PARAMS)" \
		LRC=$(LRC) \
		DESC="$(D)"
#----------------------------- coreference projection ----------------------------
ML_FRAMEWORK=/home/mnovak/projects/ml_framework
# Supervised alignment step of the projection pipeline.
# If a projection without supervised alignment is required, run
#     make projection_eval ALIGN_RESOLVER=
# Do not put a '#' in front of the step inside the recipe: the treex call is one
# backslash-continued shell command, so the shell would treat everything after
# the '#' as a comment and drop the remaining blocks as well.
ALIGN_RESOLVER_MODEL = /home/mnovak/projects/align_resolver/model/train.pcedt_19.mgiza_on_czeng.19b6cae385.vw.ranking.4d7ef.model
ALIGN_RESOLVER = My::AlignmentResolver align_language=$(ALIGNED_LANGUAGE) model_path=$(ALIGN_RESOLVER_MODEL)

# projection_eval:
#   1. creates tmp/projection_eval if needed;
#   2. runs the treex scenario over the documents listed in $(DATA_DIR)/list;
#      My::EvalCoref is told (via substitute=) to write one
#      tmp/projection_eval/<name>.$(DATA_SET).txt file per input.
#      The leading '-' makes make ignore a non-zero exit status of treex, so the
#      following steps run even if treex fails.
#   3. concatenates all *.$(DATA_SET).txt files found in tmp/projection_eval, in
#      sorted order, into tmp/projection_eval/$(DATA_SET).all;
#   4. feeds that file to eval.pl.
# NOTE: tmp/projection_eval is never cleaned here, so files matching
# *.$(DATA_SET).txt left over from earlier runs are included in step 3.
# The target names an action, not a file, hence .PHONY.
.PHONY: projection_eval
projection_eval :
	mkdir -p tmp/projection_eval
	-treex $(LRC_FLAGS) -Len -Ssrc \
		Read::Treex from=@$(DATA_DIR)/list \
		Coref::RemoveLinks type=text language=$(LANGUAGE) \
		$(ALIGN_RESOLVER) \
		My::ProjectCoreference language=$(ALIGNED_LANGUAGE) trg_language=$(LANGUAGE) \
		My::EvalCoref language=$(LANGUAGE) to='.' substitute='{^.*/([^\/]*)}{tmp/projection_eval/$$1.$(DATA_SET).txt}'
	find tmp/projection_eval -path "*.$(DATA_SET).txt" | sort | xargs cat > tmp/projection_eval/$(DATA_SET).all
	$(ML_FRAMEWORK)/scripts/eval.pl --acc --prf < tmp/projection_eval/$(DATA_SET).all
#============================ STANFORD COREF =======================================
# stanford_coref_sents runs a treex scenario over the documents listed in
# $(DATA_DIR)/list: Coref::EN::ResolveStanfordCoreNLP, then A2T::SetDocOrds,
# Coref::RearrangeLinks (retain_cataphora=1) and Coref::MarkMentionsForScorer,
# and finally Print::CorefSentences with path=tmp/pcedt_stanford_coref.
# The leading '-' makes make ignore a non-zero exit status of treex.
# NOTE(review): the output path hard-codes "pcedt" instead of using $(DATA_ID) --
# confirm whether that is intended when DATA_ID is overridden.
# The target names an action, not a file, hence .PHONY.
.PHONY: stanford_coref_sents
stanford_coref_sents :
	-treex $(LRC_FLAGS) -Len -Ssrc \
		Read::Treex from=@$(DATA_DIR)/list \
		Coref::EN::ResolveStanfordCoreNLP \
		A2T::SetDocOrds \
		Coref::RearrangeLinks retain_cataphora=1 \
		Coref::MarkMentionsForScorer \
		Print::CorefSentences path=tmp/pcedt_stanford_coref