-
Notifications
You must be signed in to change notification settings - Fork 475
/
parameters.ini
129 lines (100 loc) · 6.99 KB
/
parameters.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#----- Possible modes of operation -----------------------------------------------------------------------------------------------------------------#
# training mode (from scratch): set train_model to True, and use_pretrained_model to False (if training from scratch). #
# Must have train and valid sets in the dataset_text_folder, and test and deployment sets are optional. #
# training mode (from pretrained model): set train_model to True, and use_pretrained_model to True (if training from a pretrained model). #
# Must have train and valid sets in the dataset_text_folder, and test and deployment sets are optional. #
# prediction mode (using pretrained model): set train_model to False, and use_pretrained_model to True. #
# Must have either a test set or a deployment set. #
# NOTE: Whenever use_pretrained_model is set to True, pretrained_model_folder must be set to the folder containing the pretrained model to use, and #
# model.ckpt, dataset.pickle and parameters.ini must exist in the same folder as the checkpoint file. #
#---------------------------------------------------------------------------------------------------------------------------------------------------#
[mode]
# At least one of use_pretrained_model and train_model must be set to True.
train_model = True
use_pretrained_model = False
pretrained_model_folder = ./trained_models/conll_2003_en
[dataset]
dataset_text_folder = ./data/conll2003/en
# main_evaluation_mode should be either 'conll', 'bio', 'token', or 'binary'. ('conll' is entity-based)
# It determines which metric to use for early stopping, displaying during training, and plotting F1-score vs. epoch.
main_evaluation_mode = conll
output_folder = ./output
#---------------------------------------------------------------------------------------------------------------------#
# The parameters below are for advanced users. Their default values should yield good performance in most cases. #
#---------------------------------------------------------------------------------------------------------------------#
[ann]
use_character_lstm = True
character_embedding_dimension = 25
character_lstm_hidden_state_dimension = 25
# In order to use random initialization instead, set token_pretrained_embedding_filepath to empty string, as below:
# token_pretrained_embedding_filepath =
token_pretrained_embedding_filepath = ./data/word_vectors/glove.6B.100d.txt
token_embedding_dimension = 100
token_lstm_hidden_state_dimension = 100
use_crf = True
[training]
patience = 10
maximum_number_of_epochs = 100
# optimizer should be either 'sgd', 'adam', or 'adadelta'
optimizer = sgd
learning_rate = 0.005
# gradients will be clipped above |gradient_clipping_value| and below -|gradient_clipping_value|, if gradient_clipping_value is non-zero
# (set to 0 to disable gradient clipping)
gradient_clipping_value = 5.0
# dropout_rate should be between 0 and 1
dropout_rate = 0.5
# Upper bound on the number of CPU threads NeuroNER will use
number_of_cpu_threads = 8
# Upper bound on the number of GPU NeuroNER will use
# If number_of_gpus > 0, you need to have installed tensorflow-gpu
number_of_gpus = 0
[advanced]
experiment_name = test
# Append the scores to a column in the output file
output_scores = False
# tagging_format should be either 'bioes' or 'bio'
tagging_format = bioes
# tokenizer should be either 'spacy' or 'stanford'. The tokenizer is only used when the original data is provided only in BRAT format.
# - 'spacy' refers to spaCy (https://spacy.io). To install spacy: pip install -U spacy
# - 'stanford' refers to Stanford CoreNLP (https://stanfordnlp.github.io/CoreNLP/). Stanford CoreNLP is written in Java: to use it one has to start a
# Stanford CoreNLP server, which can tokenize sentences given on the fly. Stanford CoreNLP is portable, which means that it can be run
# without any installation.
# To download Stanford CoreNLP: https://stanfordnlp.github.io/CoreNLP/download.html
# To run Stanford CoreNLP, execute in the terminal: `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 50000`
# By default Stanford CoreNLP is in English. To use it in other languages, see: https://stanfordnlp.github.io/CoreNLP/human-languages.html
# Stanford CoreNLP 3.6.0 and higher requires Java 8. We have tested NeuroNER with Stanford CoreNLP 3.6.0.
tokenizer = spacy
# spacylanguage should be either 'de' (German), 'en' (English) or 'fr' (French). (https://spacy.io/docs/api/language-models)
# To install the spaCy language: `python -m spacy.de.download`; or `python -m spacy.en.download`; or `python -m spacy.fr.download`
spacylanguage = en
# If remap_unknown_tokens is set to True, map to UNK any token that hasn't been seen in neither the training set nor the pre-trained token embeddings.
remap_unknown_tokens_to_unk = True
# If load_only_pretrained_token_embeddings is set to True, then token embeddings will only be loaded if it exists in token_pretrained_embedding_filepath
# or in pretrained_model_checkpoint_filepath, even for the training set.
load_only_pretrained_token_embeddings = False
# If load_all_pretrained_token_embeddings is set to True, then all pretrained token embeddings will be loaded even for the tokens that do not appear in the dataset.
load_all_pretrained_token_embeddings = False
# If check_for_lowercase is set to True, the lowercased version of each token will also be checked when loading the pretrained embeddings.
# For example, if the token 'Boston' does not exist in the pretrained embeddings, then it is mapped to the embedding of its lowercased version 'boston',
# if it exists among the pretrained embeddings.
check_for_lowercase = True
# If check_for_digits_replaced_with_zeros is set to True, each token with digits replaced with zeros will also be checked when loading pretrained embeddings.
# For example, if the token '123-456-7890' does not exist in the pretrained embeddings, then it is mapped to the embedding of '000-000-0000',
# if it exists among the pretrained embeddings.
# If both check_for_lowercase and check_for_digits_replaced_with_zeros are set to True, then the lowercased version is checked before the digit-zeroed version.
check_for_digits_replaced_with_zeros = True
# If freeze_token_embeddings is set to True, token embedding will remain frozen (not be trained).
freeze_token_embeddings = False
# If debug is set to True, only 200 lines will be loaded for each split of the dataset.
debug = False
verbose = False
# plot_format specifies the format of the plots generated by NeuroNER. It should be either 'png' or 'pdf'.
plot_format = pdf
# specify which layers to reload from the pretrained model
reload_character_embeddings = True
reload_character_lstm = True
reload_token_embeddings = True
reload_token_lstm = True
reload_feedforward = True
reload_crf = True
parameters_filepath = ./parameters.ini