-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
122 lines (118 loc) · 3.41 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Initial settings
verbose: true # Verbose output
# NOTE(review): "classificaton" misspells "classification", but this exact key
# is referenced verbatim in the neural_network.out.units expression below —
# rename the key, that expression, and the consuming code together.
y_classificaton_column: "TUI" # Column which contains the classification result
drop_classificaton_columns: [] # Columns dropped before classification (same typo as above)
max_nb_data_per_class: 0 # Maximum number of rows to be classified. 0 if no limit
attributes_features: ["Has_Def", "SAB", "Parents_Types", "Labels", "Def"] # Possible values: Has_Def, SAB, Parents_Types, Labels, Def
test_size: 0.3 # Percentage of data used for testing
debug_output_path: "" # It is used to store the debug output; empty string disables/means unset — confirm against consumer
test_source: ["AEO"] # List of Source to be used for graph testing
# Preprocessing
stemming: false # Stemming of the words
# NOTE(review): key misspells "lemmatization"; the consumer must read this
# exact spelling — rename code and config together if fixed.
lemmitization: true # Lemmatization of the words
# Word2Vec
vector_size: 700 # vector_size (dimensionality of the word vectors)
window: 20 # window (context window size)
w2v_epochs: 40 # w2v_epochs (number of Word2Vec training epochs)
sequence_length: 100 # sequence_length (tokens per input sequence)
# Numerical data
numerical_data_shape: 130 # numerical_data_shape / 129 (ENG SAB) + 1 (Has_Definition) -> These are only the size that we can determine before running pipeline
# Neural network settings
# NOTE(review): the pasted source lost its indentation; the hierarchy below is
# reconstructed from the key semantics (steps lists under each branch,
# compile/fit options under neural_network) — confirm against the consumer.
neural_network:
  word_embedding:
    steps:
      - name: "Input-Word-Embedding"
        type: "Input"
        input_shape: 100 # sequence_length
      - name: "Embedding"
        type: "Embedding"
        # YAML has no Python None: the plain scalar `None` already parses as
        # the string "None"; the quotes only make that string type explicit.
        # Placeholder — presumably replaced at runtime with the vocabulary
        # length (see "Len of vocabulary" note); confirm against consumer.
        input_shape: "None" # Len of vocabulary
        # NOTE(review): "imput_length" looks like a typo for "input_length".
        # If the consumer reads "input_length", this key is silently ignored
        # and a default is used instead — confirm before renaming.
        imput_length: 100 # sequence_length
        output_shape: 700 # vector_size
        weights: "embeddings"
        trainable: false
      - name: "LSTM-1"
        type: "LSTM"
        units: 100 # sequence_length
        dropout: 0.2
        return_sequences: true
      - name: "LSTM-2"
        type: "LSTM"
        units: 100 # sequence_length
        dropout: 0.2
        return_sequences: false
      - name: "Dense-1-WE"
        type: "Dense"
        units: 220
        activation: "relu"
  # NOTE(review): "multi_layer_perception" misspells "perceptron"; key is read
  # verbatim by the consumer, so rename code and config together if fixed.
  multi_layer_perception:
    steps:
      - name: "Input-Multi-Layer-Perceptron"
        type: "Input"
        input_shape: 130 # numerical_data_shape
      - name: "Dense-1-MLP"
        type: "Dense"
        units: 60
        activation: "relu"
      - name: "Dense-2-MLP"
        type: "Dense"
        units: 10
        activation: "relu"
  bag_of_words:
    steps:
      - name: "Input-Bag-Of-Words"
        type: "Input"
        input_shape: 0 # presumably sized at runtime from the BOW vocabulary — confirm
      - name: "Dense-1-BOW"
        type: "Dense"
        units: 128
        activation: "relu"
      - name: "Dense-2-BOW"
        type: "Dense"
        units: 512
        activation: "relu"
      - name: "Batch-Normalization-1-BOW"
        type: "BatchNormalization"
      - name: "Dense-3-BOW"
        type: "Dense"
        units: 128
        activation: "relu"
      - name: "Dense-4-BOW"
        type: "Dense"
        units: 512
        activation: "relu"
      - name: "Dropout-1-BOW"
        type: "Dropout"
        rate: 0.25
      - name: "Dense-5-BOW"
        type: "Dense"
        units: 128
        activation: "relu"
  concatenate:
    steps:
      - name: "Dense-1"
        type: "Dense"
        units: 152
        activation: "relu"
  out:
    name: "Final-Dense"
    type: "Dense"
    # Looks like a Python expression the consumer evaluates to size the output
    # layer (one unit per distinct class label). Single quotes keep the inner
    # double quotes intact. NOTE(review): evaluating config strings is a
    # code-injection risk if this file is ever user-supplied — flagging only.
    units: 'data[config["y_classificaton_column"]].nunique()'
    activation: "softmax"
  optimizer:
    name: "adam"
    learning_rate: 0.001 # Unused
    decay: 0.0 # Unused
    momentum: 0.0 # Unused
    nesterov: false # Unused
    clipnorm: 0.0 # Unused
    clipvalue: 0.0 # Unused
    beta_1: 0.9 # Unused
    beta_2: 0.999 # Unused
    # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse the dot-less `1e-08`
    # as the STRING "1e-08", not a float. Harmless while marked Unused, but
    # write `1.0e-08` if this value is ever consumed.
    epsilon: 1e-08 # Unused
    amsgrad: false # Unused
  loss: "sparse_categorical_crossentropy"
  metrics: [] # Unused
  batch_size: 2
  epochs: 1
  shuffle: true