-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
122 lines (118 loc) · 3.41 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Initial settings
verbose: true # Verbose output
# NOTE(review): "classificaton" misspells "classification", but this exact key
# is referenced verbatim in the neural_network.out.units expression below —
# rename the key, that expression, and the consuming code together.
y_classificaton_column: "TUI" # Column which contains the classification result
drop_classificaton_columns: [] # Columns dropped before classification (same typo as above)
max_nb_data_per_class: 0 # Maximum number of rows to be classified. 0 if no limit
attributes_features: ["Has_Def", "SAB", "Parents_Types", "Labels", "Def"] # Possible values: Has_Def, SAB, Parents_Types, Labels, Def
test_size: 0.3 # Percentage of data used for testing
debug_output_path: "" # It is used to store the debug output; empty string disables/means unset — confirm against consumer
test_source: ["AEO"] # List of Source to be used for graph testing
# Preprocessing
stemming: false # Stemming of the words
# NOTE(review): key misspells "lemmatization"; the consumer must read this
# exact spelling — rename code and config together if fixed.
lemmitization: true # Lemmatization of the words
# Word2Vec
vector_size: 700 # vector_size (dimensionality of the word vectors)
window: 20 # window (context window size)
w2v_epochs: 40 # w2v_epochs (number of Word2Vec training epochs)
sequence_length: 100 # sequence_length (tokens per input sequence)
# Numerical data
numerical_data_shape: 130 # numerical_data_shape / 129 (ENG SAB) + 1 (Has_Definition) -> These are only the size that we can determine before running pipeline
# Neural network settings
# NOTE(review): the pasted source lost its indentation; the hierarchy below is
# reconstructed from the key semantics (steps lists under each branch,
# compile/fit options under neural_network) — confirm against the consumer.
neural_network:
  word_embedding:
    steps:
      - name: "Input-Word-Embedding"
        type: "Input"
        input_shape: 100 # sequence_length
      - name: "Embedding"
        type: "Embedding"
        # YAML has no Python None: the plain scalar `None` already parses as
        # the string "None"; the quotes only make that string type explicit.
        # Placeholder — presumably replaced at runtime with the vocabulary
        # length (see "Len of vocabulary" note); confirm against consumer.
        input_shape: "None" # Len of vocabulary
        # NOTE(review): "imput_length" looks like a typo for "input_length".
        # If the consumer reads "input_length", this key is silently ignored
        # and a default is used instead — confirm before renaming.
        imput_length: 100 # sequence_length
        output_shape: 700 # vector_size
        weights: "embeddings"
        trainable: false
      - name: "LSTM-1"
        type: "LSTM"
        units: 100 # sequence_length
        dropout: 0.2
        return_sequences: true
      - name: "LSTM-2"
        type: "LSTM"
        units: 100 # sequence_length
        dropout: 0.2
        return_sequences: false
      - name: "Dense-1-WE"
        type: "Dense"
        units: 220
        activation: "relu"
  # NOTE(review): "multi_layer_perception" misspells "perceptron"; key is read
  # verbatim by the consumer, so rename code and config together if fixed.
  multi_layer_perception:
    steps:
      - name: "Input-Multi-Layer-Perceptron"
        type: "Input"
        input_shape: 130 # numerical_data_shape
      - name: "Dense-1-MLP"
        type: "Dense"
        units: 60
        activation: "relu"
      - name: "Dense-2-MLP"
        type: "Dense"
        units: 10
        activation: "relu"
  bag_of_words:
    steps:
      - name: "Input-Bag-Of-Words"
        type: "Input"
        input_shape: 0 # presumably sized at runtime from the BOW vocabulary — confirm
      - name: "Dense-1-BOW"
        type: "Dense"
        units: 128
        activation: "relu"
      - name: "Dense-2-BOW"
        type: "Dense"
        units: 512
        activation: "relu"
      - name: "Batch-Normalization-1-BOW"
        type: "BatchNormalization"
      - name: "Dense-3-BOW"
        type: "Dense"
        units: 128
        activation: "relu"
      - name: "Dense-4-BOW"
        type: "Dense"
        units: 512
        activation: "relu"
      - name: "Dropout-1-BOW"
        type: "Dropout"
        rate: 0.25
      - name: "Dense-5-BOW"
        type: "Dense"
        units: 128
        activation: "relu"
  concatenate:
    steps:
      - name: "Dense-1"
        type: "Dense"
        units: 152
        activation: "relu"
  out:
    name: "Final-Dense"
    type: "Dense"
    # Looks like a Python expression the consumer evaluates to size the output
    # layer (one unit per distinct class label). Single quotes keep the inner
    # double quotes intact. NOTE(review): evaluating config strings is a
    # code-injection risk if this file is ever user-supplied — flagging only.
    units: 'data[config["y_classificaton_column"]].nunique()'
    activation: "softmax"
  optimizer:
    name: "adam"
    learning_rate: 0.001 # Unused
    decay: 0.0 # Unused
    momentum: 0.0 # Unused
    nesterov: false # Unused
    clipnorm: 0.0 # Unused
    clipvalue: 0.0 # Unused
    beta_1: 0.9 # Unused
    beta_2: 0.999 # Unused
    # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse the dot-less `1e-08`
    # as the STRING "1e-08", not a float. Harmless while marked Unused, but
    # write `1.0e-08` if this value is ever consumed.
    epsilon: 1e-08 # Unused
    amsgrad: false # Unused
  loss: "sparse_categorical_crossentropy"
  metrics: [] # Unused
  batch_size: 2
  epochs: 1
  shuffle: true