` is the path to the chosen model's config file (e.g.
- `deeppavlov/configs/ner/slotfill_dstc2.json`) or just name without
- *.json* extension (e.g. `slotfill_dstc2`)
+ `deeppavlov/configs/classifiers/insults_kaggle_bert.json`) or just name without
+ *.json* extension (e.g. `insults_kaggle_bert`)
### Command line interface (CLI)
@@ -305,10 +303,7 @@ into the `"class_name"` field.
DeepPavlov is Apache 2.0-licensed.
-## The Team
-
-DeepPavlov is built and maintained by [Neural Networks and Deep Learning Lab](https://www.facebook.com/deepmipt/)
-at [MIPT](https://mipt.ru/english/).
+##
diff --git a/deeppavlov/_meta.py b/deeppavlov/_meta.py
index 2ee55f7d22..df485ac732 100644
--- a/deeppavlov/_meta.py
+++ b/deeppavlov/_meta.py
@@ -1,4 +1,4 @@
-__version__ = '0.17.2'
+__version__ = '1.0.0rc0'
__author__ = 'Neural Networks and Deep Learning lab, MIPT'
__description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'
__keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']
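
The bump above moves the package to the `1.0.0rc0` release candidate, which is what downstream code sees at runtime. A quick sanity check (assuming, as the module layout suggests, that `deeppavlov` re-exports the `_meta` fields — standard packaging practice):

```python
# Verify which release is installed before relying on 1.0-only behaviour.
import deeppavlov

print(deeppavlov.__version__)  # expected: '1.0.0rc0' for this release candidate
```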
diff --git a/deeppavlov/configs/classifiers/insults_kaggle.json b/deeppavlov/configs/classifiers/insults_kaggle.json
deleted file mode 100644
index 8627589eca..0000000000
--- a/deeppavlov/configs/classifiers/insults_kaggle.json
+++ /dev/null
@@ -1,155 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "Comment",
- "y": "Class",
- "data_path": "{DOWNLOADS_PATH}/insults_data"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_prep"
- ],
- "class_name": "dirty_comments_preprocessor"
- },
- {
- "in": "x_prep",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wordpunct_tok_reddit_comments_2017_11_300.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "last_layer_activation": "softmax",
- "coef_reg_cnn": 1e-3,
- "coef_reg_den": 1e-2,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 1000,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 5,
- "log_every_n_epochs": 5,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_v2"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/insults_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/reddit_fastText/wordpunct_tok_reddit_comments_2017_11_300.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_v2.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
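
The config deleted above (like every config in this diff) resolves its paths through the `metadata.variables` block, where later variables may reference earlier ones. A minimal sketch of that interpolation, assuming plain ordered string substitution (the real resolver lives in DeepPavlov's config-parsing code):

```python
from pathlib import Path

variables = {
    "ROOT_PATH": "~/.deeppavlov",
    "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
    "MODELS_PATH": "{ROOT_PATH}/models",
    "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_v2",
}

resolved = {}
for name, value in variables.items():
    for ref, repl in resolved.items():  # earlier entries substitute into later ones
        value = value.replace("{" + ref + "}", repl)
    resolved[name] = value

print(Path(resolved["MODEL_PATH"]).expanduser())
# e.g. /home/user/.deeppavlov/models/classifiers/insults_kaggle_v2
```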
diff --git a/deeppavlov/configs/classifiers/insults_kaggle_bert.json b/deeppavlov/configs/classifiers/insults_kaggle_bert.json
index d64f2363b4..a9ff62015e 100644
--- a/deeppavlov/configs/classifiers/insults_kaggle_bert.json
+++ b/deeppavlov/configs/classifiers/insults_kaggle_bert.json
@@ -18,9 +18,9 @@
],
"pipe": [
{
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": false,
+ "class_name": "torch_transformers_preprocessor",
+ "vocab_file": "{TRANSFORMER}",
+ "do_lower_case": true,
"max_seq_length": 64,
"in": [
"x"
@@ -37,48 +37,64 @@
],
"save_path": "{MODEL_PATH}/classes.dict",
"load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
+ "in": [
+ "y"
+ ],
+ "out": [
+ "y_ids"
+ ]
},
{
- "in": "y_ids",
- "out": "y_onehot",
+ "in": [
+ "y_ids"
+ ],
+ "out": [
+ "y_onehot"
+ ],
"class_name": "one_hotter",
"depth": "#classes_vocab.len",
"single_vector": true
},
{
- "class_name": "bert_classifier",
+ "class_name": "torch_transformers_classifier",
"n_classes": "#classes_vocab.len",
"return_probas": true,
- "one_hot_labels": true,
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/bert_model.ckpt",
+ "pretrained_bert": "{TRANSFORMER}",
"save_path": "{MODEL_PATH}/model",
"load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 1e-05,
+ "optimizer": "AdamW",
+ "optimizer_parameters": {
+ "lr": 1e-05
+ },
"learning_rate_drop_patience": 5,
"learning_rate_drop_div": 2.0,
"in": [
"bert_features"
],
"in_y": [
- "y_onehot"
+ "y_ids"
],
"out": [
"y_pred_probas"
]
},
{
- "in": "y_pred_probas",
- "out": "y_pred_ids",
+ "in": [
+ "y_pred_probas"
+ ],
+ "out": [
+ "y_pred_ids"
+ ],
"class_name": "proba2labels",
"max_proba": true
},
{
- "in": "y_pred_ids",
- "out": "y_pred_labels",
+ "in": [
+ "y_pred_ids"
+ ],
+ "out": [
+ "y_pred_labels"
+ ],
"ref": "classes_vocab"
}
],
@@ -109,15 +125,15 @@
"valid",
"test"
],
- "class_name": "nn_trainer",
- "tensorboard_log_dir": "{MODEL_PATH}/"
+ "class_name": "torch_trainer"
},
"metadata": {
"variables": {
+ "TRANSFORMER": "bert-base-uncased",
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_v3"
+ "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_torch_bert"
},
"download": [
{
@@ -125,11 +141,7 @@
"subdir": "{DOWNLOADS_PATH}"
},
{
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_v3.tar.gz",
+ "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v0.tar.gz",
"subdir": "{MODELS_PATH}/classifiers"
}
]
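
With this change the config pulls `bert-base-uncased` from the Hugging Face hub via the `{TRANSFORMER}` variable instead of a downloaded TF checkpoint. A minimal usage sketch with DeepPavlov's public Python API (`build_model` is the documented entry point; the example inputs and output labels are illustrative):

```python
from deeppavlov import build_model

# download=True fetches the insults_kaggle_torch_bert weights listed in metadata.
model = build_model('insults_kaggle_bert', download=True)
print(model(['you are stupid', 'have a nice day']))
# e.g. ['Insult', 'Not Insult']
```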
diff --git a/deeppavlov/configs/classifiers/insults_kaggle_bert_torch.json b/deeppavlov/configs/classifiers/insults_kaggle_bert_torch.json
deleted file mode 100644
index a9ff62015e..0000000000
--- a/deeppavlov/configs/classifiers/insults_kaggle_bert_torch.json
+++ /dev/null
@@ -1,149 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "Comment",
- "y": "Class",
- "data_path": "{DOWNLOADS_PATH}/insults_data"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "torch_transformers_preprocessor",
- "vocab_file": "{TRANSFORMER}",
- "do_lower_case": true,
- "max_seq_length": 64,
- "in": [
- "x"
- ],
- "out": [
- "bert_features"
- ]
- },
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": [
- "y"
- ],
- "out": [
- "y_ids"
- ]
- },
- {
- "in": [
- "y_ids"
- ],
- "out": [
- "y_onehot"
- ],
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "torch_transformers_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "pretrained_bert": "{TRANSFORMER}",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "optimizer": "AdamW",
- "optimizer_parameters": {
- "lr": 1e-05
- },
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "y_ids"
- ],
- "out": [
- "y_pred_probas"
- ]
- },
- {
- "in": [
- "y_pred_probas"
- ],
- "out": [
- "y_pred_ids"
- ],
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": [
- "y_pred_ids"
- ],
- "out": [
- "y_pred_labels"
- ],
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- },
- "accuracy",
- "f1_macro"
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "torch_trainer"
- },
- "metadata": {
- "variables": {
- "TRANSFORMER": "bert-base-uncased",
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_torch_bert"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/insults_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v0.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
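
Note the blob hashes: the file deleted here (`a9ff62015e`) is exactly what `insults_kaggle_bert.json` now contains, so the PyTorch pipeline simply took over the shorter name. Retraining under the new name is a one-liner (a sketch using DeepPavlov's documented `train_model` helper):

```python
from deeppavlov import train_model

# Fine-tunes bert-base-uncased on the insults data; checkpoints go to
# {MODELS_PATH}/classifiers/insults_kaggle_torch_bert as configured above.
model = train_model('insults_kaggle_bert', download=True)
```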
diff --git a/deeppavlov/configs/classifiers/insults_kaggle_conv_bert.json b/deeppavlov/configs/classifiers/insults_kaggle_conv_bert.json
deleted file mode 100644
index 01f13affca..0000000000
--- a/deeppavlov/configs/classifiers/insults_kaggle_conv_bert.json
+++ /dev/null
@@ -1,153 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "Comment",
- "y": "Class",
- "data_path": "{DOWNLOADS_PATH}/insults_data"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 64,
- "in": [
- "x"
- ],
- "out": [
- "bert_features"
- ]
- },
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": [
- "y"
- ],
- "out": [
- "y_ids"
- ]
- },
- {
- "in": [
- "y_ids"
- ],
- "out": [
- "y_onehot"
- ],
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "bert_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "one_hot_labels": true,
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 1e-05,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ]
- },
- {
- "in": [
- "y_pred_probas"
- ],
- "out": [
- "y_pred_ids"
- ],
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": [
- "y_pred_ids"
- ],
- "out": [
- "y_pred_labels"
- ],
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- },
- "accuracy",
- "f1_macro"
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer",
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_v4"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/insults_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_v4.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
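
Every classifier pipeline in this diff routes label ids through the same `one_hotter` step, with `depth` set to the class count and `single_vector` collapsing a sample's labels into one multi-hot vector. An illustrative re-implementation (not DeepPavlov's actual component):

```python
import numpy as np

def one_hotter(batch, depth, single_vector=False):
    """batch: list of label-id lists, one per sample."""
    out = []
    for ids in batch:
        onehots = np.eye(depth)[list(ids)]
        out.append(onehots.sum(axis=0) if single_vector else onehots)
    return out

print(one_hotter([[1], [0, 2]], depth=3, single_vector=True))
# [array([0., 1., 0.]), array([1., 0., 1.])]
```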
diff --git a/deeppavlov/configs/classifiers/intents_dstc2.json b/deeppavlov/configs/classifiers/intents_dstc2.json
deleted file mode 100644
index 828c01d634..0000000000
--- a/deeppavlov/configs/classifiers/intents_dstc2.json
+++ /dev/null
@@ -1,156 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DOWNLOADS_PATH}/dstc2"
- },
- "dataset_iterator": {
- "class_name": "dstc2_intents_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids",
- "special_tokens": [""]
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/dstc2_fastText_model.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "id": "my_one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "classes": "#classes_vocab.keys()",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 512,
- "optimizer": "Adam",
- "learning_rate": 0.1,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "coef_reg_cnn": 1e-4,
- "coef_reg_den": 1e-4,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "confidence_threshold": 0.5
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- },
- {
- "ref": "my_one_hotter",
- "in": "y_pred_ids",
- "out": "y_pred_onehot"
- }
- ],
- "out": [
- "y_pred_labels",
- "y_pred_probas"
- ]
- },
- "train": {
- "epochs": 1000,
- "batch_size": 64,
- "metrics": [
- {
- "name": "sets_accuracy",
- "inputs": [
- "y",
- "y_pred_labels"
- ]
- },
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 5,
- "log_every_n_batches": 100,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_dstc2_v10"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_dstc2_v10.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
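
The DSTC2 intent config above is multi-label, so its `proba2labels` step uses `confidence_threshold` rather than the `max_proba` used by the single-label configs elsewhere in this diff (and `relation_prediction_rus` below uses `top_n`). A sketch of the three modes (illustrative, not the library component):

```python
import numpy as np

def proba2labels(probas, max_proba=False, confidence_threshold=None, top_n=None):
    if max_proba:                         # single-label: most probable class per sample
        return [int(np.argmax(p)) for p in probas]
    if confidence_threshold is not None:  # multi-label: all classes above threshold
        return [np.flatnonzero(np.asarray(p) > confidence_threshold).tolist() for p in probas]
    if top_n is not None:                 # ranked: the n most probable class ids
        return [np.argsort(p)[::-1][:top_n].tolist() for p in probas]
    raise ValueError("need max_proba, confidence_threshold or top_n")

print(proba2labels([[0.6, 0.7, 0.2]], confidence_threshold=0.5))  # [[0, 1]]
print(proba2labels([[0.1, 0.7, 0.2]], max_proba=True))            # [1]
```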
diff --git a/deeppavlov/configs/classifiers/intents_dstc2_bert.json b/deeppavlov/configs/classifiers/intents_dstc2_bert.json
deleted file mode 100644
index 0143446b53..0000000000
--- a/deeppavlov/configs/classifiers/intents_dstc2_bert.json
+++ /dev/null
@@ -1,121 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DOWNLOADS_PATH}/dstc2"
- },
- "dataset_iterator": {
- "class_name": "dstc2_intents_iterator",
- "seed": 42
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": ["y"],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids",
- "special_tokens": [""]
- },
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 64,
- "in": ["x"],
- "out": ["bert_features"]
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "id": "my_one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "bert_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "one_hot_labels": true,
- "multilabel": true,
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 2e-05,
- "learning_rate_drop_patience": 3,
- "learning_rate_drop_div": 2.0,
- "in": ["bert_features"],
- "in_y": ["y_onehot"],
- "out": ["y_pred_probas"]
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "confidence_threshold": 0.5
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- },
- {
- "ref": "my_one_hotter",
- "in": "y_pred_ids",
- "out": "y_pred_onehot"
- }
- ],
- "out": ["y_pred_probas", "y_pred_labels"]
- },
- "train": {
- "metrics": [
- {
- "name": "sets_accuracy",
- "inputs": ["y", "y_pred_labels"]
- },
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "show_examples": false,
- "batch_size": 32,
- "pytest_max_batches": 2,
- "validation_patience": 10,
- "val_every_n_batches": 100,
- "log_every_n_batches": 100,
- "validate_best": true,
- "test_best": true,
- "tensorboard_log_dir": "{MODEL_PATH}/logs"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_dstc2_bert_v0"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/dstc2_v2.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/dstc2"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_dstc2_bert_v0.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
-
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/intents_dstc2_big.json b/deeppavlov/configs/classifiers/intents_dstc2_big.json
deleted file mode 100644
index d6a458dcab..0000000000
--- a/deeppavlov/configs/classifiers/intents_dstc2_big.json
+++ /dev/null
@@ -1,155 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DOWNLOADS_PATH}/dstc2"
- },
- "dataset_iterator": {
- "class_name": "dstc2_intents_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids",
- "special_tokens": [""]
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "id": "my_one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "classes": "#classes_vocab.keys()",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 512,
- "optimizer": "Adam",
- "learning_rate": 0.1,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "coef_reg_cnn": 1e-4,
- "coef_reg_den": 1e-4,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "confidence_threshold": 0.5
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- },
- {
- "ref": "my_one_hotter",
- "in": "y_pred_ids",
- "out": "y_pred_onehot"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 1000,
- "batch_size": 64,
- "metrics": [
- {
- "name": "sets_accuracy",
- "inputs": [
- "y",
- "y_pred_labels"
- ]
- },
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 5,
- "log_every_n_batches": 100,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_dstc2_v11"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_dstc2_v11.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/intents_sample_csv.json b/deeppavlov/configs/classifiers/intents_sample_csv.json
deleted file mode 100644
index 4b01a2d301..0000000000
--- a/deeppavlov/configs/classifiers/intents_sample_csv.json
+++ /dev/null
@@ -1,160 +0,0 @@
-{
- "dataset": {
- "type": "classification",
- "format": "csv",
- "sep": ",",
- "header": 0,
- "names": [
- "text",
- "classes"
- ],
- "class_sep": ",",
- "train": "sample.csv",
- "data_path": "{DOWNLOADS_PATH}/sample",
- "x": "text",
- "y": "classes",
- "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv",
- "seed": 42,
- "field_to_split": "train",
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/dstc2_fastText_model.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 1,
- 2,
- 3
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "coef_reg_cnn": 1e-4,
- "coef_reg_den": 1e-4,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_snips_v9"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv",
- "subdir": "{DOWNLOADS_PATH}/sample"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_v9.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
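
Both sample configs read a single `train` file and carve a validation set out of it via `field_to_split`/`split_fields`/`split_proportions`. A sketch of that 90/10 split under the stated seed (illustrative; the real logic sits in the classification dataset code):

```python
import random

def split_field(rows, proportions, seed=42):
    rng = random.Random(seed)
    rows = list(rows)
    rng.shuffle(rows)
    cut = round(len(rows) * proportions[0])
    return rows[:cut], rows[cut:]

train, valid = split_field(range(100), [0.9, 0.1])
print(len(train), len(valid))  # 90 10
```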
diff --git a/deeppavlov/configs/classifiers/intents_sample_json.json b/deeppavlov/configs/classifiers/intents_sample_json.json
deleted file mode 100644
index b87d3274be..0000000000
--- a/deeppavlov/configs/classifiers/intents_sample_json.json
+++ /dev/null
@@ -1,155 +0,0 @@
-{
- "dataset": {
- "type": "classification",
- "format": "json",
- "orient": "records",
- "lines": true,
- "data_path": "{DOWNLOADS_PATH}/sample",
- "train": "sample.json",
- "x": "text",
- "y": "intents",
- "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.json",
- "seed": 42,
- "field_to_split": "train",
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/dstc2_fastText_model.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 1,
- 2,
- 3
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "coef_reg_cnn": 1e-4,
- "coef_reg_den": 1e-4,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_snips_v9"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.json",
- "subdir": "{DOWNLOADS_PATH}/sample"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_v9.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/intents_snips.json b/deeppavlov/configs/classifiers/intents_snips.json
deleted file mode 100644
index 5f0aa89cd0..0000000000
--- a/deeppavlov/configs/classifiers/intents_snips.json
+++ /dev/null
@@ -1,141 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "snips_reader",
- "x": "text",
- "y": "intents",
- "data_path": "{DOWNLOADS_PATH}/snips"
- },
- "dataset_iterator": {
- "class_name": "snips_intents_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "level": "token",
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/dstc2_fastText_model.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 1,
- 2,
- 3
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "coef_reg_cnn": 1e-4,
- "coef_reg_den": 1e-4,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 1000,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 5,
- "log_every_n_epochs": 5,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_snips_v9"
- },
- "download": [
-{
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_v9.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/intents_snips_big.json b/deeppavlov/configs/classifiers/intents_snips_big.json
deleted file mode 100644
index 15b5adc648..0000000000
--- a/deeppavlov/configs/classifiers/intents_snips_big.json
+++ /dev/null
@@ -1,141 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "snips_reader",
- "x": "text",
- "y": "intents",
- "data_path": "{DOWNLOADS_PATH}/snips"
- },
- "dataset_iterator": {
- "class_name": "snips_intents_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "level": "token",
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "categorical_crossentropy",
- "coef_reg_cnn": 1e-4,
- "coef_reg_den": 1e-4,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 1000,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_snips_v10"
- },
- "download": [
-{
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_v10.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/intents_snips_sklearn.json b/deeppavlov/configs/classifiers/intents_snips_sklearn.json
deleted file mode 100644
index 7847aa3e15..0000000000
--- a/deeppavlov/configs/classifiers/intents_snips_sklearn.json
+++ /dev/null
@@ -1,164 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "snips_reader",
- "x": "text",
- "y": "intents",
- "data_path": "{DOWNLOADS_PATH}/snips"
- },
- "dataset_iterator": {
- "class_name": "snips_intents_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_vec"
- ],
- "fit_on": [
- "x",
- "y_ids"
- ],
- "id": "tfidf_vec",
- "class_name": "sklearn_component",
- "save_path": "{MODEL_PATH}/tfidf.pkl",
- "load_path": "{MODEL_PATH}/tfidf.pkl",
- "model_class": "sklearn.feature_extraction.text:TfidfVectorizer",
- "infer_method": "transform",
- "lowercase": true,
- "analyzer": "word",
- "ngram_range": [
- 1,
- 5
- ],
- "max_features": 10000,
- "norm": null
- },
- {
- "in": [
- "x_vec"
- ],
- "out": [
- "x_sel"
- ],
- "fit_on": [
- "x_vec",
- "y_ids"
- ],
- "id": "selector",
- "class_name": "sklearn_component",
- "save_path": "{MODEL_PATH}/selectkbest.pkl",
- "load_path": "{MODEL_PATH}/selectkbest.pkl",
- "model_class": "sklearn.feature_selection:SelectKBest",
- "infer_method": "transform",
- "score_func": "sklearn.feature_selection:chi2",
- "k": 1000
- },
- {
- "in": [
- "x_sel"
- ],
- "out": [
- "x_pca"
- ],
- "fit_on": [
- "x_sel"
- ],
- "id": "pca",
- "class_name": "sklearn_component",
- "save_path": "{MODEL_PATH}/pca.pkl",
- "load_path": "{MODEL_PATH}/pca.pkl",
- "model_class": "sklearn.decomposition:PCA",
- "infer_method": "transform",
- "n_components": 300
- },
- {
- "class_name": "one_hotter",
- "id": "onehotter",
- "depth": "#classes_vocab.len",
- "in": "y_ids",
- "out": "y_onehot",
- "single_vector": true
- },
- {
- "in": [
- "x_pca"
- ],
- "out": [
- "y_pred_onehot"
- ],
- "fit_on": [
- "x_pca",
- "y_onehot"
- ],
- "class_name": "sklearn_component",
- "main": true,
- "save_path": "{MODEL_PATH}/model.pkl",
- "load_path": "{MODEL_PATH}/model.pkl",
- "model_class": "sklearn.neighbors:KNeighborsClassifier",
- "infer_method": "predict",
- "ensure_list_output": true
- },
- {
- "class_name": "proba2labels",
- "in": "y_pred_onehot",
- "out": "y_pred_ids",
- "max_proba": true
- },
- {
- "ref": "classes_vocab",
- "in": "y_pred_ids",
- "out": "y_pred_labels"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "batch_size": 64,
- "metrics": [
- "accuracy"
- ],
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "class_name": "fit_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v11"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_sklearn_v11.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
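
The sklearn config above chains four `sklearn_component` steps: TF-IDF features, chi-squared feature selection, PCA, and a k-NN classifier. The same pipeline in plain scikit-learn, as a sketch on toy data (the densify step is needed because PCA does not accept the sparse TF-IDF matrix; `k` and `n_components` are shrunk to fit the toy set):

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

texts = ["play some jazz", "book a table", "what is the weather"] * 10
labels = ["PlayMusic", "BookRestaurant", "GetWeather"] * 10

pipe = make_pipeline(
    TfidfVectorizer(lowercase=True, analyzer="word", ngram_range=(1, 5)),
    SelectKBest(chi2, k=10),                            # config: k=1000
    FunctionTransformer(lambda X: X.toarray(), accept_sparse=True),
    PCA(n_components=3),                                # config: n_components=300
    KNeighborsClassifier(),
)
pipe.fit(texts, labels)
print(pipe.predict(["play a song"]))                    # e.g. ['PlayMusic']
```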
diff --git a/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json b/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json
deleted file mode 100644
index b7f4e70712..0000000000
--- a/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json
+++ /dev/null
@@ -1,182 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "snips_reader",
- "x": "text",
- "y": "intents",
- "data_path": "{DOWNLOADS_PATH}/snips"
- },
- "dataset_iterator": {
- "class_name": "snips_intents_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_vec"
- ],
- "fit_on": [
- "x",
- "y_ids"
- ],
- "id": "my_tfidf_vectorizer",
- "class_name": "sklearn_component",
- "save_path": "{MODEL_PATH}/tfidf.pkl",
- "load_path": "{MODEL_PATH}/tfidf.pkl",
- "model_class": "sklearn.feature_extraction.text:TfidfVectorizer",
- "infer_method": "transform",
- "lowercase": true,
- "analyzer": "word"
- },
- {
- "in": [
- "x_vec"
- ],
- "out": [
- "x_sel"
- ],
- "fit_on": [
- "x_vec",
- "y_ids"
- ],
- "id": "my_selector",
- "class_name": "sklearn_component",
- "save_path": "{MODEL_PATH}/selectkbest.pkl",
- "load_path": "{MODEL_PATH}/selectkbest.pkl",
- "model_class": "sklearn.feature_selection:SelectKBest",
- "infer_method": "transform",
- "score_func": "sklearn.feature_selection:chi2",
- "k": 1000
- },
- {
- "in": [
- "x_sel"
- ],
- "out": [
- "x_pca"
- ],
- "fit_on": [
- "x_sel"
- ],
- "id": "my_pca",
- "class_name": "sklearn_component",
- "save_path": "{MODEL_PATH}/pca.pkl",
- "load_path": "{MODEL_PATH}/pca.pkl",
- "model_class": "sklearn.decomposition:PCA",
- "infer_method": "transform",
- "n_components": 300
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_moses_tokenizer"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin",
- "dim": 300
- },
- {
- "class_name": "one_hotter",
- "id": "my_onehotter",
- "depth": "#classes_vocab.len",
- "in": "y_ids",
- "out": "y_onehot",
- "single_vector": true
- },
- {
- "in": "x_tok",
- "out": "x_weighted_emb",
- "class_name": "tfidf_weighted",
- "id": "my_weighted_embedder",
- "embedder": "#my_embedder",
- "tokenizer": "#my_tokenizer",
- "vectorizer": "#my_tfidf_vectorizer",
- "mean": true
- },
- {
- "in": [
- "x_pca",
- "x_weighted_emb"
- ],
- "out": [
- "y_pred_ids"
- ],
- "fit_on": [
- "x_pca",
- "x_weighted_emb",
- "y_ids"
- ],
- "class_name": "sklearn_component",
- "main": true,
- "save_path": "{MODEL_PATH}/model.pkl",
- "load_path": "{MODEL_PATH}/model.pkl",
- "model_class": "sklearn.linear_model:LogisticRegression",
- "infer_method": "predict",
- "ensure_list_output": true
- },
- {
- "ref": "classes_vocab",
- "in": "y_pred_ids",
- "out": "y_pred_labels"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "batch_size": 64,
- "metrics": [
- "accuracy"
- ],
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "class_name": "fit_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v12"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_sklearn_v12.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
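
The `tfidf_weighted` step above averages fastText word vectors with TF-IDF weights (`"mean": true`). A sketch of the idea; `word_vectors` and `idf` stand in for the fastText embedder and the fitted vectorizer:

```python
import numpy as np

def tfidf_weighted_mean(tokens, word_vectors, idf, dim=300):
    vecs, weights = [], []
    for tok in tokens:
        if tok in word_vectors:
            vecs.append(word_vectors[tok])
            weights.append(idf.get(tok, 1.0))
    if not vecs:                        # no known tokens: fall back to zeros
        return np.zeros(dim)
    w = np.array(weights)
    return (np.array(vecs) * w[:, None]).sum(axis=0) / w.sum()

rng = np.random.default_rng(0)
word_vectors = {"book": rng.normal(size=300), "table": rng.normal(size=300)}
emb = tfidf_weighted_mean(["book", "a", "table"], word_vectors, {"book": 2.0, "table": 1.5})
print(emb.shape)  # (300,)
```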
diff --git a/deeppavlov/configs/classifiers/relation_prediction_rus.json b/deeppavlov/configs/classifiers/relation_prediction_rus.json
deleted file mode 100644
index 24f16cc159..0000000000
--- a/deeppavlov/configs/classifiers/relation_prediction_rus.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "Question",
- "y": "Class",
- "data_path": "{DOWNLOADS_PATH}/relation_prediction"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": ["y"],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": ["x_tok"],
- "class_name": "str_lower",
- "out": ["x_lower"]
- },
- {
- "in": "x_lower",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.bin"
- },
- {
- "in": "y_ids",
- "out": ["y_onehot"],
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len"
- },
- {
- "in": ["x_emb"],
- "in_y": ["y_onehot"],
- "out": ["y_pred_probas"],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [1, 2, 3],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.05,
- "learning_rate_decay": 0.1,
- "loss": "categorical_crossentropy",
- "last_layer_activation": "softmax",
- "text_size": 36,
- "coef_reg_cnn": 1e-3,
- "coef_reg_den": 1e-2,
- "dropout_rate": 0.5,
- "dense_size": 300,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "top_n": 5
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": ["y_pred_probas", "y_pred_labels"]
- },
- "train": {
- "epochs": 1000,
- "batch_size": 64,
- "metrics": [
- "sets_accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 5,
- "log_every_n_epochs": 5,
- "show_examples": false,
- "evaluation_targets": ["train", "valid", "test"],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/kbqa_mix_lowercase/relation_prediction"
- },
- "labels": {
- "telegram_utils": "IntentModel",
- "server_utils": "KerasIntentModel"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/relation_prediction_rus.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/relation_prediction_rus.tar.gz",
- "subdir": "{MODELS_PATH}/kbqa_mix_lowercase/relation_prediction"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/ru_obscenity_classifier.json b/deeppavlov/configs/classifiers/ru_obscenity_classifier.json
deleted file mode 100644
index 2344a71b87..0000000000
--- a/deeppavlov/configs/classifiers/ru_obscenity_classifier.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "chainer": {
- "in": [
- "text"
- ],
- "pipe": [
- {
- "class_name": "ru_obscenity_classifier",
- "data_path": "{DOWNLOADS_PATH}/obscenity_dataset/",
- "in": "text",
- "out": "flags_obscenity_or_not"
- }
- ],
- "out": [
- "flags_obscenity_or_not"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/models/obscenity_classifier/ru_obscenity_dataset.zip",
- "subdir": "{DOWNLOADS_PATH}/obscenity_dataset"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/classifiers/rusentiment_bigru_superconv.json b/deeppavlov/configs/classifiers/rusentiment_bigru_superconv.json
deleted file mode 100644
index ceff4b647a..0000000000
--- a/deeppavlov/configs/classifiers/rusentiment_bigru_superconv.json
+++ /dev/null
@@ -1,165 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "text",
- "y": "label",
- "data_path": "{DOWNLOADS_PATH}/rusentiment/",
- "train": "rusentiment_random_posts.csv",
- "test": "rusentiment_test.csv"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42,
- "field_to_split": "train",
- "split_seed": 23,
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_prep"
- ],
- "class_name": "dirty_comments_preprocessor",
- "remove_punctuation": false
- },
- {
- "in": "x_prep",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin",
- "dim": 300,
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "units_gru": 256,
- "optimizer": "Adam",
- "learning_rate": 0.001,
- "learning_rate_decay": "trapezoid",
- "learning_rate_decay_batches": 10000,
- "fit_batch_size": 64,
- "fit_on": ["x_emb", "y_onehot"],
- "momentum": [0.95, 0.55],
- "momentum_decay": "trapezoid",
- "momentum_decay_batches": 10000,
- "loss": "categorical_crossentropy",
- "last_layer_activation": "softmax",
- "coef_reg_gru": 1e-6,
- "coef_reg_den": 1e-6,
- "dropout_rate": 0.2,
- "rec_dropout_rate": 0.2,
- "dense_size": 100,
- "model_name": "bigru_with_max_aver_pool_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "f1_weighted",
- "f1_macro",
- "accuracy",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "validate_best": true,
- "test_best": true,
- "tensorboard_log_dir": "{MODEL_PATH}/logs"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_v14"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_v14.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/rusentiment_cnn.json b/deeppavlov/configs/classifiers/rusentiment_cnn.json
deleted file mode 100644
index 0706d803e7..0000000000
--- a/deeppavlov/configs/classifiers/rusentiment_cnn.json
+++ /dev/null
@@ -1,167 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "text",
- "y": "label",
- "data_path": "{DOWNLOADS_PATH}/rusentiment/",
- "train": "rusentiment_random_posts.csv",
- "test": "rusentiment_test.csv"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42,
- "field_to_split": "train",
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_prep"
- ],
- "class_name": "dirty_comments_preprocessor"
- },
- {
- "in": "x_prep",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": [0.01, 1e-4],
- "learning_rate_decay": "exponential",
- "learning_rate_decay_batches": 5000,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 5.0,
- "loss": "binary_crossentropy",
- "last_layer_activation": "softmax",
- "coef_reg_cnn": 1e-3,
- "coef_reg_den": 1e-2,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "f1_weighted",
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "tensorboard_log_dir": "{MODEL_PATH}/logs",
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_v3"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_v3.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/rusentiment_elmo_twitter_cnn.json b/deeppavlov/configs/classifiers/rusentiment_elmo_twitter_cnn.json
deleted file mode 100644
index 1418b30dc2..0000000000
--- a/deeppavlov/configs/classifiers/rusentiment_elmo_twitter_cnn.json
+++ /dev/null
@@ -1,170 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "text",
- "y": "label",
- "data_path": "{DOWNLOADS_PATH}/rusentiment/",
- "train": "rusentiment_random_posts.csv",
- "test": "rusentiment_test.csv"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42,
- "field_to_split": "train",
- "split_seed": 23,
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_prep"
- ],
- "class_name": "dirty_comments_preprocessor",
- "remove_punctuation": false
- },
- {
- "in": "x_prep",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": [
- "x_tok"
- ],
- "out": [
- "x_emb"
- ],
- "id": "my_embedder",
- "class_name": "elmo_embedder",
- "elmo_output_names": [
- "elmo"
- ],
- "mini_batch_size": 32,
- "spec": "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "categorical_crossentropy",
- "last_layer_activation": "softmax",
- "coef_reg_cnn": 1e-3,
- "coef_reg_den": 1e-2,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 128,
- "metrics": [
- "f1_weighted",
- "f1_macro",
- "accuracy",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "tensorboard_log_dir": "{MODEL_PATH}/logs",
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_v10"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_v10.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
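The iterator in the config above carves a validation set out of `train` via `field_to_split`, `split_proportions: [0.9, 0.1]`, and a fixed `split_seed`. A stand-alone sketch of that deterministic split (illustration only, not the iterator's exact code):

```python
import random

def split_field(rows, proportions=(0.9, 0.1), split_seed=23):
    """Deterministically split one data field into train/valid parts,
    mirroring basic_classification_iterator's field_to_split behaviour."""
    rows = list(rows)
    random.Random(split_seed).shuffle(rows)  # fixed seed -> reproducible split
    cut = int(len(rows) * proportions[0])
    return rows[:cut], rows[cut:]

train, valid = split_field(range(100))
print(len(train), len(valid))  # 90 10
```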
diff --git a/deeppavlov/configs/classifiers/sentiment_imdb_bert.json b/deeppavlov/configs/classifiers/sentiment_imdb_bert.json
deleted file mode 100644
index 8e62aefe8c..0000000000
--- a/deeppavlov/configs/classifiers/sentiment_imdb_bert.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "imdb_reader",
- "data_path": "{DOWNLOADS_PATH}/aclImdb"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42,
- "split_seed": 23,
- "field_to_split": "train",
- "stratify": true,
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 450,
- "in": [
- "x"
- ],
- "out": [
- "bert_features"
- ]
- },
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "bert_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "one_hot_labels": true,
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 1e-05,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ]
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "batch_size": 8,
- "epochs": 100,
- "metrics": [
- "f1_weighted",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- }
- ],
- "show_examples": false,
- "pytest_max_batches": 2,
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_imdb_bert_v0/"
- },
- "labels": {
- "telegram_utils": "IntentModel",
- "server_utils": "KerasIntentModel"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- }
- ]
- }
-}
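Every `pipe` entry in these configs is wired purely by its `in`/`out` names: each component reads named slots from a shared memory and writes its outputs back under new names. A toy sketch of that dataflow, with plain callables standing in for real components:

```python
# Toy dataflow in the spirit of a chainer "pipe"; the real chainer builds
# components from class_name and handles batching, this only shows the wiring.
pipe = [
    {"in": ["x"], "out": ["x_lower"], "fn": lambda x: x.lower()},
    {"in": ["x_lower"], "out": ["x_tok"], "fn": lambda x: x.split()},
    {"in": ["x_tok"], "out": ["n_tokens"], "fn": len},
]

def run_pipe(pipe, **inputs):
    memory = dict(inputs)
    for step in pipe:
        args = [memory[name] for name in step["in"]]
        result = step["fn"](*args)
        for name in step["out"]:
            memory[name] = result
    return memory

print(run_pipe(pipe, x="This Movie Was Great")["n_tokens"])  # 4
```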
diff --git a/deeppavlov/configs/classifiers/sentiment_imdb_conv_bert.json b/deeppavlov/configs/classifiers/sentiment_imdb_conv_bert.json
deleted file mode 100644
index 4e1a1287b5..0000000000
--- a/deeppavlov/configs/classifiers/sentiment_imdb_conv_bert.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "imdb_reader",
- "data_path": "{DOWNLOADS_PATH}/aclImdb"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42,
- "split_seed": 23,
- "field_to_split": "train",
- "stratify": true,
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 450,
- "in": [
- "x"
- ],
- "out": [
- "bert_features"
- ]
- },
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "bert_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "one_hot_labels": true,
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 1e-05,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ]
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "batch_size": 8,
- "epochs": 100,
- "metrics": [
- "f1_weighted",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- }
- ],
- "show_examples": false,
- "pytest_max_batches": 2,
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_imdb_conv_bert_v0/"
- },
- "labels": {
- "telegram_utils": "IntentModel",
- "server_utils": "KerasIntentModel"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- }
- ]
- }
-}
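`one_hotter` with `"depth": "#classes_vocab.len"` expands class ids into vectors sized by the fitted vocabulary, which is what the classifier's `one_hot_labels: true` expects. The operation itself is a few lines:

```python
import numpy as np

def one_hot(y_ids, depth):
    """Single-vector one-hot encoding, as produced by a one_hotter whose
    depth comes from the fitted classes vocabulary."""
    out = np.zeros((len(y_ids), depth), dtype=np.float32)
    out[np.arange(len(y_ids)), y_ids] = 1.0
    return out

print(one_hot([0, 2, 1], depth=3))
```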
diff --git a/deeppavlov/configs/classifiers/sentiment_sst_multi_bert.json b/deeppavlov/configs/classifiers/sentiment_sst_multi_bert.json
deleted file mode 100644
index 95a46ad544..0000000000
--- a/deeppavlov/configs/classifiers/sentiment_sst_multi_bert.json
+++ /dev/null
@@ -1,135 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "text",
- "y": "fine_grained_label",
- "data_path": "{DOWNLOADS_PATH}/stanfordSentimentTreebank",
- "train": "train_fine_grained.csv",
- "valid": "valid_fine_grained.csv",
- "test": "test_fine_grained.csv"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{MODEL_PATH}/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 64,
- "in": [
- "x"
- ],
- "out": [
- "bert_features"
- ]
- },
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "bert_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "one_hot_labels": true,
- "bert_config_file": "{MODEL_PATH}/bert_config.json",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 1e-05,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ]
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- },
- "f1_macro"
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer",
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_sst_bert_v1"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/stanfordSentimentTreebank.zip",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/sentiment_sst_bert_v1.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
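The last two pipe steps are the same across these classifier configs: `proba2labels` with `max_proba: true` takes an argmax over the probability vector, and the `"ref": "classes_vocab"` step maps the winning id back to a label string. Roughly, with illustrative fine-grained labels:

```python
import numpy as np

classes = ["0", "1", "2", "3", "4"]  # fine-grained SST labels, for illustration

def proba2labels(y_pred_probas):
    """max_proba=true: pick the single most likely class id per sample."""
    return np.argmax(y_pred_probas, axis=-1)

probas = np.array([[0.1, 0.2, 0.5, 0.1, 0.1],
                   [0.7, 0.1, 0.1, 0.05, 0.05]])
ids = proba2labels(probas)
print([classes[i] for i in ids])  # ['2', '0']
```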
diff --git a/deeppavlov/configs/classifiers/sentiment_twitter_bert_emb.json b/deeppavlov/configs/classifiers/sentiment_twitter_bert_emb.json
deleted file mode 100644
index 6a4fb9756a..0000000000
--- a/deeppavlov/configs/classifiers/sentiment_twitter_bert_emb.json
+++ /dev/null
@@ -1,144 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "Twit",
- "y": "Class",
- "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "class_name": "transformers_bert_preprocessor",
- "vocab_file": "{BERT_PATH}/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 512,
- "in": ["x"],
- "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"]
- },
- {
- "class_name": "transformers_bert_embedder",
- "id": "my_embedder",
- "bert_config_path": "{BERT_PATH}/bert_config.json",
- "truncate": false,
- "load_path": "{BERT_PATH}",
- "in": ["subword_tok_ids", "startofword_markers", "attention_mask"],
- "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"]
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "word_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "last_layer_activation": "softmax",
- "coef_reg_cnn": 1e-3,
- "coef_reg_den": 1e-2,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_twitter_bert_emb",
- "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_pt"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- }
- ]
- }
-}
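`validation_patience: 5` in the `train` blocks is plain early stopping: training halts after five validations in a row without improvement. A compact sketch of that loop:

```python
def train_with_patience(evaluate, max_epochs=100, patience=5):
    """Stop after `patience` consecutive validations without improvement,
    as the trainer's validation_patience setting does."""
    best, bad_rounds = float("-inf"), 0
    for epoch in range(max_epochs):
        score = evaluate(epoch)  # e.g. validation accuracy
        if score > best:
            best, bad_rounds = score, 0
        else:
            bad_rounds += 1
            if bad_rounds >= patience:
                break
    return best

# Toy metric that stops improving after epoch 10.
print(train_with_patience(lambda e: min(e, 10) / 10))  # 1.0
```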
diff --git a/deeppavlov/configs/classifiers/sentiment_twitter_preproc.json b/deeppavlov/configs/classifiers/sentiment_twitter_preproc.json
deleted file mode 100644
index 2d6a250958..0000000000
--- a/deeppavlov/configs/classifiers/sentiment_twitter_preproc.json
+++ /dev/null
@@ -1,159 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "Twit",
- "y": "Class",
- "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_prep"
- ],
- "class_name": "dirty_comments_preprocessor",
- "delete_smile_brackets": true
- },
- {
- "in": "x_prep",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "last_layer_activation": "softmax",
- "coef_reg_cnn": 1e-3,
- "coef_reg_den": 1e-2,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_twitter_v7"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/sentiment_twitter_v7.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
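Values like `"#my_embedder.dim"` and `"#classes_vocab.len"` are references, resolved at build time to an attribute of a previously constructed pipe component. A hypothetical resolver for that syntax — the real config parser works against instantiated components:

```python
# Hypothetical resolution of "#component_id.attribute" references.
class Embedder:
    dim = 300

class Vocab:
    len = 5

components = {"my_embedder": Embedder(), "classes_vocab": Vocab()}

def resolve_ref(value, components):
    if isinstance(value, str) and value.startswith("#"):
        comp_id, attr = value[1:].split(".", 1)
        return getattr(components[comp_id], attr)
    return value

print(resolve_ref("#my_embedder.dim", components))    # 300
print(resolve_ref("#classes_vocab.len", components))  # 5
```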
diff --git a/deeppavlov/configs/classifiers/sentiment_yelp_conv_bert.json b/deeppavlov/configs/classifiers/sentiment_yelp_conv_bert.json
deleted file mode 100644
index f1b1a40561..0000000000
--- a/deeppavlov/configs/classifiers/sentiment_yelp_conv_bert.json
+++ /dev/null
@@ -1,149 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "text",
- "y": "label",
- "data_path": "{DOWNLOADS_PATH}/yelp_review_full_csv",
- "train": "train.csv",
- "test": "test.csv",
- "header": null,
- "names": [
- "label",
- "text"
- ]
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42,
- "split_seed": 23,
- "field_to_split": "train",
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{MODEL_PATH}/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 256,
- "in": [
- "x"
- ],
- "out": [
- "bert_features"
- ]
- },
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "bert_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "one_hot_labels": true,
- "bert_config_file": "{MODEL_PATH}/bert_config.json",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 1e-05,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ]
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 16,
- "metrics": [
- "accuracy",
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- },
- "f1_macro"
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer",
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_yelp_bert_v2"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/yelp_review_full_csv.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/sentiment_yelp_bert_v2.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/sentiment_yelp_multi_bert.json b/deeppavlov/configs/classifiers/sentiment_yelp_multi_bert.json
deleted file mode 100644
index d18dab7b05..0000000000
--- a/deeppavlov/configs/classifiers/sentiment_yelp_multi_bert.json
+++ /dev/null
@@ -1,149 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "text",
- "y": "label",
- "data_path": "{DOWNLOADS_PATH}/yelp_review_full_csv",
- "train": "train.csv",
- "test": "test.csv",
- "header": null,
- "names": [
- "label",
- "text"
- ]
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42,
- "split_seed": 23,
- "field_to_split": "train",
- "split_fields": [
- "train",
- "valid"
- ],
- "split_proportions": [
- 0.9,
- 0.1
- ]
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{MODEL_PATH}/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 200,
- "in": [
- "x"
- ],
- "out": [
- "bert_features"
- ]
- },
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "bert_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "one_hot_labels": true,
- "bert_config_file": "{MODEL_PATH}/bert_config.json",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 1e-05,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ]
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 16,
- "metrics": [
- "accuracy",
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- },
- "f1_macro"
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer",
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_yelp_bert_v1"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/yelp_review_full_csv.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/sentiment_yelp_bert_v1.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/sst_torch_swcnn.json b/deeppavlov/configs/classifiers/sst_torch_swcnn.json
deleted file mode 100644
index 9709d333fd..0000000000
--- a/deeppavlov/configs/classifiers/sst_torch_swcnn.json
+++ /dev/null
@@ -1,148 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "torchtext_classification_data_reader",
- "data_path": "{DOWNLOADS_PATH}",
- "dataset_title": "SST"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator",
- "seed": 42
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "level": "token",
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": "x",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_ids"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "torch_text_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "model_name": "cnn_model",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 128,
- "dropout_rate": 0.5,
- "dense_size": 64,
- "optimizer": "SGD",
- "optimizer_parameters": {
- "lr": 0.0001,
- "momentum": 0.9,
- "weight_decay": 0.0001
- },
- "lr_scheduler": "CyclicLR",
- "lr_scheduler_parameters": {
- "base_lr": 0.0001,
- "max_lr": 0.001
- },
- "loss": "CrossEntropyLoss"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "class_name": "torch_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/sst_torch_v0"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/sst_torch_v0.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
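The `sst_torch_swcnn` config pairs SGD with momentum and a `CyclicLR` schedule; its `optimizer_parameters` and `lr_scheduler_parameters` are passed through to the corresponding `torch.optim` classes. A minimal PyTorch equivalent with the same hyperparameters and a toy model standing in for the CNN:

```python
import torch

model = torch.nn.Linear(300, 5)  # stand-in for the shallow-and-wide CNN
optimizer = torch.optim.SGD(model.parameters(),
                            lr=0.0001, momentum=0.9, weight_decay=0.0001)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer,
                                              base_lr=0.0001, max_lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss()

x, y = torch.randn(8, 300), torch.randint(0, 5, (8,))
for _ in range(3):  # a few toy steps
    optimizer.zero_grad()
    loss_fn(model(x), y).backward()
    optimizer.step()
    scheduler.step()  # cycle the learning rate between base_lr and max_lr
print(optimizer.param_groups[0]["lr"])
```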
diff --git a/deeppavlov/configs/classifiers/topic_ag_news.json b/deeppavlov/configs/classifiers/topic_ag_news.json
deleted file mode 100644
index 0e56578b55..0000000000
--- a/deeppavlov/configs/classifiers/topic_ag_news.json
+++ /dev/null
@@ -1,154 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "basic_classification_reader",
- "x": "text",
- "y": "label",
- "data_path": "{DOWNLOADS_PATH}/ag_news_data"
- },
- "dataset_iterator": {
- "class_name": "basic_classification_iterator"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_lower"
- ],
- "class_name": "str_lower"
- },
- {
- "in": "x_lower",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_tokenizer",
- "tokenizer": "wordpunct_tokenize"
- },
- {
- "in": "x_tok",
- "out": "x_emb",
- "id": "my_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "kernel_sizes_cnn": [
- 3,
- 5,
- 7
- ],
- "filters_cnn": 256,
- "optimizer": "Adam",
- "learning_rate": 0.01,
- "learning_rate_decay": 0.1,
- "loss": "binary_crossentropy",
- "coef_reg_cnn": 1e-4,
- "coef_reg_den": 1e-4,
- "dropout_rate": 0.5,
- "dense_size": 100,
- "last_layer_activation": "softmax",
- "model_name": "cnn_model"
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- "accuracy",
- "f1_macro",
- {
- "name": "roc_auc",
- "inputs": ["y_onehot", "y_pred_probas"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/topic_ag_news_v3"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/ag_news_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/topic_ag_news_v3.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/yahoo_convers_vs_info.json b/deeppavlov/configs/classifiers/yahoo_convers_vs_info.json
deleted file mode 100644
index ec1cd427b0..0000000000
--- a/deeppavlov/configs/classifiers/yahoo_convers_vs_info.json
+++ /dev/null
@@ -1,167 +0,0 @@
-{
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": "y",
- "out": "y_ids"
- },
- {
- "in": [
- "x"
- ],
- "out": [
- "x_prep"
- ],
- "class_name": "dirty_comments_preprocessor",
- "remove_punctuation": false
- },
- {
- "in": "x_prep",
- "out": "x_tok",
- "id": "my_tokenizer",
- "class_name": "nltk_moses_tokenizer"
- },
- {
- "in": [
- "x_tok"
- ],
- "out": [
- "x_emb"
- ],
- "id": "my_embedder",
- "class_name": "elmo_embedder",
- "elmo_output_names": [
- "elmo"
- ],
- "mini_batch_size": 32,
- "spec": "{DOWNLOADS_PATH}/embeddings/yahooo-sber-questions_epoches_n_15/",
- "pad_zero": true
- },
- {
- "in": "y_ids",
- "out": "y_onehot",
- "class_name": "one_hotter",
- "id": "my_one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "in": [
- "x_emb"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ],
- "main": true,
- "class_name": "keras_classification_model",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embedding_size": "#my_embedder.dim",
- "n_classes": "#classes_vocab.len",
- "units_gru": 512,
- "optimizer": "Adam",
- "learning_rate": 0.001,
- "learning_rate_decay": 0.001,
- "loss": "categorical_crossentropy",
- "coef_reg_gru": 1e-4,
- "coef_reg_den": 1e-4,
- "dropout_rate": 0.5,
- "rec_dropout_rate": 0.5,
- "dense_size": 100,
- "model_name": "bigru_with_max_aver_pool_model",
- "last_layer_activation": "softmax",
- "restore_lr": false
- },
- {
- "in": "y_pred_probas",
- "out": "y_pred_ids",
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": "y_pred_ids",
- "out": "y_pred_labels",
- "ref": "classes_vocab"
- },
- {
- "ref": "my_one_hotter",
- "in": "y_pred_ids",
- "out": "y_pred_onehot"
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 32,
- "metrics": [
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_pred_labels"
- ]
- },
- {
- "name": "f1_macro",
- "inputs": [
- "y",
- "y_pred_labels"
- ]
- }
- ],
- "validation_patience": 20,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": true,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "tensorboard_log_dir": "{MODEL_PATH}/",
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/yahoo_convers_vs_info_v2"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/embeddings/yahooo-sber-questions_epoches_n_15.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/embeddings/yahooo-sber-questions_epoches_n_15/"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/yahoo_convers_vs_info_v2.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers/"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/classifiers/yahoo_convers_vs_info_bert.json b/deeppavlov/configs/classifiers/yahoo_convers_vs_info_bert.json
deleted file mode 100644
index 7b0e79994d..0000000000
--- a/deeppavlov/configs/classifiers/yahoo_convers_vs_info_bert.json
+++ /dev/null
@@ -1,160 +0,0 @@
-{
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 64,
- "in": [
- "x"
- ],
- "out": [
- "bert_features"
- ]
- },
- {
- "id": "classes_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "save_path": "{MODEL_PATH}/classes.dict",
- "load_path": "{MODEL_PATH}/classes.dict",
- "in": [
- "y"
- ],
- "out": [
- "y_ids"
- ]
- },
- {
- "in": [
- "y_ids"
- ],
- "out": [
- "y_onehot"
- ],
- "class_name": "one_hotter",
- "id": "my_one_hotter",
- "depth": "#classes_vocab.len",
- "single_vector": true
- },
- {
- "class_name": "bert_classifier",
- "n_classes": "#classes_vocab.len",
- "return_probas": true,
- "one_hot_labels": true,
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "keep_prob": 0.5,
- "learning_rate": 1e-05,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "y_onehot"
- ],
- "out": [
- "y_pred_probas"
- ]
- },
- {
- "in": [
- "y_pred_probas"
- ],
- "out": [
- "y_pred_ids"
- ],
- "class_name": "proba2labels",
- "max_proba": true
- },
- {
- "in": [
- "y_pred_ids"
- ],
- "out": [
- "y_pred_labels"
- ],
- "ref": "classes_vocab"
- },
- {
- "ref": "my_one_hotter",
- "in": [
- "y_pred_ids"
- ],
- "out": [
- "y_pred_onehot"
- ]
- }
- ],
- "out": [
- "y_pred_labels"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- {
- "name": "roc_auc",
- "inputs": [
- "y_onehot",
- "y_pred_probas"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_pred_labels"
- ]
- },
- {
- "name": "f1_macro",
- "inputs": [
- "y",
- "y_pred_labels"
- ]
- }
- ],
- "validation_patience": 20,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "evaluation_targets": [
- "train",
- "valid"
- ],
- "tensorboard_log_dir": "{MODEL_PATH}/",
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/classifiers/yahoo_convers_vs_info_v3"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/yahoo_convers_vs_info_v3.tar.gz",
- "subdir": "{MODELS_PATH}/classifiers/"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/doc_retrieval/en_ranker_pop_enwiki20180211.json b/deeppavlov/configs/doc_retrieval/en_ranker_pop_enwiki20180211.json
index 20402495c1..24c4d566fe 100644
--- a/deeppavlov/configs/doc_retrieval/en_ranker_pop_enwiki20180211.json
+++ b/deeppavlov/configs/doc_retrieval/en_ranker_pop_enwiki20180211.json
@@ -56,7 +56,7 @@
{
"class_name": "pop_ranker",
"pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki20180211_popularities.json",
- "load_path": "{MODELS_PATH}/odqa/logreg_3features.joblib",
+ "load_path": "{MODELS_PATH}/odqa/logreg_3features_v2.joblib",
"top_n": 10,
"in": ["tfidf_doc_ids", "tfidf_doc_scores"],
"out": ["pop_doc_ids", "pop_doc_scores"]
@@ -88,8 +88,8 @@
"subdir": "{DOWNLOADS_PATH}"
},
{
- "url": "http://files.deeppavlov.ai/deeppavlov_data/pop_ranker.tar.gz",
- "subdir": "{MODELS_PATH}"
+ "url": "http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib",
+ "subdir": "{MODELS_PATH}/odqa"
}
]
}
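The `pop_ranker` change above swaps the bundled `pop_ranker.tar.gz` archive for a direct download of `logreg_3features_v2.joblib`. Artifacts like this are pickled scikit-learn estimators; assuming the file has been downloaded to the path the config resolves, loading it is a single `joblib` call:

```python
from pathlib import Path
import joblib

# Path as resolved from the config's "{MODELS_PATH}/odqa/..." setting;
# requires the download step to have run first.
model_path = Path("~/.deeppavlov/models/odqa/logreg_3features_v2.joblib").expanduser()
reranker = joblib.load(model_path)  # a fitted sklearn estimator
print(type(reranker).__name__)
```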
diff --git a/deeppavlov/configs/doc_retrieval/en_ranker_tfidf_enwiki20161221.json b/deeppavlov/configs/doc_retrieval/en_ranker_tfidf_enwiki20161221.json
deleted file mode 100644
index 4fe8f63780..0000000000
--- a/deeppavlov/configs/doc_retrieval/en_ranker_tfidf_enwiki20161221.json
+++ /dev/null
@@ -1,80 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "odqa_reader",
- "data_path": "{DOWNLOADS_PATH}/odqa/enwiki20161221",
- "save_path": "{DOWNLOADS_PATH}/odqa/enwiki20161221.db",
- "dataset_format": "wiki"
- },
- "dataset_iterator": {
- "class_name": "sqlite_iterator",
- "shuffle": false,
- "load_path": "{DOWNLOADS_PATH}/odqa/enwiki20161221.db"
- },
- "chainer": {
- "in": [
- "docs"
- ],
- "in_y": [
- "doc_ids",
- "doc_nums"
- ],
- "out": [
- "tfidf_doc_ids"
- ],
- "pipe": [
- {
- "class_name": "hashing_tfidf_vectorizer",
- "id": "vectorizer",
- "fit_on": [
- "docs",
- "doc_ids",
- "doc_nums"
- ],
- "save_path": "{MODELS_PATH}/odqa/enwiki20161221_tfidf_matrix.npz",
- "load_path": "{MODELS_PATH}/odqa/enwiki20161221_tfidf_matrix.npz",
- "tokenizer": {
- "class_name": "stream_spacy_tokenizer",
- "lemmas": true,
- "ngram_range": [
- 1,
- 2
- ]
- }
- },
- {
- "class_name": "tfidf_ranker",
- "top_n": 25,
- "in": [
- "docs"
- ],
- "out": [
- "tfidf_doc_ids",
- "tfidf_doc_scores"
- ],
- "vectorizer": "#vectorizer"
- }
- ]
- },
- "train": {
- "batch_size": 10000,
- "evaluation_targets": [],
- "class_name": "fit_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki20161221.tar.gz",
- "subdir": "{DOWNLOADS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/en_odqa_enwiki20161221.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
\ No newline at end of file
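The deleted enwiki20161221 ranker is the same TF-IDF recipe as the surviving enwiki20180211 one: vectorize all documents once with unigram and bigram features, then return the `top_n` highest-scoring doc ids for a query. A compact scikit-learn analogue — a dense `TfidfVectorizer` rather than the hashing vectorizer the config uses:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "dogs and cats are pets",
        "wikipedia is an encyclopedia"]

vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # unigrams+bigrams, as in the config
doc_matrix = vectorizer.fit_transform(docs)

def rank(query, top_n=2):
    """Score every document against the query and return top_n doc ids."""
    scores = (doc_matrix @ vectorizer.transform([query]).T).toarray().ravel()
    return np.argsort(scores)[::-1][:top_n]

print(rank("cat pets"))
```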
diff --git a/deeppavlov/configs/embedder/bert_sentence_embedder.json b/deeppavlov/configs/embedder/bert_sentence_embedder.json
index 348616ae27..b8c29a67f2 100644
--- a/deeppavlov/configs/embedder/bert_sentence_embedder.json
+++ b/deeppavlov/configs/embedder/bert_sentence_embedder.json
@@ -12,7 +12,7 @@
},
{
"class_name": "transformers_bert_embedder",
- "bert_config_path": "{BERT_PATH}/bert_config.json",
+ "bert_config_path": "{BERT_PATH}/config.json",
"load_path": "{BERT_PATH}",
"truncate": false,
"in": ["subword_tok_ids", "startofword_markers", "attention_mask"],
@@ -26,12 +26,12 @@
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/sentence_multi_cased_L-12_H-768_A-12_pt"
+ "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/sentence_multi_cased_L-12_H-768_A-12_pt_v1"
},
"labels": {},
"download": [
{
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt.tar.gz",
+ "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz",
"subdir": "{DOWNLOADS_PATH}/bert_models"
}
]
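The `bert_sentence_embedder` fix above renames the config file the embedder reads (`bert_config.json` → `config.json`, the filename Hugging Face `transformers` expects) and points at a repackaged `_v1` archive. For comparison, a plain `transformers` sketch of sentence embedding with a multilingual cased BERT — the hub model name here is an assumption, standing in for the packaged checkpoint:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Public hub checkpoint, assumed as a stand-in for the packaged
# sentence_multi_cased_L-12_H-768_A-12_pt_v1 archive.
name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)

batch = tokenizer(["Hello world"], return_tensors="pt")
with torch.no_grad():
    hidden = model(**batch).last_hidden_state  # (1, seq_len, 768)
sentence_emb = hidden.mean(dim=1)              # mean pooling, one common choice
print(sentence_emb.shape)                      # torch.Size([1, 768])
```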
diff --git a/deeppavlov/configs/embedder/elmo_en_1billion.json b/deeppavlov/configs/embedder/elmo_en_1billion.json
deleted file mode 100644
index c79d4908af..0000000000
--- a/deeppavlov/configs/embedder/elmo_en_1billion.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
- "chainer": {
- "in": [
- "sentences"
- ],
- "pipe": [
- {
- "in": ["sentences"],
- "class_name": "lazy_tokenizer",
- "out": ["tokens"]
- },
- {
- "class_name": "elmo_embedder",
- "elmo_output_names": ["lstm_outputs1", "lstm_outputs2", "word_emb"],
- "mini_batch_size": 32,
- "in": [
- "tokens"
- ],
- "spec": "https://tfhub.dev/google/elmo/2",
- "out": [
- "tokens_emb"
- ]
- }
- ],
- "out": [
- "tokens_emb"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- }
- }
-}
diff --git a/deeppavlov/configs/embedder/elmo_ru_news.json b/deeppavlov/configs/embedder/elmo_ru_news.json
deleted file mode 100644
index 86d78bfe52..0000000000
--- a/deeppavlov/configs/embedder/elmo_ru_news.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
- "chainer": {
- "in": [
- "sentences"
- ],
- "pipe": [
- {
- "in": ["sentences"],
- "class_name": "lazy_tokenizer",
- "out": ["tokens"]
- },
- {
- "class_name": "elmo_embedder",
- "elmo_output_names": ["lstm_outputs1", "lstm_outputs2", "word_emb"],
- "mini_batch_size": 32,
- "in": [
- "tokens"
- ],
- "spec": "{DOWNLOADS_PATH}/embeddings/elmo_ru_news",
- "out": [
- "tokens_emb"
- ]
- }
- ],
- "out": [
- "tokens_emb"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/embeddings/elmo_ru_news"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/embedder/elmo_ru_twitter.json b/deeppavlov/configs/embedder/elmo_ru_twitter.json
deleted file mode 100644
index df4c6013d4..0000000000
--- a/deeppavlov/configs/embedder/elmo_ru_twitter.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
- "chainer": {
- "in": [
- "sentences"
- ],
- "pipe": [
- {
- "in": ["sentences"],
- "class_name": "lazy_tokenizer",
- "out": ["tokens"]
- },
- {
- "class_name": "elmo_embedder",
- "elmo_output_names": ["lstm_outputs1", "lstm_outputs2", "word_emb"],
- "mini_batch_size": 32,
- "in": [
- "tokens"
- ],
- "spec": "{DOWNLOADS_PATH}/embeddings/elmo_ru_tw",
- "out": [
- "tokens_emb"
- ]
- }
- ],
- "out": [
- "tokens_emb"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/embeddings/elmo_ru_tw"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/embedder/elmo_ru_wiki.json b/deeppavlov/configs/embedder/elmo_ru_wiki.json
deleted file mode 100644
index f234430e6f..0000000000
--- a/deeppavlov/configs/embedder/elmo_ru_wiki.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
- "chainer": {
- "in": [
- "sentences"
- ],
- "pipe": [
- {
- "in": ["sentences"],
- "class_name": "lazy_tokenizer",
- "out": ["tokens"]
- },
- {
- "class_name": "elmo_embedder",
- "elmo_output_names": ["lstm_outputs1", "lstm_outputs2", "word_emb"],
- "mini_batch_size": 32,
- "in": [
- "tokens"
- ],
- "spec": "{DOWNLOADS_PATH}/embeddings/elmo_ru_wiki",
- "out": [
- "tokens_emb"
- ]
- }
- ],
- "out": [
- "tokens_emb"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-wiki_600k_steps.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/embeddings/elmo_ru_wiki"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/faq/tfidf_logreg_autofaq.json b/deeppavlov/configs/faq/tfidf_logreg_autofaq.json
index 9e2516fceb..a41ada103a 100644
--- a/deeppavlov/configs/faq/tfidf_logreg_autofaq.json
+++ b/deeppavlov/configs/faq/tfidf_logreg_autofaq.json
@@ -64,8 +64,8 @@
],
"class_name": "sklearn_component",
"main": true,
- "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v2.pkl",
- "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v2.pkl",
+ "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v4.pkl",
+ "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v4.pkl",
"model_class": "sklearn.linear_model:LogisticRegression",
"infer_method": "predict_proba",
"C": 1000,
@@ -100,7 +100,7 @@
},
"download": [
{
- "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v2.pkl",
+ "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v4.pkl",
"subdir": "{MODELS_PATH}/faq"
},
{
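Under the `_v2` → `_v4` version bumps, `tfidf_logreg_autofaq` is a two-stage sklearn model: a `TfidfVectorizer` with `infer_method: transform` feeding a `LogisticRegression` with `C=1000` and `infer_method: predict_proba`. The same pipeline stand-alone, on made-up FAQ data:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

questions = ["when do classes start", "how do I pay tuition",
             "when is the deadline", "where is the campus"]
answer_ids = [0, 1, 0, 2]  # each question maps to a canned answer

faq = make_pipeline(TfidfVectorizer(), LogisticRegression(C=1000))
faq.fit(questions, answer_ids)
print(faq.predict_proba(["what time do lessons begin"]).round(2))
```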
diff --git a/deeppavlov/configs/faq/tfidf_logreg_en_faq.json b/deeppavlov/configs/faq/tfidf_logreg_en_faq.json
index 6146bbb295..8abccda06e 100644
--- a/deeppavlov/configs/faq/tfidf_logreg_en_faq.json
+++ b/deeppavlov/configs/faq/tfidf_logreg_en_faq.json
@@ -36,8 +36,8 @@
],
"id": "tfidf_vec",
"class_name": "sklearn_component",
- "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/tfidf.pkl",
- "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/tfidf.pkl",
+ "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/tfidf.pkl",
+ "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/tfidf.pkl",
"model_class": "sklearn.feature_extraction.text:TfidfVectorizer",
"infer_method": "transform"
},
@@ -47,8 +47,8 @@
"fit_on": [
"y"
],
- "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/en_mipt_answers.dict",
- "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/en_mipt_answers.dict",
+ "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/en_mipt_answers.dict",
+ "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/en_mipt_answers.dict",
"in": "y",
"out": "y_ids"
},
@@ -63,8 +63,8 @@
],
"class_name": "sklearn_component",
"main": true,
- "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/logreg.pkl",
- "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/logreg.pkl",
+ "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/logreg.pkl",
+ "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v5/logreg.pkl",
"model_class": "sklearn.linear_model:LogisticRegression",
"infer_method": "predict_proba",
"C": 1000,
@@ -99,7 +99,7 @@
},
"download": [
{
- "url": "http://files.deeppavlov.ai/faq/mipt/en_mipt_faq_v4.tar.gz",
+ "url": "http://files.deeppavlov.ai/faq/mipt/en_mipt_faq_v5.tar.gz",
"subdir": "{MODELS_PATH}/faq/mipt"
}
]
diff --git a/deeppavlov/configs/go_bot/database_dstc2.json b/deeppavlov/configs/go_bot/database_dstc2.json
deleted file mode 100644
index bcb153a6fd..0000000000
--- a/deeppavlov/configs/go_bot/database_dstc2.json
+++ /dev/null
@@ -1,44 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DOWNLOADS_PATH}/dstc2_v3"
- },
- "dataset_iterator": {
- "class_name": "dialog_db_result_iterator"
- },
- "chainer": {
- "in": ["db_result"],
- "in_y": [],
- "out": [],
- "pipe": [
- {
- "id": "restaurant_database",
- "class_name": "sqlite_database",
- "fit_on": ["db_result"],
- "table_name": "mytable",
- "primary_keys": ["name"],
- "save_path": "{DOWNLOADS_PATH}/dstc2_v3/resto.sqlite"
- }
- ]
- },
- "train": {
- "class_name": "fit_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/dstc2_v3.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/dstc2_v3"
- }
- ]
- }
-}
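The deleted `database_dstc2` config only materializes a SQLite table from DSTC2 `db_result` records, keyed on `name`. The stdlib equivalent of what `sqlite_database` persists, with illustrative restaurant rows:

```python
import sqlite3

conn = sqlite3.connect(":memory:")  # the config writes resto.sqlite instead
conn.execute("CREATE TABLE mytable (name TEXT PRIMARY KEY, food TEXT, area TEXT)")
# PRIMARY KEY on "name" deduplicates repeated api_call results.
conn.executemany("INSERT OR REPLACE INTO mytable VALUES (?, ?, ?)",
                 [("nirala", "indian", "north"), ("saigon", "asian", "centre")])
rows = conn.execute("SELECT name FROM mytable WHERE food = ?", ("indian",)).fetchall()
print(rows)  # [('nirala',)]
```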
diff --git a/deeppavlov/configs/go_bot/gobot_dstc2.json b/deeppavlov/configs/go_bot/gobot_dstc2.json
deleted file mode 100644
index 2611af6f05..0000000000
--- a/deeppavlov/configs/go_bot/gobot_dstc2.json
+++ /dev/null
@@ -1,125 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DATA_PATH}"
- },
- "dataset_iterator": {
- "class_name": "dialog_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "pipe": [
- {
- "class_name": "dialog_component_wrapper",
- "component": { "class_name": "split_tokenizer" },
- "in": ["x"],
- "out": ["x_tokens"]
- },
- {
- "id": "word_vocab",
- "class_name": "simple_vocab",
- "fit_on": ["x_tokens"],
- "save_path": "{MODEL_PATH}/word.dict",
- "load_path": "{MODEL_PATH}/word.dict"
- },
- {
- "class_name": "go_bot",
- "load_path": "{MODEL_PATH}/model",
- "save_path": "{MODEL_PATH}/model",
- "in": ["x"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "main": true,
- "debug": false,
- "learning_rate": 0.003,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 10.0,
- "momentum": 0.95,
- "optimizer": "tensorflow.train:AdamOptimizer",
- "clip_norm": 2.0,
- "dropout_rate": 0.4,
- "l2_reg_coef": 3e-4,
- "hidden_size": 128,
- "dense_size": 160,
- "word_vocab": "#word_vocab",
- "database": {
- "class_name": "sqlite_database",
- "table_name": "mytable",
- "primary_keys": ["name"],
- "save_path": "{DOWNLOADS_PATH}/dstc2_v3/resto.sqlite"
- },
- "nlg_manager": {
- "class_name": "gobot_nlg_manager",
- "template_path": "{DATA_PATH}/dstc2-templates.txt",
- "template_type": "DualTemplate",
- "api_call_action": "api_call"
- },
- "api_call_action": "api_call",
- "use_action_mask": false,
- "slot_filler": {
- "config_path": "{CONFIGS_PATH}/ner/slotfill_dstc2.json"
- },
- "intent_classifier": null,
- "embedder": {
- "class_name": "glove",
- "load_path": "{DOWNLOADS_PATH}/embeddings/glove.6B.100d.txt"
- },
- "bow_embedder": {
- "class_name": "bow",
- "depth": "#word_vocab.__len__()",
- "with_counts": true
- },
- "tokenizer": {
- "class_name": "stream_spacy_tokenizer",
- "lowercase": false
- },
- "tracker": {
- "class_name": "featurized_tracker",
- "slot_names": ["pricerange", "this", "area", "food", "name"]
- }
- }
- ]
- },
- "train": {
- "epochs": 200,
- "batch_size": 8,
-
- "metrics": ["per_item_dialog_accuracy"],
- "validation_patience": 10,
- "val_every_n_batches": 15,
-
- "log_every_n_batches": 15,
- "show_examples": false,
- "evaluation_targets": [
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "DATA_PATH": "{DOWNLOADS_PATH}/dstc2_v3",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/gobot_dstc2"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/gobot_dstc2_v9.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/dstc2_v3.tar.gz",
- "subdir": "{DATA_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/go_bot/gobot_dstc2_best.json b/deeppavlov/configs/go_bot/gobot_dstc2_best.json
deleted file mode 100644
index b13c680f9e..0000000000
--- a/deeppavlov/configs/go_bot/gobot_dstc2_best.json
+++ /dev/null
@@ -1,133 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DSTC2_DATA_PATH}"
- },
- "dataset_iterator": {
- "class_name": "dialog_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "pipe": [
- {
- "class_name": "dialog_component_wrapper",
- "component": { "class_name": "split_tokenizer" },
- "in": ["x"],
- "out": ["x_tokens"]
- },
- {
- "id": "token_vocab",
- "fit_on": ["x_tokens"],
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/gobot_dstc2_best/word.dict",
- "load_path": "{MODELS_PATH}/gobot_dstc2_best/word.dict"
- },
- {
- "id": "restaurant_database",
- "class_name": "sqlite_database",
- "table_name": "mytable",
- "primary_keys": ["name"],
- "save_path": "{DOWNLOADS_PATH}/dstc2_v3/resto.sqlite"
- },
- {
- "class_name": "go_bot",
- "load_path": "{MODELS_PATH}/gobot_dstc2_best/model",
- "save_path": "{MODELS_PATH}/gobot_dstc2_best/model",
- "in": ["x"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "main": true,
- "debug": false,
- "learning_rate": 3e-3,
- "learning_rate_drop_patience": 10,
- "learning_rate_drop_div": 4.0,
- "momentum": 0.95,
- "optimizer": "tensorflow.train:AdamOptimizer",
- "clip_norm": 2.0,
- "dropout_rate": 0.75,
- "l2_reg_coef": 9e-4,
- "hidden_size": 128,
- "dense_size": 128,
- "attention_mechanism": {
- "type": "general",
- "hidden_size": 32,
- "action_as_key": true,
- "intent_as_key": true,
- "max_num_tokens": 100,
- "projected_align": false
- },
- "word_vocab": "#token_vocab",
- "database": "#restaurant_database",
- "nlg_manager": {
- "class_name": "gobot_nlg_manager",
- "template_path": "{DSTC2_DATA_PATH}/dstc2-templates.txt",
- "template_type": "DualTemplate",
- "api_call_action": "api_call"
- },
- "use_action_mask": false,
- "slot_filler": {
- "config_path": "{CONFIGS_PATH}/ner/slotfill_dstc2.json"
- },
- "intent_classifier": {
- "config_path": "{CONFIGS_PATH}/classifiers/intents_dstc2.json"
- },
- "embedder": {
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin"
- },
- "bow_embedder": null,
- "tokenizer": {
- "class_name": "stream_spacy_tokenizer",
- "lowercase": false
- },
- "tracker": {
- "class_name": "featurized_tracker",
- "slot_names": ["pricerange", "this", "area", "food", "name"]
- }
- }
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 8,
-
- "pytest_max_batches": 2,
-
- "metrics": ["per_item_dialog_accuracy"],
- "validation_patience": 15,
- "val_every_n_batches": 15,
-
- "log_every_n_batches": 15,
- "show_examples": false,
- "evaluation_targets": [
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
- "DSTC2_DATA_PATH": "{DOWNLOADS_PATH}/dstc2_v3"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/gobot_dstc2_best_v4.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/dstc2_v3.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/dstc2_v3"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/go_bot/gobot_dstc2_best_json_nlg.json b/deeppavlov/configs/go_bot/gobot_dstc2_best_json_nlg.json
deleted file mode 100644
index 19202edb74..0000000000
--- a/deeppavlov/configs/go_bot/gobot_dstc2_best_json_nlg.json
+++ /dev/null
@@ -1,133 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DSTC2_DATA_PATH}"
- },
- "dataset_iterator": {
- "class_name": "dialog_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "pipe": [
- {
- "class_name": "dialog_component_wrapper",
- "component": { "class_name": "split_tokenizer" },
- "in": ["x"],
- "out": ["x_tokens"]
- },
- {
- "id": "token_vocab",
- "fit_on": ["x_tokens"],
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/gobot_dstc2_best_json/word.dict",
- "load_path": "{MODELS_PATH}/gobot_dstc2_best_json/word.dict"
- },
- {
- "id": "restaurant_database",
- "class_name": "sqlite_database",
- "table_name": "mytable",
- "primary_keys": ["name"],
- "save_path": "{DOWNLOADS_PATH}/dstc2/resto.sqlite"
- },
- {
- "class_name": "go_bot",
- "load_path": "{MODELS_PATH}/gobot_dstc2_best_json/model",
- "save_path": "{MODELS_PATH}/gobot_dstc2_best_json/model",
- "in": ["x"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "main": true,
- "debug": false,
- "learning_rate": 3e-3,
- "learning_rate_drop_patience": 10,
- "learning_rate_drop_div": 4.0,
- "momentum": 0.95,
- "optimizer": "tensorflow.train:AdamOptimizer",
- "clip_norm": 2.0,
- "dropout_rate": 0.75,
- "l2_reg_coef": 9e-4,
- "hidden_size": 128,
- "dense_size": 128,
- "attention_mechanism": {
- "type": "general",
- "hidden_size": 32,
- "action_as_key": true,
- "intent_as_key": true,
- "max_num_tokens": 100,
- "projected_align": false
- },
- "word_vocab": "#token_vocab",
- "database": "#restaurant_database",
- "nlg_manager": {
- "class_name": "gobot_json_nlg_manager",
- "data_path": "{DSTC2_DATA_PATH}",
- "actions2slots_path": "{DSTC2_DATA_PATH}/dstc2-actions2slots.json",
- "api_call_action": "api_call"
- },
- "use_action_mask": false,
- "slot_filler": {
- "config_path": "{CONFIGS_PATH}/ner/slotfill_dstc2.json"
- },
- "intent_classifier": {
- "config_path": "{CONFIGS_PATH}/classifiers/intents_dstc2.json"
- },
- "embedder": {
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin"
- },
- "bow_embedder": null,
- "tokenizer": {
- "class_name": "stream_spacy_tokenizer",
- "lowercase": false
- },
- "tracker": {
- "class_name": "featurized_tracker",
- "slot_names": ["pricerange", "this", "area", "food", "name"]
- }
- }
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 8,
-
- "pytest_max_batches": 2,
-
- "metrics": ["per_item_action_accuracy"],
- "validation_patience": 15,
- "val_every_n_batches": 15,
-
- "log_every_n_batches": 15,
- "show_examples": false,
- "evaluation_targets": [
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
- "DSTC2_DATA_PATH": "{DOWNLOADS_PATH}/dstc2_v3"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/gobot_dstc2_best_v4.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/dstc2_v3.tar.gz",
- "subdir": "{DSTC2_DATA_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/go_bot/gobot_simple_dstc2.json b/deeppavlov/configs/go_bot/gobot_simple_dstc2.json
deleted file mode 100644
index 52093d0ecb..0000000000
--- a/deeppavlov/configs/go_bot/gobot_simple_dstc2.json
+++ /dev/null
@@ -1,125 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DSTC2_DATA_PATH}"
- },
- "dataset_iterator": {
- "class_name": "dialog_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "pipe": [
- {
- "class_name": "dialog_component_wrapper",
- "component": { "class_name": "split_tokenizer" },
- "in": ["x"],
- "out": ["x_tokens"]
- },
- {
- "id": "word_vocab",
- "class_name": "simple_vocab",
- "fit_on": ["x_tokens"],
- "save_path": "{MODEL_PATH}/word.dict",
- "load_path": "{MODEL_PATH}/word.dict"
- },
- {
- "class_name": "go_bot",
- "load_path": "{MODEL_PATH}/model",
- "save_path": "{MODEL_PATH}/model",
- "in": ["x"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "main": true,
- "debug": false,
- "learning_rate": 0.003,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 10.0,
- "momentum": 0.95,
- "optimizer": "tensorflow.train:AdamOptimizer",
- "clip_norm": 2.0,
- "dropout_rate": 0.4,
- "l2_reg_coef": 3e-4,
- "hidden_size": 128,
- "dense_size": 160,
- "word_vocab": "#word_vocab",
- "database": {
- "class_name": "sqlite_database",
- "table_name": "mytable",
- "primary_keys": ["name"],
- "save_path": "{DSTC2_DATA_PATH}/resto.sqlite"
- },
- "nlg_manager": {
- "class_name": "gobot_nlg_manager",
- "template_path": "{DSTC2_DATA_PATH}/dstc2-templates.txt",
- "template_type": "DualTemplate",
- "api_call_action": "api_call"
- },
- "api_call_action": "api_call",
- "use_action_mask": false,
- "slot_filler": {
- "config_path": "{CONFIGS_PATH}/ner/slotfill_dstc2.json"
- },
- "intent_classifier": null,
- "embedder": {
- "class_name": "glove",
- "load_path": "{DOWNLOADS_PATH}/embeddings/glove.6B.100d.txt"
- },
- "bow_embedder": {
- "class_name": "bow",
- "depth": "#word_vocab.__len__()",
- "with_counts": true
- },
- "tokenizer": {
- "class_name": "stream_spacy_tokenizer",
- "lowercase": false
- },
- "tracker": {
- "class_name": "featurized_tracker",
- "slot_names": ["pricerange", "this", "area", "food", "name"]
- }
- }
- ]
- },
- "train": {
- "epochs": 200,
- "batch_size": 8,
-
- "metrics": ["per_item_dialog_accuracy"],
- "validation_patience": 10,
- "val_every_n_batches": 15,
-
- "log_every_n_batches": 15,
- "show_examples": false,
- "evaluation_targets": [
- "valid",
- "test"
- ],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "DSTC2_DATA_PATH": "{DOWNLOADS_PATH}/dstc2_v3",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/gobot_dstc2"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/gobot_dstc2_v9.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/dstc2_v3.tar.gz",
- "subdir": "{DSTC2_DATA_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ar.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ar.json
deleted file mode 100644
index 1abd931d5d..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ar.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "ar",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ar/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ar/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ar/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ar/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ar/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ar/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/ar.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/ar"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/ar.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/ar"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_cs.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_cs.json
deleted file mode 100644
index 047a08cf24..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_cs.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "cs",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/cs/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/cs/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/cs/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/cs/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/cs/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/cs/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/cs.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/cs"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/cs.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/cs"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_de.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_de.json
deleted file mode 100644
index c0c7aa19f1..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_de.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "de",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/de/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/de/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/de/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/de/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/de/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/de/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/de.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/de"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/de.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/de"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_en.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_en.json
deleted file mode 100644
index dd771f8216..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_en.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "en",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/en/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/en/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/en/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/en/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/en/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/en/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/en.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/en"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/en.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/en"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_es_ancora.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_es_ancora.json
deleted file mode 100644
index ce6c39f736..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_es_ancora.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "es_ancora",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/es_ancora.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/es_ancora.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/es_ancora"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_fr.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_fr.json
deleted file mode 100644
index 7c944e807a..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_fr.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "fr",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/fr/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/fr/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/fr/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/fr/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/fr/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/fr/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/fr.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/fr"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/fr.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/fr"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hi.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hi.json
deleted file mode 100644
index ff10e2e4ba..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hi.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "hi",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hi/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hi/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hi/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hi/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hi/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hi/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/hi.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/hi"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/hi.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/hi"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hu.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hu.json
deleted file mode 100644
index 6e399a3a43..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hu.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "hu",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hu/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hu/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hu/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hu/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hu/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hu/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/hu.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/hu"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/hu.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/hu"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_it.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_it.json
deleted file mode 100644
index a84510a2e2..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_it.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "it",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/it/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/it/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/it/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/it/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/it/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/it/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/it.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/it"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/it.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/it"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus.json
deleted file mode 100644
index b5ec00c85e..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus.json
+++ /dev/null
@@ -1,173 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "ru_syntagrus",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/ru_syntagrus.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/ru_syntagrus.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/ru_syntagrus"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy.json
deleted file mode 100644
index ef67338faa..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy.json
+++ /dev/null
@@ -1,193 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "ru_syntagrus",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict"
- },
- {
- "id": "pymorphy_vectorizer",
- "class_name": "pymorphy_vectorizer",
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
- "max_pymorphy_variants": 5,
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_possible_tags"
- ]
- },
- {
- "in": [
- "x_chars_lowered_marked",
- "x_possible_tags"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01,
- "word_vectorizers": [
- [
- "#pymorphy_vectorizer.dim",
- 128
- ]
- ]
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/ru_syntagrus.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/ru_syntagrus.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/ru_syntagrus"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy_lemmatize.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy_lemmatize.json
deleted file mode 100644
index da40a4f2a5..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy_lemmatize.json
+++ /dev/null
@@ -1,201 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "ru_syntagrus",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "language": "russian",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict"
- },
- {
- "id": "pymorphy_vectorizer",
- "class_name": "pymorphy_vectorizer",
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
- "max_pymorphy_variants": 5,
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_possible_tags"
- ]
- },
- {
- "in": [
- "x_chars_lowered_marked",
- "x_possible_tags"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01,
- "word_vectorizers": [
- [
- "#pymorphy_vectorizer.dim",
- 128
- ]
- ]
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_lemmas"
- ],
- "class_name": "UD_pymorphy_lemmatizer",
- "end": "\n"
- },
- {
- "in": [
- "x_tokens",
- "y_predicted",
- "y_lemmas"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "lemmatized_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/ru_syntagrus.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/ru_syntagrus.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/ru_syntagrus"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_tr.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_tr.json
deleted file mode 100644
index e7887f1560..0000000000
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_tr.json
+++ /dev/null
@@ -1,174 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "tr",
- "data_types": [
- "train",
- "dev",
- "test"
- ]
- },
- "dataset_iterator": {
- "class_name": "morphotagger_dataset",
- "min_train_fraction": 0.9
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "id": "char_splitting_lowercase_preprocessor",
- "class_name": "char_splitting_lowercase_preprocessor",
- "in": [
- "x_tokens"
- ],
- "out": [
- "x_chars_lowered_marked"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": [
- "y"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tr/tag.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tr/tag.dict"
- },
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": [
- "x_chars_lowered_marked"
- ],
- "special_tokens": [
- "PAD",
- "BEGIN",
- "END"
- ],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tr/char.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tr/char.dict"
- },
- {
- "in": [
- "x_chars_lowered_marked"
- ],
- "in_y": [
- "y"
- ],
- "out": [
- "y_predicted"
- ],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tr/model.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tr/model.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32,
- "char_window_size": [
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7
- ],
- "word_lstm_units": 128,
- "conv_dropout": 0.0,
- "char_conv_layers": 1,
- "char_highway_layers": 1,
- "highway_dropout": 0.0,
- "word_lstm_layers": 1,
- "char_filter_multiple": 50,
- "intermediate_dropout": 0.0,
- "word_dropout": 0.2,
- "lstm_dropout": 0.2,
- "regularizer": 0.01
- },
- {
- "in": [
- "x_tokens",
- "y_predicted"
- ],
- "out": [
- "y_prettified"
- ],
- "id": "prettifier",
- "class_name": "tag_output_prettifier",
- "end": "\n"
- }
- ],
- "out": [
- "y_prettified"
- ]
- },
- "train": {
- "epochs": 50,
- "batch_size": 32,
- "metrics": [
- {
- "name": "per_token_accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- },
- {
- "name": "accuracy",
- "inputs": [
- "y",
- "y_predicted"
- ]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "RESULTS_PATH": "{ROOT_PATH}/results"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/morpho_tagger/UD2.0/tr.tar.gz",
- "subdir": "{MODELS_PATH}/morpho_tagger/UD2.0/tr"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/UD2.0_source/tr.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/UD2.0_source/tr"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/nemo/asr.json b/deeppavlov/configs/nemo/asr.json
deleted file mode 100644
index 410e0ac560..0000000000
--- a/deeppavlov/configs/nemo/asr.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "chainer": {
- "in": "speech",
- "pipe": [
- {
- "class_name": "nemo_asr",
- "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml",
- "load_path": "{NEMO_PATH}/quartznet15x5",
- "in": ["speech"],
- "out": ["text"]
- }
- ],
- "out": ["text"]
- },
- "metadata": {
- "variables": {
- "NEMO_PATH": "~/.deeppavlov/models/nemo"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz",
- "subdir": "{NEMO_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/nemo/asr_tts.json b/deeppavlov/configs/nemo/asr_tts.json
deleted file mode 100644
index 8ecc10c304..0000000000
--- a/deeppavlov/configs/nemo/asr_tts.json
+++ /dev/null
@@ -1,48 +0,0 @@
-{
- "chainer": {
- "in": "speech_in_encoded",
- "pipe": [
- {
- "class_name": "base64_decode_bytesIO",
- "in": ["speech_in_encoded"],
- "out": ["speech_in"]
- },
- {
- "class_name": "nemo_asr",
- "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml",
- "load_path": "{NEMO_PATH}/quartznet15x5",
- "in": ["speech_in"],
- "out": ["text"]
- },
- {
- "class_name": "nemo_tts",
- "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml",
- "load_path": "{TTS_PATH}",
- "in": ["text"],
- "out": ["speech_out"]
- },
- {
- "class_name": "bytesIO_encode_base64",
- "in": ["speech_out"],
- "out": ["speech_out_encoded"]
- }
- ],
- "out": ["text", "speech_out_encoded"]
- },
- "metadata": {
- "variables": {
- "NEMO_PATH": "~/.deeppavlov/models/nemo",
- "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz",
- "subdir": "{NEMO_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz",
- "subdir": "{NEMO_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/nemo/tts.json b/deeppavlov/configs/nemo/tts.json
deleted file mode 100644
index 6cbac9a043..0000000000
--- a/deeppavlov/configs/nemo/tts.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
- "chainer": {
- "in": ["text", "filepath"],
- "pipe": [
- {
- "class_name": "nemo_tts",
- "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml",
- "load_path": "{TTS_PATH}",
- "in": ["text", "filepath"],
- "out": ["saved_path"]
- }
- ],
- "out": ["saved_path"]
- },
- "metadata": {
- "variables": {
- "NEMO_PATH": "~/.deeppavlov/models/nemo",
- "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz",
- "subdir": "{NEMO_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/ner/ner_conll2003_pos.json b/deeppavlov/configs/ner/ner_conll2003_pos.json
deleted file mode 100644
index 3ddd6ab55d..0000000000
--- a/deeppavlov/configs/ner/ner_conll2003_pos.json
+++ /dev/null
@@ -1,189 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "conll2003_reader",
- "data_path": "{DOWNLOADS_PATH}/conll2003/",
- "dataset_name": "conll2003",
- "provide_pos": true
- },
- "dataset_iterator": {
- "class_name": "data_learning_iterator"
- },
- "chainer": {
- "in": ["x", "pos"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x_tokens"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "str_lower",
- "out": ["x_lower"]
- },
- {
- "in": ["x_lower"],
- "class_name": "sanitizer",
- "nums": true,
- "out": ["x_san"]
- },
- {
- "in": ["x_san"],
- "id": "word_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "special_tokens": [""],
- "fit_on": ["x_san"],
- "save_path": "{MODELS_PATH}/ner_conll2003/word.dict",
- "load_path": "{MODELS_PATH}/ner_conll2003/word.dict",
- "out": ["x_tok_ind"]
- },
- {
- "in": ["pos"],
- "id": "pos_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "fit_on": ["pos"],
- "save_path": "{MODELS_PATH}/ner_conll2003/pos.dict",
- "load_path": "{MODELS_PATH}/ner_conll2003/pos.dict",
- "out": ["pos_ind"]
- },
- {
- "in": ["pos_ind"],
- "class_name": "one_hotter",
- "depth": "#pos_vocab.len",
- "pad_zeros": true,
- "out": ["pos_one_hot"]
- },
- {
- "in": ["y"],
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "fit_on": ["y"],
- "save_path": "{MODELS_PATH}/ner_conll2003/tag.dict",
- "load_path": "{MODELS_PATH}/ner_conll2003/tag.dict",
- "out": ["y_ind"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "char_splitter",
- "out": ["x_char"]
- },
- {
- "in": ["x_char"],
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "fit_on": ["x_char"],
- "save_path": "{MODELS_PATH}/ner_conll2003/char.dict",
- "load_path": "{MODELS_PATH}/ner_conll2003/char.dict",
- "out": ["x_char_ind"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "mask",
- "out": ["mask"]
- },
- {
- "in": ["x_san"],
- "id": "glove_emb",
- "class_name": "glove",
- "pad_zero": true,
- "load_path": "{DOWNLOADS_PATH}/embeddings/glove.6B.100d.txt",
-
- "out": ["x_emb"]
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#glove_emb",
- "vocab": "#word_vocab"
- },
- {
- "id": "embeddings_char",
- "class_name": "emb_mat_assembler",
- "character_level": true,
- "emb_dim": 32,
- "embedder": "#glove_emb",
- "vocab": "#char_vocab"
- },
- {
- "id": "capitalization",
- "class_name": "capitalization_featurizer",
- "in": ["x_tokens"],
- "out": ["cap"]
- },
- {
- "in": ["x_emb", "mask", "x_char_ind", "cap", "pos_one_hot"],
- "in_y": ["y_ind"],
- "out": ["y_predicted"],
- "class_name": "ner",
- "main": true,
- "token_emb_dim": "#glove_emb.dim",
- "n_hidden_list": [128],
- "net_type": "rnn",
- "cell_type": "lstm",
- "use_cudnn_rnn": true,
- "n_tags": "#tag_vocab.len",
- "capitalization_dim": "#capitalization.dim",
- "char_emb_dim": "#embeddings_char.dim",
- "pos_features_dim": "#pos_vocab.len",
- "save_path": "{MODELS_PATH}/ner_conll2003/model",
- "load_path": "{MODELS_PATH}/ner_conll2003/model",
- "char_emb_mat": "#embeddings_char.emb_mat",
- "two_dense_on_top": true,
- "use_crf": true,
- "use_batch_norm": true,
- "embeddings_dropout": true,
- "top_dropout": true,
- "intra_layer_dropout": true,
- "l2_reg": 0,
- "learning_rate": 1e-2,
- "dropout_keep_prob": 0.7
- },
- {
- "ref": "tag_vocab",
- "in": ["y_predicted"],
- "out": ["tags"]
- }
- ],
-
- "out": ["x_tokens", "tags"]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
-
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": ["y", "tags"]
- }
- ],
- "validation_patience": 7,
- "val_every_n_epochs": 1,
-
- "log_every_n_epochs": 1,
- "show_examples": false,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ner/ner_dstc2.json b/deeppavlov/configs/ner/ner_dstc2.json
deleted file mode 100644
index 4f35d4b530..0000000000
--- a/deeppavlov/configs/ner/ner_dstc2.json
+++ /dev/null
@@ -1,126 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DATA_PATH}"
- },
- "dataset_iterator": {
- "class_name": "dstc2_ner_iterator",
- "slot_values_path": "{SLOT_VALS_PATH}"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x_tokens"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "str_lower",
- "out": ["x_lower"]
- },
- {
- "in": ["x_lower"],
- "id": "word_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "fit_on": ["x_lower"],
- "save_path": "{MODEL_PATH}/word.dict",
- "load_path": "{MODEL_PATH}/word.dict",
- "out": ["x_tok_ind"]
- },
- {
- "class_name": "random_emb_mat",
- "id": "embeddings",
- "vocab_len": "#word_vocab.len",
- "emb_dim": 100
- },
- {
- "in": ["y"],
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "fit_on": ["y"],
- "save_path": "{MODEL_PATH}/tag.dict",
- "load_path": "{MODEL_PATH}/tag.dict",
- "out": ["y_ind"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "mask",
- "out": ["mask"]
- },
- {
- "in": ["x_tok_ind", "mask"],
- "in_y": ["y_ind"],
- "out": ["y_predicted"],
- "class_name": "ner",
- "main": true,
- "token_emb_mat": "#embeddings.emb_mat",
- "n_hidden_list": [64, 64],
- "net_type": "cnn",
- "n_tags": "#tag_vocab.len",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "embeddings_dropout": true,
- "top_dropout": true,
- "intra_layer_dropout": false,
- "use_batch_norm": true,
- "learning_rate": 1e-2,
- "dropout_keep_prob": 0.5
- },
- {
- "ref": "tag_vocab",
- "in": ["y_predicted"],
- "out": ["tags"]
- }
- ],
- "out": ["x_tokens", "tags"]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
-
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": ["y", "tags"]
- },
- {
- "name": "per_token_accuracy",
- "inputs": ["y", "tags"]
- }
- ],
- "validation_patience": 5,
- "val_every_n_epochs": 5,
-
- "log_every_n_batches": 100,
- "show_examples": false,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DATA_PATH": "{ROOT_PATH}/downloads/dstc2",
- "SLOT_VALS_PATH": "{DATA_PATH}/dstc_slot_vals.json",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/slotfill_dstc2"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/dstc_slot_vals.tar.gz",
- "subdir": "{DATA_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/slotfill_dstc2.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
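
The "chainer" block above is typical of every config in this diff: the "pipe" list runs in order, and each component consumes the names in its "in" field and publishes the names in its "out" field. A small inspection script makes that data flow explicit (point it at any config file that still exists after this change):

import json
from pathlib import Path

config = json.loads(Path("deeppavlov/configs/ner/ner_rus_bert.json").read_text())

for component in config["chainer"]["pipe"]:
    # components are named by "class_name", or reuse another one via "ref"/"config_path"
    name = component.get("class_name") or component.get("ref") or component.get("config_path")
    print(component.get("in", []), "->", name, "->", component.get("out", []))
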
diff --git a/deeppavlov/configs/ner/ner_few_shot_ru.json b/deeppavlov/configs/ner/ner_few_shot_ru.json
deleted file mode 100644
index ad60b46567..0000000000
--- a/deeppavlov/configs/ner/ner_few_shot_ru.json
+++ /dev/null
@@ -1,104 +0,0 @@
-{
- "deeppavlov_root": ".",
- "dataset_reader": {
- "class_name": "conll2003_reader",
- "data_path": "{DOWNLOADS_PATH}/ner_few_shot_data/"
- },
- "dataset_iterator": {
- "class_name": "data_learning_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["tags"],
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x_tokens"]
- },
- {
- "in": ["tags"],
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": false,
- "fit_on": ["tags"],
- "save_path": "{MODELS_PATH}/ner_fs/tag.dict",
- "load_path": "{MODELS_PATH}/ner_fs/tag.dict",
- "out": ["tag_indices"]
- },
- {
- "class_name": "elmo_embedder",
- "elmo_output_names": ["lstm_outputs1", "lstm_outputs2", "word_emb"],
- "mini_batch_size": 32,
- "in": ["x_tokens"],
- "spec": "{DOWNLOADS_PATH}/embeddings/elmo_ru_news",
- "out": [
- "tokens_emb"
- ]
- },
- {
- "class_name": "ner_svm",
- "in": "tokens_emb",
- "out": "tag_indices",
- "fit_on": ["tokens_emb", "tag_indices"],
- "save_path": "{MODELS_PATH}/ner_fs/model",
- "load_path": "{MODELS_PATH}/ner_fs/model"
- },
- {
- "ref": "tag_vocab",
- "in": ["tag_indices"],
- "out": ["tags_hat"]
- },
- {
- "class_name": "ner_bio_converter",
- "in": ["tags_hat"],
- "out": ["tags_bio_hat"]
- },
- {
- "class_name": "ner_bio_converter",
- "in": ["tags"],
- "out": ["tags_bio"]
- }
-
- ],
-
- "out": ["x_tokens", "tags_bio_hat"]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": [
- "tags_bio",
- "tags_bio_hat"
- ]
- }
- ],
- "validation_patience": 7,
- "val_every_n_epochs": 1,
-
- "log_every_n_epochs": 1,
- "show_examples": false,
- "tensorboard_log_dir": "{MODELS_PATH}/ner_fs/logs",
- "class_name": "fit_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/embeddings/elmo_ru_news"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ner/ner_few_shot_ru_simulate.json b/deeppavlov/configs/ner/ner_few_shot_ru_simulate.json
deleted file mode 100644
index cb58707224..0000000000
--- a/deeppavlov/configs/ner/ner_few_shot_ru_simulate.json
+++ /dev/null
@@ -1,140 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "conll2003_reader",
- "dataset_name": "collection_rus",
- "data_path": "{DOWNLOADS_PATH}/ner_few_shot_data/"
- },
- "dataset_iterator": {
- "class_name": "ner_few_shot_iterator",
- "target_tag": "PER"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "tags"
- ],
- "pipe": [
- {
- "in": [
- "x"
- ],
- "class_name": "lazy_tokenizer",
- "out": [
- "x_tokens"
- ]
- },
- {
- "in": [
- "tags"
- ],
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": false,
- "fit_on": [
- "tags"
- ],
- "save_path": "{MODELS_PATH}/ner_fs/tag.dict",
- "load_path": "{MODELS_PATH}/ner_fs/tag.dict",
- "out": [
- "tag_indices"
- ]
- },
- {
- "class_name": "elmo_embedder",
- "elmo_output_names": [
- "lstm_outputs1",
- "lstm_outputs2",
- "word_emb"
- ],
- "mini_batch_size": 32,
- "in": [
- "x_tokens"
- ],
- "spec": "{DOWNLOADS_PATH}/embeddings/elmo_ru_news",
- "out": [
- "tokens_emb"
- ]
- },
- {
- "class_name": "ner_svm",
- "in": "tokens_emb",
- "out": "tag_indices",
- "fit_on": [
- "tokens_emb",
- "tag_indices"
- ],
- "save_path": "{MODELS_PATH}/ner_fs/model",
- "load_path": "{MODELS_PATH}/ner_fs/model"
- },
- {
- "ref": "tag_vocab",
- "in": [
- "tag_indices"
- ],
- "out": [
- "tags_hat"
- ]
- },
- {
- "class_name": "ner_bio_converter",
- "in": [
- "tags_hat"
- ],
- "out": [
- "tags_bio_hat"
- ]
- },
- {
- "class_name": "ner_bio_converter",
- "in": [
- "tags"
- ],
- "out": [
- "tags_bio"
- ]
- }
- ],
- "out": [
- "x_tokens",
- "tags_bio_hat"
- ]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": [
- "tags_bio",
- "tags_bio_hat"
- ]
- }
- ],
- "validation_patience": 7,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "tensorboard_log_dir": "{MODELS_PATH}/ner_fs/logs",
- "class_name": "fit_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/embeddings/elmo_ru_news"
- }
- ]
- }
-}
\ No newline at end of file
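
Both few-shot configs above implement the same idea: tag each token independently with an SVM ("ner_svm") fitted on contextual ELMo embeddings. A simplified stand-alone sketch of that approach, with random vectors standing in for the ELMo output (illustrative only, not the removed component's code):

import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.default_rng(42)
tokens_emb = rng.normal(size=(200, 1024))   # one row per token (stub for ELMo vectors)
tag_indices = rng.integers(0, 3, size=200)  # per-token tag ids, e.g. O / B-PER / I-PER

clf = LinearSVC().fit(tokens_emb, tag_indices)  # the "fit_on" step of ner_svm
print(clf.predict(tokens_emb[:5]))              # per-token tag predictions
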
diff --git a/deeppavlov/configs/ner/ner_kb_rus.json b/deeppavlov/configs/ner/ner_kb_rus.json
deleted file mode 100644
index 1bef6b87b1..0000000000
--- a/deeppavlov/configs/ner/ner_kb_rus.json
+++ /dev/null
@@ -1,164 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "sq_reader",
- "data_path": "{DOWNLOADS_PATH}/ner_sq/SQ_rus_dataset_zs.pckl"
- },
- "dataset_iterator": {
- "class_name": "data_learning_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x_tokens"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "str_lower",
- "out": ["x_lower"]
- },
- {
- "in": ["x_lower"],
- "class_name": "sanitizer",
- "nums": true,
- "out": ["x_san"]
- },
- {
- "in": ["x_san"],
- "id": "word_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "special_tokens": [""],
- "fit_on": ["x_san"],
- "save_path": "{MODEL_PATH}/ner/word.dict",
- "load_path": "{MODEL_PATH}/ner/word.dict",
- "out": ["x_tok_ind"]
- },
- {
- "in": ["y"],
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "fit_on": ["y"],
- "save_path": "{MODEL_PATH}/ner/tag.dict",
- "load_path": "{MODEL_PATH}/ner/tag.dict",
- "out": ["y_ind"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "char_splitter",
- "out": ["x_char"]
- },
- {
- "in": ["x_char"],
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "pad_with_zeros": true,
- "fit_on": ["x_char"],
- "save_path": "{MODEL_PATH}/ner/char.dict",
- "load_path": "{MODEL_PATH}/ner/char.dict",
- "out": ["x_char_ind"]
- },
- {
- "in": ["x_san"],
- "id": "embedder",
- "class_name": "fasttext",
- "pad_zero": true,
- "load_path": "{DOWNLOADS_PATH}/embeddings/lenta_lower_100.bin",
- "out": ["x_emb"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "mask",
- "out": ["mask"]
- },
- {
- "class_name": "random_emb_mat",
- "id": "embeddings",
- "vocab_len": "#word_vocab.len",
- "emb_dim": 100
- },
- {
- "class_name": "random_emb_mat",
- "id": "embeddings_char",
- "vocab_len": "#char_vocab.len",
- "emb_dim": 100
- },
- {
- "in": ["x_emb", "mask", "x_char_ind"],
- "in_y": ["y_ind"],
- "out": ["y_predicted"],
- "class_name": "ner",
- "main": true,
- "n_hidden_list": [128],
- "net_type": "rnn",
- "cell_type": "lstm",
- "use_cudnn_rnn": true,
- "n_tags": "#tag_vocab.len",
- "token_emb_dim": "#embedder.dim",
- "char_emb_dim": 100,
- "save_path": "{MODEL_PATH}/ner/model",
- "load_path": "{MODEL_PATH}/ner/model",
- "char_emb_mat": "#embeddings_char.emb_mat",
- "use_crf": true,
- "use_batch_norm": true,
- "embeddings_dropout": true,
- "top_dropout": true,
- "intra_layer_dropout": true,
- "l2_reg": 0,
- "learning_rate": 1e-2,
- "dropout_keep_prob": 0.7
- },
- {
- "ref": "tag_vocab",
- "in": ["y_predicted"],
- "out": ["tags"]
- }
- ],
-
- "out": ["x_tokens", "tags"]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
-
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": ["y", "tags"]
- }
- ],
- "validation_patience": 7,
- "val_every_n_epochs": 1,
-
- "log_every_n_epochs": 1,
- "show_examples": false,
- "tensorboard_log_dir": "{MODEL_PATH}/ner/logs"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/kbqa_mix_lowercase"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_kb_rus.tar.gz",
- "subdir": "{MODEL_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/lenta_lower_100.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/SQ_rus_dataset_zs.pckl",
- "subdir": "{DOWNLOADS_PATH}/ner_sq"
- }
- ]
- }
-}
-
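
ner_kb_rus.json, like the other configs, wires components together with "#id.attribute" strings (e.g. "#tag_vocab.len", "#embedder.dim"): the value is looked up on a previously built component with that "id". A toy resolver showing the mechanism (not the library's actual implementation):

from types import SimpleNamespace

# stand-ins for already-built pipe components, keyed by their "id"
registry = {"tag_vocab": SimpleNamespace(len=7), "embedder": SimpleNamespace(dim=300)}

def resolve(value):
    if isinstance(value, str) and value.startswith("#"):
        component_id, attribute = value[1:].split(".", 1)
        return getattr(registry[component_id], attribute)
    return value

print(resolve("#tag_vocab.len"), resolve("#embedder.dim"))  # -> 7 300
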
diff --git a/deeppavlov/configs/ner/ner_lcquad_bert_probas.json b/deeppavlov/configs/ner/ner_lcquad_bert_probas.json
deleted file mode 100644
index 60c4febd57..0000000000
--- a/deeppavlov/configs/ner/ner_lcquad_bert_probas.json
+++ /dev/null
@@ -1,119 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "sq_reader",
- "data_path": "{DOWNLOADS_PATH}/lcquad/entity_detection_dataset.pickle"
- },
- "dataset_iterator": {
- "class_name": "data_learning_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "bert_ner_preprocessor",
- "vocab_file": "{BERT_PATH}/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 512,
- "max_subword_length": 15,
- "token_maksing_prob": 0.0,
- "in": ["x"],
- "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask"]
- },
- {
- "class_name": "mask",
- "in": ["x_subword_tokens"],
- "out": ["x_subword_mask"]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "unk_token": ["O"],
- "pad_with_zeros": true,
- "save_path": "{NER_PATH}/tag.dict",
- "load_path": "{NER_PATH}/tag.dict",
- "fit_on": ["y"],
- "in": ["y"],
- "out": ["y_ind"]
- },
- {
- "class_name": "bert_sequence_tagger",
- "n_tags": "#tag_vocab.len",
- "keep_prob": 0.1,
- "bert_config_file": "{BERT_PATH}/bert_config.json",
- "pretrained_bert": "{BERT_PATH}/bert_model.ckpt",
- "attention_probs_keep_prob": 0.5,
- "use_crf": false,
- "return_probas": true,
- "ema_decay": 0.9,
- "encoder_layer_ids": [-1],
- "optimizer": "tf.train:AdamOptimizer",
- "learning_rate": 1e-3,
- "bert_learning_rate": 2e-5,
- "min_learning_rate": 1e-7,
- "learning_rate_drop_patience": 30,
- "learning_rate_drop_div": 1.5,
- "load_before_drop": true,
- "clip_norm": 1.0,
- "save_path": "{NER_PATH}/model",
- "load_path": "{NER_PATH}/model",
- "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"],
- "in_y": ["y_ind"],
- "out": ["y_pred_ind"]
- }
- ],
- "out": ["x_tokens", "y_pred_ind"]
- },
- "train": {
- "epochs": 30,
- "batch_size": 16,
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": ["y", "y_pred"]
- },
- {
- "name": "ner_token_f1",
- "inputs": ["y", "y_pred"]
- }
- ],
- "validation_patience": 10,
- "val_every_n_batches": 400,
-
- "log_every_n_batches": 400,
- "tensorboard_log_dir": "{NER_PATH}/logs",
- "show_examples": false,
- "pytest_max_batches": 2,
- "pytest_batch_size": 8,
- "evaluation_targets": ["valid", "test"],
- "class_name": "nn_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12",
- "NER_PATH": "{MODELS_PATH}/ner_lcquad"
- },
- "labels": {
- "telegram_utils": "NERCoNLL2003Model",
- "server_utils": "NER"
- },
- "download": [
-
- {
- "url": "http://files.deeppavlov.ai/kbqa/datasets/entity_detection_dataset.pickle",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/kbqa/models/ner_lcquad.tar.gz",
- "subdir": "{MODELS_PATH}/ner_lcquad"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/ner/ner_ontonotes_m1.json b/deeppavlov/configs/ner/ner_ontonotes_m1.json
deleted file mode 100644
index 6e4e85d66e..0000000000
--- a/deeppavlov/configs/ner/ner_ontonotes_m1.json
+++ /dev/null
@@ -1,131 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "conll2003_reader",
- "data_path": "{DOWNLOADS_PATH}/ontonotes/",
- "dataset_name": "ontonotes",
- "provide_pos": false,
- "provide_chunk": false,
- "iobes": true
- },
- "dataset_iterator": {
- "class_name": "data_learning_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "out": ["x_tokens"],
- "class_name": "lazy_tokenizer"
- },
- {
- "in": ["x_tokens"],
- "out": ["x_lower", "sent_lengths", "x_tokens_elmo"],
- "class_name": "ner_preprocessor",
- "get_x_padded_for_elmo": true
- },
- {
- "in": ["x_lower"],
- "out": ["x_tok_ind"],
- "fit_on": ["x_lower"],
- "class_name": "ner_vocab",
- "id": "word_vocab",
- "save_path": "{MODEL_PATH}/word.dict",
- "load_path": "{MODEL_PATH}/word.dict"
- },
- {
- "in": ["y"],
- "out": ["y_ind"],
- "fit_on": ["y"],
- "class_name": "ner_vocab",
- "id": "tag_vocab",
- "save_path": "{MODEL_PATH}/tag.dict",
- "load_path": "{MODEL_PATH}/tag.dict"
- },
- {
- "in": ["x_tokens"],
- "out": ["x_char_ind"],
- "fit_on": ["x_tokens"],
- "class_name": "ner_vocab",
- "char_level": true,
- "id": "char_vocab",
- "save_path": "{MODEL_PATH}/char.dict",
- "load_path": "{MODEL_PATH}/char.dict"
- },
- {
- "in":[
- "sent_lengths",
- "x_tok_ind",
- "x_char_ind",
- "x_tokens_elmo"
- ],
- "in_y": ["y_ind"],
- "out": ["y_predicted"],
- "class_name": "hybrid_ner_model",
- "n_tags": "#tag_vocab.len",
- "word_emb_path": "{DOWNLOADS_PATH}/embeddings/glove.6B.100d.txt",
- "word_emb_name": "glove",
- "word_dim": 100,
- "word_vocab": "#word_vocab",
- "char_vocab_size": "#char_vocab.len",
- "char_dim": 100,
- "elmo_dim": 128,
- "lstm_hidden_size": 256,
- "save_path": "{MODEL_PATH}/ontonotes",
- "load_path": "{MODEL_PATH}/ontonotes",
- "learning_rate": 1e-3,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 10,
- "dropout_keep_prob": 0.7
- },
- {
- "in": ["y_predicted"],
- "out": ["tags"],
- "class_name": "convert_ids2tags",
- "id2tag": "#tag_vocab.i2t"
- }
- ],
- "out": ["x_tokens", "tags"]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": ["y", "tags"]
- },
- {
- "name": "ner_token_f1",
- "inputs": ["y", "tags"]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODEL_PATH": "{ROOT_PATH}/models/ontonotes"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_m1.tar.gz",
- "subdir": "{MODEL_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
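
Every "metadata.variables" block in these configs works the same way: "{NAME}" placeholders expand recursively, so ROOT_PATH feeds DOWNLOADS_PATH, which feeds the paths used by pipe components ({DEEPPAVLOV_PATH} is predefined by the library and omitted here). A toy model of the expansion:

from pathlib import Path

variables = {
    "ROOT_PATH": "~/.deeppavlov",
    "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
    "MODEL_PATH": "{ROOT_PATH}/models/ontonotes",
}

def expand(template: str) -> Path:
    while "{" in template:                    # substitute until no placeholders remain
        template = template.format(**variables)
    return Path(template).expanduser()

print(expand("{DOWNLOADS_PATH}/embeddings/glove.6B.100d.txt"))
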
diff --git a/deeppavlov/configs/ner/ner_rus_bert.json b/deeppavlov/configs/ner/ner_rus_bert.json
index 9a00116886..8a4c51ff5f 100644
--- a/deeppavlov/configs/ner/ner_rus_bert.json
+++ b/deeppavlov/configs/ner/ner_rus_bert.json
@@ -9,103 +9,146 @@
"class_name": "data_learning_iterator"
},
"chainer": {
- "in": ["x"],
- "in_y": ["y"],
+ "in": [
+ "x"
+ ],
+ "in_y": [
+ "y"
+ ],
"pipe": [
{
- "class_name": "bert_ner_preprocessor",
- "vocab_file": "{BERT_PATH}/vocab.txt",
+ "class_name": "torch_transformers_ner_preprocessor",
+ "vocab_file": "{TRANSFORMER}",
"do_lower_case": false,
"max_seq_length": 512,
"max_subword_length": 15,
"token_masking_prob": 0.0,
- "in": ["x"],
- "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask"]
+ "in": [
+ "x"
+ ],
+ "out": [
+ "x_tokens",
+ "x_subword_tokens",
+ "x_subword_tok_ids",
+ "startofword_markers",
+ "attention_mask"
+ ]
},
{
"id": "tag_vocab",
"class_name": "simple_vocab",
- "unk_token": ["O"],
+ "unk_token": [
+ "O"
+ ],
"pad_with_zeros": true,
- "save_path": "{NER_PATH}/tag.dict",
- "load_path": "{NER_PATH}/tag.dict",
- "fit_on": ["y"],
- "in": ["y"],
- "out": ["y_ind"]
+ "save_path": "{MODEL_PATH}/tag.dict",
+ "load_path": "{MODEL_PATH}/tag.dict",
+ "fit_on": [
+ "y"
+ ],
+ "in": [
+ "y"
+ ],
+ "out": [
+ "y_ind"
+ ]
},
{
- "class_name": "bert_sequence_tagger",
+ "class_name": "torch_transformers_sequence_tagger",
"n_tags": "#tag_vocab.len",
- "keep_prob": 0.1,
- "bert_config_file": "{BERT_PATH}/bert_config.json",
- "pretrained_bert": "{BERT_PATH}/bert_model.ckpt",
+ "pretrained_bert": "{TRANSFORMER}",
"attention_probs_keep_prob": 0.5,
- "use_crf": true,
- "ema_decay": 0.9,
"return_probas": false,
- "encoder_layer_ids": [-1],
- "optimizer": "tf.train:AdamOptimizer",
- "learning_rate": 1e-3,
- "bert_learning_rate": 2e-5,
- "min_learning_rate": 1e-7,
+ "encoder_layer_ids": [
+ -1
+ ],
+ "optimizer": "AdamW",
+ "optimizer_parameters": {
+ "lr": 2e-05,
+ "weight_decay": 1e-06,
+ "betas": [
+ 0.9,
+ 0.999
+ ],
+ "eps": 1e-06
+ },
+ "clip_norm": 1.0,
+ "min_learning_rate": 1e-07,
"learning_rate_drop_patience": 30,
"learning_rate_drop_div": 1.5,
"load_before_drop": true,
- "clip_norm": null,
- "save_path": "{NER_PATH}/model",
- "load_path": "{NER_PATH}/model",
- "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
- "in_y": ["y_ind"],
- "out": ["y_pred_ind"]
+ "save_path": "{MODEL_PATH}/model",
+ "load_path": "{MODEL_PATH}/model",
+ "in": [
+ "x_subword_tok_ids",
+ "attention_mask",
+ "startofword_markers"
+ ],
+ "in_y": [
+ "y_ind"
+ ],
+ "out": [
+ "y_pred_ind"
+ ]
},
{
"ref": "tag_vocab",
- "in": ["y_pred_ind"],
- "out": ["y_pred"]
+ "in": [
+ "y_pred_ind"
+ ],
+ "out": [
+ "y_pred"
+ ]
}
],
- "out": ["x_tokens", "y_pred"]
+ "out": [
+ "x_tokens",
+ "y_pred"
+ ]
},
"train": {
"epochs": 30,
- "batch_size": 16,
+ "batch_size": 10,
"metrics": [
{
"name": "ner_f1",
- "inputs": ["y", "y_pred"]
+ "inputs": [
+ "y",
+ "y_pred"
+ ]
},
{
"name": "ner_token_f1",
- "inputs": ["y", "y_pred"]
+ "inputs": [
+ "y",
+ "y_pred"
+ ]
}
],
"validation_patience": 100,
"val_every_n_batches": 20,
-
"log_every_n_batches": 20,
- "tensorboard_log_dir": "{NER_PATH}/logs",
"show_examples": false,
"pytest_max_batches": 2,
"pytest_batch_size": 8,
- "evaluation_targets": ["valid", "test"],
- "class_name": "nn_trainer"
+ "evaluation_targets": [
+ "valid",
+ "test"
+ ],
+ "class_name": "torch_trainer"
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
- "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1",
- "NER_PATH": "{MODELS_PATH}/ner_rus_bert"
+ "TRANSFORMER": "DeepPavlov/rubert-base-cased",
+ "MODEL_PATH": "{MODELS_PATH}/ner_rus_bert_torch"
},
"download": [
{
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_rus_bert_v1.tar.gz",
+ "url": "http://files.deeppavlov.ai/0.16/ner/ner_rus_bert_torch.tar.gz",
"subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v1.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
}
]
}
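
After this change ner_rus_bert.json runs on PyTorch (torch_transformers_* components, torch_trainer) and pulls DeepPavlov/rubert-base-cased from the Hugging Face hub instead of a bundled TensorFlow checkpoint. Usage through the public API is unchanged; download=True fetches the archive listed in "metadata.download":

from deeppavlov import build_model

ner = build_model("ner_rus_bert", download=True)
tokens, tags = ner(["Илон Маск живёт в Сиэтле."])  # returns ["x_tokens", "y_pred"]
print(list(zip(tokens[0], tags[0])))
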
diff --git a/deeppavlov/configs/ner/ner_rus_bert_torch.json b/deeppavlov/configs/ner/ner_rus_bert_torch.json
deleted file mode 100644
index 8a4c51ff5f..0000000000
--- a/deeppavlov/configs/ner/ner_rus_bert_torch.json
+++ /dev/null
@@ -1,155 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "conll2003_reader",
- "data_path": "{DOWNLOADS_PATH}/total_rus/",
- "dataset_name": "collection_rus",
- "provide_pos": false
- },
- "dataset_iterator": {
- "class_name": "data_learning_iterator"
- },
- "chainer": {
- "in": [
- "x"
- ],
- "in_y": [
- "y"
- ],
- "pipe": [
- {
- "class_name": "torch_transformers_ner_preprocessor",
- "vocab_file": "{TRANSFORMER}",
- "do_lower_case": false,
- "max_seq_length": 512,
- "max_subword_length": 15,
- "token_masking_prob": 0.0,
- "in": [
- "x"
- ],
- "out": [
- "x_tokens",
- "x_subword_tokens",
- "x_subword_tok_ids",
- "startofword_markers",
- "attention_mask"
- ]
- },
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "unk_token": [
- "O"
- ],
- "pad_with_zeros": true,
- "save_path": "{MODEL_PATH}/tag.dict",
- "load_path": "{MODEL_PATH}/tag.dict",
- "fit_on": [
- "y"
- ],
- "in": [
- "y"
- ],
- "out": [
- "y_ind"
- ]
- },
- {
- "class_name": "torch_transformers_sequence_tagger",
- "n_tags": "#tag_vocab.len",
- "pretrained_bert": "{TRANSFORMER}",
- "attention_probs_keep_prob": 0.5,
- "return_probas": false,
- "encoder_layer_ids": [
- -1
- ],
- "optimizer": "AdamW",
- "optimizer_parameters": {
- "lr": 2e-05,
- "weight_decay": 1e-06,
- "betas": [
- 0.9,
- 0.999
- ],
- "eps": 1e-06
- },
- "clip_norm": 1.0,
- "min_learning_rate": 1e-07,
- "learning_rate_drop_patience": 30,
- "learning_rate_drop_div": 1.5,
- "load_before_drop": true,
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "in": [
- "x_subword_tok_ids",
- "attention_mask",
- "startofword_markers"
- ],
- "in_y": [
- "y_ind"
- ],
- "out": [
- "y_pred_ind"
- ]
- },
- {
- "ref": "tag_vocab",
- "in": [
- "y_pred_ind"
- ],
- "out": [
- "y_pred"
- ]
- }
- ],
- "out": [
- "x_tokens",
- "y_pred"
- ]
- },
- "train": {
- "epochs": 30,
- "batch_size": 10,
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": [
- "y",
- "y_pred"
- ]
- },
- {
- "name": "ner_token_f1",
- "inputs": [
- "y",
- "y_pred"
- ]
- }
- ],
- "validation_patience": 100,
- "val_every_n_batches": 20,
- "log_every_n_batches": 20,
- "show_examples": false,
- "pytest_max_batches": 2,
- "pytest_batch_size": 8,
- "evaluation_targets": [
- "valid",
- "test"
- ],
- "class_name": "torch_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "TRANSFORMER": "DeepPavlov/rubert-base-cased",
- "MODEL_PATH": "{MODELS_PATH}/ner_rus_bert_torch"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/0.16/ner/ner_rus_bert_torch.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/ner/slotfill_dstc2.json b/deeppavlov/configs/ner/slotfill_dstc2.json
deleted file mode 100644
index e1df2f26fd..0000000000
--- a/deeppavlov/configs/ner/slotfill_dstc2.json
+++ /dev/null
@@ -1,64 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DATA_PATH}"
- },
- "dataset_iterator": {
- "class_name": "dstc2_ner_iterator",
- "slot_values_path": "{SLOT_VALS_PATH}"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x_tokens"]
- },
- {
- "in": ["x_tokens"],
- "config_path": "{NER_CONFIG_PATH}",
- "out": ["x_tokens", "tags"]
- },
-
- {
- "in": ["x_tokens", "tags"],
- "class_name": "dstc_slotfilling",
- "threshold": 0.8,
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "out": ["slots"]
- }
- ],
- "out": ["slots"]
- },
- "train": {
- "metrics": ["slots_accuracy"],
- "class_name": "fit_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "NER_CONFIG_PATH": "{DEEPPAVLOV_PATH}/configs/ner/ner_dstc2.json",
- "DATA_PATH": "{ROOT_PATH}/downloads/dstc2",
- "SLOT_VALS_PATH": "{DATA_PATH}/dstc_slot_vals.json",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/slotfill_dstc2"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/dstc_slot_vals.tar.gz",
- "subdir": "{DATA_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/slotfill_dstc2.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
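
slotfill_dstc2.json illustrates config nesting: its second pipe component has no "class_name", only a "config_path", so the referenced NER config is built as a nested pipeline and used like any other component. Before this removal the outer config could be run as usual (sketch of the 0.17.x behavior; both configs are gone after this change, and the printed slots are illustrative):

from deeppavlov import build_model

slotfiller = build_model("slotfill_dstc2", download=True)
print(slotfiller(["i want some cheap food in the north part of town"]))
# e.g. [{'pricerange': 'cheap', 'area': 'north'}]  (illustrative output)
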
diff --git a/deeppavlov/configs/ner/slotfill_dstc2_raw.json b/deeppavlov/configs/ner/slotfill_dstc2_raw.json
deleted file mode 100644
index 9138d99c01..0000000000
--- a/deeppavlov/configs/ner/slotfill_dstc2_raw.json
+++ /dev/null
@@ -1,54 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "dstc2_reader",
- "data_path": "{DATA_PATH}"
- },
- "dataset_iterator": {
- "class_name": "dstc2_ner_iterator",
- "slot_values_path": "{SLOT_VALS_PATH}"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x_tokens"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "str_lower",
- "out": ["x_lower"]
- },
- {
- "in": ["x_lower"],
- "class_name": "slotfill_raw",
- "save_path": "{SLOT_VALS_PATH}",
- "load_path": "{SLOT_VALS_PATH}",
- "out": ["slots"]
- }
- ],
- "out": ["slots"]
- },
- "train": {
- "metrics": ["slots_accuracy"],
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DATA_PATH": "{ROOT_PATH}/downloads/dstc2",
- "SLOT_VALS_PATH": "{DATA_PATH}/dstc_slot_vals.json"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/dstc_slot_vals.tar.gz",
- "subdir": "{DATA_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/ner/slotfill_simple_dstc2_raw.json b/deeppavlov/configs/ner/slotfill_simple_dstc2_raw.json
deleted file mode 100644
index d6f9750e34..0000000000
--- a/deeppavlov/configs/ner/slotfill_simple_dstc2_raw.json
+++ /dev/null
@@ -1,54 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "simple_dstc2_reader",
- "data_path": "{DATA_PATH}"
- },
- "dataset_iterator": {
- "class_name": "dstc2_ner_iterator",
- "slot_values_path": "{SLOT_VALS_PATH}"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x_tokens"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "str_lower",
- "out": ["x_lower"]
- },
- {
- "in": ["x_lower"],
- "class_name": "slotfill_raw",
- "save_path": "{SLOT_VALS_PATH}",
- "load_path": "{SLOT_VALS_PATH}",
- "out": ["slots"]
- }
- ],
- "out": ["slots"]
- },
- "train": {
- "metrics": ["slots_accuracy"],
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DATA_PATH": "{ROOT_PATH}/downloads/simple-dstc2",
- "SLOT_VALS_PATH": "{DATA_PATH}/dstc_slot_vals.json"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/dstc_slot_vals.tar.gz",
- "subdir": "{DATA_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/ner/slotfill_simple_rasa_raw.json b/deeppavlov/configs/ner/slotfill_simple_rasa_raw.json
deleted file mode 100644
index 1365ebe7f4..0000000000
--- a/deeppavlov/configs/ner/slotfill_simple_rasa_raw.json
+++ /dev/null
@@ -1,43 +0,0 @@
-{
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x_tokens"]
- },
- {
- "in": ["x_tokens"],
- "class_name": "str_lower",
- "out": ["x_lower"]
- },
- {
- "in": ["x_lower"],
- "class_name": "slotfill_raw_rasa",
- "save_path": "{DATA_PATH}",
- "load_path": "{DATA_PATH}",
- "out": ["slots"]
- }
- ],
- "out": ["slots"]
- },
- "train": {
- "metrics": [],
- "evaluation_targets": []
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "DATA_PATH": "{DOWNLOADS_PATH}/rasa_configs_reader"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/dp_minimal_rasa_demo.tar.gz",
- "subdir": "{DATA_PATH}"
- }
- ]
- }
-}
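
The slotfill_*_raw configs above need no trained model at all: "slotfill_raw" (and its RASA variant) fills slots by fuzzy-matching lowercased tokens against a slot-to-values dictionary such as dstc_slot_vals.json. A simplified exact-match version of the idea (the real component tolerates small spelling differences):

slot_vals = {"food": ["chinese", "thai"], "pricerange": ["cheap", "expensive"]}

def fill_slots(tokens):
    slots = {}
    for slot, values in slot_vals.items():
        for value in values:
            if value in tokens:       # the library matches fuzzily, not exactly
                slots[slot] = value
    return slots

print(fill_slots("i want some cheap thai food".split()))
# -> {'food': 'thai', 'pricerange': 'cheap'}
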
diff --git a/deeppavlov/configs/ner/vlsp2016_full.json b/deeppavlov/configs/ner/vlsp2016_full.json
deleted file mode 100644
index ec8d10ffbe..0000000000
--- a/deeppavlov/configs/ner/vlsp2016_full.json
+++ /dev/null
@@ -1,170 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "conll2003_reader",
- "data_path": "{DOWNLOADS_PATH}/vlsp2016/",
- "dataset_name": "vlsp2016",
- "provide_pos": true,
- "provide_chunk": true,
- "iobes": true
- },
- "dataset_iterator": {
- "class_name": "data_learning_iterator"
- },
- "chainer": {
- "in": ["x", "pos", "chunk"],
- "in_y": ["y"],
- "pipe": [
- {
- "in": ["x"],
- "out": ["x_tokens"],
- "class_name": "lazy_tokenizer"
- },
- {
- "in": ["pos"],
- "out": ["pos_tokens"],
- "class_name": "lazy_tokenizer"
- },
- {
- "in": ["chunk"],
- "out": ["chunk_tokens"],
- "class_name": "lazy_tokenizer"
- },
- {
- "in": ["x_tokens"],
- "out": ["x_lower", "sent_lengths"],
- "class_name": "ner_preprocessor",
- "id": "ner_preprocessor",
- "get_x_padded_for_elmo": false,
- "get_x_cap_padded": false
- },
- {
- "in": ["x_lower"],
- "out": ["x_tok_ind"],
- "fit_on": ["x_lower"],
- "class_name": "ner_vocab",
- "id": "word_vocab",
- "save_path": "{MODELS_PATH}/word.dict",
- "load_path": "{MODELS_PATH}/word.dict"
- },
- {
- "in": ["pos_tokens"],
- "out": ["pos_ind"],
- "fit_on": ["pos_tokens"],
- "class_name": "ner_vocab",
- "id": "pos_vocab",
- "save_path": "{MODELS_PATH}/pos.dict",
- "load_path": "{MODELS_PATH}/pos.dict"
- },
- {
- "in": ["chunk_tokens"],
- "out": ["chunk_ind"],
- "fit_on": ["chunk_tokens"],
- "class_name": "ner_vocab",
- "id": "chunk_vocab",
- "save_path": "{MODELS_PATH}/chunk.dict",
- "load_path": "{MODELS_PATH}/chunk.dict"
- },
- {
- "in": ["y"],
- "out": ["y_ind"],
- "fit_on": ["y"],
- "class_name": "ner_vocab",
- "id": "tag_vocab",
- "save_path": "{MODELS_PATH}/tag.dict",
- "load_path": "{MODELS_PATH}/tag.dict"
- },
- {
- "in": ["x_tokens"],
- "out": ["x_char"],
- "class_name": "char_splitter"
- },
- {
- "in": ["x_tokens"],
- "out": ["x_char_ind"],
- "fit_on": ["x_tokens"],
- "class_name": "ner_vocab",
- "char_level": true,
- "id": "char_vocab",
- "save_path": "{MODELS_PATH}/char.dict",
- "load_path": "{MODELS_PATH}/char.dict"
- },
- {
- "in":[
- "sent_lengths",
- "x_tok_ind",
- "pos_ind",
- "chunk_ind",
- "x_char_ind"
- ],
- "in_y": ["y_ind"],
- "out": ["y_predicted"],
- "class_name": "hybrid_ner_model",
- "n_tags": "#tag_vocab.len",
- "word_emb_path": "{DOWNLOADS_PATH}/embeddings/baomoi.bin",
- "word_emb_name": "baomoi",
- "word_dim": 300,
- "word_vocab": "#word_vocab",
- "char_vocab_size": "#char_vocab.len",
- "pos_vocab_size": "#pos_vocab.len",
- "chunk_vocab_size": "#chunk_vocab.len",
- "pos_dim": 40,
- "chunk_dim": 40,
- "char_dim": 100,
- "lstm_hidden_size": 256,
- "save_path": "{MODELS_PATH}/vlsp2016_full",
- "load_path": "{MODELS_PATH}/vlsp2016_full",
- "learning_rate": 1e-3,
- "learning_rate_drop_patience": 5,
- "learning_rate_drop_div": 10,
- "dropout_keep_prob": 0.7
- },
- {
- "in": ["y_predicted"],
- "out": ["tags"],
- "class_name": "convert_ids2tags",
- "id2tag": "#tag_vocab.i2t"
- }
- ],
- "out": ["x_tokens", "tags"]
- },
- "train": {
- "epochs": 100,
- "batch_size": 64,
- "metrics": [
- {
- "name": "ner_f1",
- "inputs": ["y", "tags"]
- },
- {
- "name": "ner_token_f1",
- "inputs": ["y", "tags"]
- }
- ],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_epochs": 1,
- "show_examples": false,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models/vlsp2016_full"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_vlsp2016_full.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/baomoi.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
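
vlsp2016_full.json asks its reader for IOBES tags ("iobes": true), while the few-shot configs converted predictions with ner_bio_converter. For reference, the standard BIO-to-IOBES conversion these options refer to (a self-contained sketch, not the library code):

def bio_to_iobes(tags):
    iobes = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else "O"
        continues = tag != "O" and nxt == "I-" + tag[2:]  # same entity continues?
        if tag == "O":
            iobes.append("O")
        elif tag.startswith("B-"):
            iobes.append(("B-" if continues else "S-") + tag[2:])
        else:  # "I-"
            iobes.append(("I-" if continues else "E-") + tag[2:])
    return iobes

print(bio_to_iobes(["B-PER", "I-PER", "O", "B-LOC"]))
# -> ['B-PER', 'E-PER', 'O', 'S-LOC']
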
diff --git a/deeppavlov/configs/odqa/en_odqa_infer_enwiki20161221.json b/deeppavlov/configs/odqa/en_odqa_infer_enwiki20161221.json
deleted file mode 100644
index 7b011f13d4..0000000000
--- a/deeppavlov/configs/odqa/en_odqa_infer_enwiki20161221.json
+++ /dev/null
@@ -1,69 +0,0 @@
-{
- "chainer": {
- "in": [
- "question_raw"
- ],
- "out": [
- "best_answer"
- ],
- "pipe": [
- {
- "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_tfidf_enwiki20161221.json",
- "in": [
- "question_raw"
- ],
- "out": [
- "tfidf_doc_ids"
- ]
- },
- {
- "class_name": "wiki_sqlite_vocab",
- "in": [
- "tfidf_doc_ids"
- ],
- "out": [
- "tfidf_doc_text"
- ],
- "join_docs": false,
- "shuffle": false,
- "load_path": "{DOWNLOADS_PATH}/odqa/enwiki20161221.db"
- },
- {
- "class_name": "document_chunker",
- "in": ["tfidf_doc_text"],
- "out": ["chunks"],
- "flatten_result": true,
- "paragraphs": true
- },
- {
- "class_name": "string_multiplier",
- "in": ["question_raw", "chunks"],
- "out":["questions"]
- },
- {
- "class_name": "logit_ranker",
- "batch_size": 10,
- "squad_model": {"config_path": "{CONFIGS_PATH}/squad/multi_squad_noans_infer.json"},
- "sort_noans": true,
- "in": [
- "chunks",
- "questions"
- ],
- "out": [
- "best_answer",
- "best_answer_score"
- ]
- }
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
- },
- "download": [
- ]
- }
-}
\ No newline at end of file
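
The removed ODQA config composed four stages: TF-IDF document retrieval, SQLite document lookup, paragraph chunking, and a reader-based answer ranker. A schematic of that data flow with trivial stand-ins for each component (all function bodies are stubs and all names here are illustrative, not library API):

def tfidf_retrieve(question):            # en_ranker_tfidf_enwiki20161221.json
    return [0, 1]

def fetch_docs(doc_ids):                 # wiki_sqlite_vocab over enwiki20161221.db
    wiki = {0: "First article text.", 1: "Second article text."}
    return [wiki[i] for i in doc_ids]

def chunk(docs):                         # document_chunker, flatten_result=true
    return [p for doc in docs for p in doc.split(".") if p.strip()]

def rank(chunks, questions):             # logit_ranker + multi_squad_noans_infer
    return max(chunks, key=len)          # stub: the real ranker scores SQuAD answers

def odqa(question):
    chunks = chunk(fetch_docs(tfidf_retrieve(question)))
    questions = [question] * len(chunks) # string_multiplier step
    return rank(chunks, questions)

print(odqa("What is the first article about?"))
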
diff --git a/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json b/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json
index f793916514..bf65d82229 100644
--- a/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json
+++ b/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json
@@ -53,8 +53,8 @@
],
"class_name": "sklearn_component",
"main": true,
- "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v3.pkl",
- "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v3.pkl",
+ "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v4.pkl",
+ "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v4.pkl",
"model_class": "sklearn.linear_model:LogisticRegression",
"infer_method": "predict",
"C": {
@@ -94,7 +94,7 @@
},
"download": [
{
- "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v3.pkl",
+ "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v4.pkl",
"subdir": "{MODELS_PATH}/faq"
},
{
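
The tfidf_logreg_autofaq_psearch.json change above only bumps the classifier artifact from v3 to v4; the config itself performs a parameter search over LogisticRegression's C. The equivalent search in plain scikit-learn, on synthetic data for illustration:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)
search = GridSearchCV(LogisticRegression(max_iter=1000), {"C": [0.1, 1, 10]}, cv=3)
search.fit(X, y)
print(search.best_params_)  # the best C, analogous to the psearch result
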
diff --git a/deeppavlov/configs/ranking/paraphrase_ident_paraphraser.json b/deeppavlov/configs/ranking/paraphrase_ident_paraphraser.json
deleted file mode 100644
index d9cb4c5ec6..0000000000
--- a/deeppavlov/configs/ranking/paraphrase_ident_paraphraser.json
+++ /dev/null
@@ -1,108 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "paraphraser_reader",
- "data_path": "{DOWNLOADS_PATH}/paraphraser_data"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "id": "preproc",
- "class_name": "siamese_preprocessor",
- "use_matrix": false,
- "max_sequence_length": 28,
- "fit_on": ["x"],
- "in": ["x"],
- "out": ["x_proc"],
- "sent_vocab": {
- "id": "siam_sent_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/paraphraser_vocabs/sent.dict",
- "load_path": "{MODELS_PATH}/paraphraser_vocabs/sent.dict"
- },
- "tokenizer": {
- "class_name": "nltk_tokenizer"
- },
- "vocab": {
- "id": "siam_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/paraphraser_vocabs/tok.dict",
- "load_path": "{MODELS_PATH}/paraphraser_vocabs/tok.dict"
- },
- "embedder": {
- "id": "siam_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_wiki_lenta_lower_case.bin"
- }
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#siam_embedder",
- "vocab": "#siam_vocab"
- },
- {
- "in": ["x_proc"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "mpm_nn",
- "len_vocab": "#siam_vocab.len",
- "use_matrix": "#preproc.use_matrix",
- "attention": true,
- "max_sequence_length": "#preproc.max_sequence_length",
- "emb_matrix": "#embeddings.emb_mat",
- "embedding_dim": "#siam_embedder.dim",
- "seed": 243,
- "hidden_dim": 200,
- "learning_rate": 1e-3,
- "triplet_loss": false,
- "batch_size": 256,
- "save_path": "{MODELS_PATH}/paraphraser_model/model_weights.h5",
- "load_path": "{MODELS_PATH}/paraphraser_model/model_weights.h5",
- "preprocess": "#preproc.__call__"
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "epochs": 200,
- "batch_size": 256,
- "pytest_max_batches": 2,
- "train_metrics": ["f1", "acc", "log_loss"],
- "metrics": ["f1", "acc", "log_loss"],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_batches": 24,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
- "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
- "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lower_case/ft_native_300_ru_wiki_lenta_lower_case.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ranking/paraphrase_ident_paraphraser_interact.json b/deeppavlov/configs/ranking/paraphrase_ident_paraphraser_interact.json
deleted file mode 100644
index 4c6e3fa28b..0000000000
--- a/deeppavlov/configs/ranking/paraphrase_ident_paraphraser_interact.json
+++ /dev/null
@@ -1,121 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "paraphraser_reader",
- "data_path": "{DOWNLOADS_PATH}/paraphraser_data"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "id": "preproc",
- "class_name": "siamese_preprocessor",
- "use_matrix": false,
- "max_sequence_length": 28,
- "fit_on": ["x"],
- "in": ["x"],
- "out": ["x_proc"],
- "sent_vocab": {
- "id": "siam_sent_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/paraphraser_vocabs/sent.dict",
- "load_path": "{MODELS_PATH}/paraphraser_vocabs/sent.dict"
- },
- "tokenizer": {
- "class_name": "nltk_tokenizer"
- },
- "vocab": {
- "id": "siam_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/paraphraser_vocabs/tok.dict",
- "load_path": "{MODELS_PATH}/paraphraser_vocabs/tok.dict"
- },
- "embedder": {
- "id": "siam_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/ft_native_300_ru_wiki_lenta_lower_case.bin"
- }
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#siam_embedder",
- "vocab": "#siam_vocab"
- },
- {
- "id": "model",
- "class_name": "mpm_nn",
- "len_vocab": "#siam_vocab.len",
- "use_matrix": "#preproc.use_matrix",
- "attention": true,
- "max_sequence_length": "#preproc.max_sequence_length",
- "emb_matrix": "#embeddings.emb_mat",
- "embedding_dim": "#siam_embedder.dim",
- "seed": 243,
- "hidden_dim": 200,
- "learning_rate": 1e-3,
- "triplet_loss": false,
- "batch_size": 256,
- "save_path": "{MODELS_PATH}/paraphraser_model/model_weights.h5",
- "load_path": "{MODELS_PATH}/paraphraser_model/model_weights.h5",
- "preprocess": "#preproc.__call__"
- },
- {
- "in": ["x_proc"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "siamese_predictor",
- "model": "#model",
- "ranking": false,
- "attention": true,
- "batch_size": "#model.batch_size",
- "preproc_func": "#preproc.__call__"
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "epochs": 200,
- "batch_size": 256,
- "pytest_max_batches": 2,
- "train_metrics": ["f1", "acc", "log_loss"],
- "metrics": ["f1", "acc", "log_loss"],
- "validation_patience": 10,
- "val_every_n_epochs": 5,
- "log_every_n_batches": 12,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/paraphrase_ident_paraphraser.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
- "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
- "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lower_case/ft_native_300_ru_wiki_lenta_lower_case.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ranking/ranking_default.json b/deeppavlov/configs/ranking/ranking_default.json
deleted file mode 100644
index 8d3ac4f15f..0000000000
--- a/deeppavlov/configs/ranking/ranking_default.json
+++ /dev/null
@@ -1,106 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "siamese_reader",
- "data_path": "{DOWNLOADS_PATH}/default_ranking_data"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "id": "preproc",
- "class_name": "siamese_preprocessor",
- "use_matrix": false,
- "num_ranking_samples": 10,
- "max_sequence_length": 50,
- "fit_on": ["x"],
- "in": ["x"],
- "out": ["x_proc"],
- "sent_vocab": {
- "id": "siam_sent_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/default_ranking_vocabs/sent.dict",
- "load_path": "{MODELS_PATH}/default_ranking_vocabs/sent.dict"
- },
- "tokenizer": {
- "class_name": "split_tokenizer"
- },
- "vocab": {
- "id": "siam_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/default_ranking_vocabs/tok.dict",
- "load_path": "{MODELS_PATH}/default_ranking_vocabs/tok.dict"
- },
- "embedder": {
- "id": "siam_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.ru.bin"
- }
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#siam_embedder",
- "vocab": "#siam_vocab"
- },
- {
- "in": ["x_proc"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "bilstm_nn",
- "len_vocab": "#siam_vocab.len",
- "use_matrix": "#preproc.use_matrix",
- "max_sequence_length": "#preproc.max_sequence_length",
- "emb_matrix": "#embeddings.emb_mat",
- "embedding_dim": "#siam_embedder.dim",
- "seed": 243,
- "reccurent": "bilstm",
- "max_pooling": true,
- "shared_weights": true,
- "hidden_dim": 100,
- "learning_rate": 1e-3,
- "batch_size": 256,
- "save_path": "{MODELS_PATH}/default_ranking_model/model_weights.h5",
- "load_path": "{MODELS_PATH}/default_ranking_model/model_weights.h5",
- "preprocess": "#preproc.__call__",
- "interact_pred_num": 3
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "epochs": 10,
- "batch_size": 256,
- "pytest_max_batches": 2,
- "train_metrics": ["f1", "acc"],
- "metrics": ["r@1", "r@2", "r@5", "rank_response"],
- "validation_patience": 3,
- "val_every_n_epochs": 1,
- "log_every_n_batches": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/default_ranking_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/default_ranking_data"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.ru.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ranking/ranking_default_triplet.json b/deeppavlov/configs/ranking/ranking_default_triplet.json
deleted file mode 100644
index fdfeb74621..0000000000
--- a/deeppavlov/configs/ranking/ranking_default_triplet.json
+++ /dev/null
@@ -1,108 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "siamese_reader",
- "data_path": "{DOWNLOADS_PATH}/default_ranking_data_triplet"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator"
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "id": "preproc",
- "class_name": "siamese_preprocessor",
- "use_matrix": false,
- "num_ranking_samples": 10,
- "max_sequence_length": 50,
- "fit_on": ["x"],
- "in": ["x"],
- "out": ["x_proc"],
- "sent_vocab": {
- "id": "siam_sent_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/default_ranking_vocabs/sent.dict",
- "load_path": "{MODELS_PATH}/default_ranking_vocabs/sent.dict"
- },
- "tokenizer": {
- "class_name": "split_tokenizer"
- },
- "vocab": {
- "id": "siam_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/default_ranking_vocabs/tok.dict",
- "load_path": "{MODELS_PATH}/default_ranking_vocabs/tok.dict"
- },
- "embedder": {
- "id": "siam_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.ru.bin"
- }
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#siam_embedder",
- "vocab": "#siam_vocab"
- },
- {
- "in": ["x_proc"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "bilstm_nn",
- "len_vocab": "#siam_vocab.len",
- "use_matrix": "#preproc.use_matrix",
- "max_sequence_length": "#preproc.max_sequence_length",
- "emb_matrix": "#embeddings.emb_mat",
- "embedding_dim": "#siam_embedder.dim",
- "seed": 243,
- "reccurent": "bilstm",
- "max_pooling": true,
- "shared_weights": true,
- "hidden_dim": 100,
- "triplet_loss": true,
- "hard_triplets": false,
- "learning_rate": 1e-3,
- "batch_size": 256,
- "save_path": "{MODELS_PATH}/default_ranking_model/model_weights.h5",
- "load_path": "{MODELS_PATH}/default_ranking_model/model_weights.h5",
- "preprocess": "#preproc.__call__",
- "interact_pred_num": 3
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "epochs": 10,
- "batch_size": 256,
- "pytest_max_batches": 2,
- "train_metrics": ["f1", "acc"],
- "metrics": ["r@1", "r@2", "r@5", "rank_response"],
- "validation_patience": 3,
- "val_every_n_epochs": 1,
- "log_every_n_batches": 1,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/default_ranking_data_triplet.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/default_ranking_data_triplet"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.ru.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
\ No newline at end of file
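
ranking_default_triplet.json differs from ranking_default.json mainly in "triplet_loss": true (with "hard_triplets": false), which switches the siamese network to a margin-based objective. The standard triplet loss that flag refers to, sketched in NumPy:

import numpy as np

def triplet_loss(anchor, positive, negative, margin=0.1):
    d_pos = np.linalg.norm(anchor - positive)   # distance to the matching response
    d_neg = np.linalg.norm(anchor - negative)   # distance to a random response
    return max(0.0, d_pos - d_neg + margin)     # penalize when positive isn't closer

a, p, n = np.zeros(4), np.full(4, 0.1), np.ones(4)
print(triplet_loss(a, p, n))  # -> 0.0, since the positive is well separated
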
diff --git a/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_sep.json b/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_sep.json
deleted file mode 100644
index 200ac499bd..0000000000
--- a/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_sep.json
+++ /dev/null
@@ -1,72 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "ubuntu_v2_reader",
- "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data",
- "positive_samples": true
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "bert_sep_ranker_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": true,
- "max_seq_length": 128,
- "in": ["x"],
- "out": ["bert_features"]
- },
- {
- "class_name": "bert_sep_ranker",
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "learning_rate": 2e-05,
- "in": ["bert_features"],
- "in_y": ["y"],
- "out": ["predictions"]
- }
- ],
- "out": ["predictions"]
- },
- "train": {
- "batch_size": 16,
- "pytest_max_batches": 2,
- "train_metrics": [],
- "metrics": ["r@1", "r@2", "r@5"],
- "validation_patience": 1,
- "val_every_n_batches": -1,
- "val_every_n_epochs": 1,
- "log_every_n_batches": -1,
- "validate_best": true,
- "test_best": true,
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_bert_sep_model"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_bert_sep_model.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_sep_interact.json b/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_sep_interact.json
deleted file mode 100644
index 8884dcfc24..0000000000
--- a/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_sep_interact.json
+++ /dev/null
@@ -1,91 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "ubuntu_v2_reader",
- "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data",
- "positive_samples": true
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "response_base_loader",
- "id": "loader",
- "save_path": "{MODEL_PATH}",
- "load_path": "{MODEL_PATH}"
- },
- {
- "class_name": "bert_sep_ranker_predictor_preprocessor",
- "id": "preproc",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": true,
- "max_seq_length": 128,
- "resps": "#loader.resps",
- "resp_vecs": "#loader.resp_vecs",
- "conts": "#loader.conts",
- "cont_vecs": "#loader.cont_vecs",
- "in": ["x"],
- "out": ["bert_features"]
- },
- {
- "class_name": "bert_sep_ranker_predictor",
- "resps": "#loader.resps",
- "resp_vecs": "#loader.resp_vecs",
- "resp_features": "#preproc.resp_features",
- "conts": "#loader.conts",
- "cont_vecs": "#loader.cont_vecs",
- "cont_features": "#preproc.cont_features",
- "interact_mode": 3,
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODEL_PATH}",
- "load_path": "{MODEL_PATH}/model",
- "learning_rate": 2e-05,
- "in": ["bert_features"],
- "in_y": ["y"],
- "out": ["predictions"]
- }
- ],
- "out": ["predictions"]
- },
- "train": {
- "batch_size": 16,
- "pytest_max_batches": 2,
- "train_metrics": [],
- "metrics": ["r@1", "r@2", "r@5"],
- "validation_patience": 1,
- "val_every_n_batches": -1,
- "val_every_n_epochs": 1,
- "log_every_n_batches": -1,
- "validate_best": true,
- "test_best": true,
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_bert_sep_predictor_model"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_bert_sep_predictor_model.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
-
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_uncased.json b/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_uncased.json
deleted file mode 100644
index 266fe02630..0000000000
--- a/deeppavlov/configs/ranking/ranking_ubuntu_v2_bert_uncased.json
+++ /dev/null
@@ -1,72 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "ubuntu_v2_reader",
- "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "bert_ranker_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": true,
- "max_seq_length": 128,
- "in": ["x"],
- "out": ["bert_features"]
- },
- {
- "class_name": "bert_ranker",
- "one_hot_labels": false,
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "learning_rate": 2e-05,
- "in": ["bert_features"],
- "in_y": ["y"],
- "out": ["predictions"]
- }
- ],
- "out": ["predictions"]
- },
- "train": {
- "batch_size": 32,
- "pytest_max_batches": 2,
- "train_metrics": [],
- "metrics": ["r@1", "r@2", "r@5"],
- "validation_patience": 1,
- "val_every_n_batches": -1,
- "val_every_n_epochs": 1,
- "log_every_n_batches": -1,
- "validate_best": true,
- "test_best": true,
- "tensorboard_log_dir": "{MODEL_PATH}/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_bert_model"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_bert_model.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
\ No newline at end of file
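
All the ranking configs above report Recall@k metrics ("r@1", "r@2", "r@5"): the share of examples whose true response appears among the top-k scored candidates. A minimal reference implementation for a single example:

import numpy as np

def recall_at_k(scores, correct_idx, k):
    top_k = np.argsort(scores)[::-1][:k]   # indices of the k highest-scored candidates
    return int(correct_idx in top_k)       # averaged over examples to get r@k

scores = np.array([0.1, 0.7, 0.2, 0.9])   # model scores for 4 candidate responses
print(recall_at_k(scores, correct_idx=1, k=2))  # -> 1: candidate 1 is in the top-2
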
diff --git a/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt.json b/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt.json
deleted file mode 100644
index 499bd3d3dc..0000000000
--- a/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt.json
+++ /dev/null
@@ -1,107 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "ubuntu_v2_mt_reader",
- "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data",
- "num_context_turns": "{NUM_CONTEXT_TURNS}"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "id": "preproc",
- "class_name": "siamese_preprocessor",
- "use_matrix": true,
- "num_ranking_samples": 10,
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "max_sequence_length": 50,
- "fit_on": ["x"],
- "in": ["x"],
- "out": ["x_proc"],
- "sent_vocab": {
- "id": "siam_sent_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict",
- "load_path": "{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict"
- },
- "tokenizer": {
- "class_name": "nltk_tokenizer"
- },
- "vocab": {
- "id": "siam_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_vocabs/tok.dict",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_vocabs/tok.dict"
- },
- "embedder": {
- "id": "siam_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin"
- }
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#siam_embedder",
- "vocab": "#siam_vocab"
- },
- {
- "in": ["x_proc"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "bilstm_gru_nn",
- "use_matrix": "#preproc.use_matrix",
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "len_vocab": "#siam_vocab.len",
- "max_sequence_length": "#preproc.max_sequence_length",
- "embedding_dim": "#siam_embedder.dim",
- "emb_matrix": "#embeddings.emb_mat",
- "seed": 243,
- "hidden_dim": 300,
- "learning_rate": 1e-3,
- "triplet_loss": false,
- "batch_size": 256,
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_model/model_weights.h5",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_model/model_weights.h5"
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "epochs": 200,
- "batch_size": 256,
- "pytest_max_batches": 2,
- "train_metrics": [],
- "metrics": ["r@1", "rank_response"],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_batches": 1000,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "NUM_CONTEXT_TURNS": 10
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_interact.json b/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_interact.json
deleted file mode 100644
index 3ece2399dd..0000000000
--- a/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_interact.json
+++ /dev/null
@@ -1,121 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "ubuntu_v2_mt_reader",
- "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data",
- "num_context_turns": "{NUM_CONTEXT_TURNS}"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "id": "preproc",
- "class_name": "siamese_preprocessor",
- "use_matrix": true,
- "num_ranking_samples": 10,
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "max_sequence_length": 50,
- "fit_on": ["x"],
- "in": ["x"],
- "out": ["x_proc"],
- "sent_vocab": {
- "id": "siam_sent_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict",
- "load_path": "{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict"
- },
- "tokenizer": {
- "class_name": "nltk_tokenizer"
- },
- "vocab": {
- "id": "siam_vocab",
- "class_name": "simple_vocab",
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_vocabs/tok.dict",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_vocabs/tok.dict"
- },
- "embedder": {
- "id": "siam_embedder",
- "class_name": "fasttext",
- "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin"
- }
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#siam_embedder",
- "vocab": "#siam_vocab"
- },
- {
- "id": "model",
- "class_name": "bilstm_gru_nn",
- "use_matrix": "#preproc.use_matrix",
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "len_vocab": "#siam_vocab.len",
- "max_sequence_length": "#preproc.max_sequence_length",
- "embedding_dim": "#siam_embedder.dim",
- "emb_matrix": "#embeddings.emb_mat",
- "seed": 243,
- "hidden_dim": 300,
- "learning_rate": 1e-3,
- "triplet_loss": false,
- "batch_size": 256,
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_model/model_weights.h5",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_model/model_weights.h5"
- },
- {
- "in": ["x_proc"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "siamese_predictor",
- "model": "#model",
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "batch_size": "#model.batch_size",
- "responses": "#siam_sent_vocab",
- "preproc_func": "#preproc.__call__"
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "epochs": 200,
- "batch_size": 256,
- "pytest_max_batches": 2,
- "train_metrics": [],
- "metrics": ["r@1", "rank_response"],
- "validation_patience": 10,
- "val_every_n_epochs": 1,
- "log_every_n_batches": 1000,
- "class_name": "nn_trainer",
- "evaluation_targets": [
- "valid",
- "test"
- ]
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "NUM_CONTEXT_TURNS": 10
-
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_mt_ranking.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_word2vec_dam_transformer.json b/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_word2vec_dam_transformer.json
deleted file mode 100644
index fb7d8aa31f..0000000000
--- a/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_word2vec_dam_transformer.json
+++ /dev/null
@@ -1,134 +0,0 @@
-{
- "info": "The config is for training or evaluation of DAM_USE-T on Ubuntu Dialogue Corpus v2 using prepared Word2vec embeddings",
- "dataset_reader": {
- "class_name": "ubuntu_v2_mt_reader",
- "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data_clean",
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "padding": "pre"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "shuffle": true,
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "split_tokenizer",
- "id": "tok_1"
- },
- {
- "class_name": "simple_vocab",
- "special_tokens": ["", ""],
- "unk_token": "",
- "fit_on": ["x"],
- "id": "vocab_1",
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_dam_transformer/vocabs/int_tok.dict",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_dam_transformer/vocabs/int_tok.dict"
- },
- {
- "id": "word2vec_embedder",
- "class_name": "glove",
- "dim": 200,
- "load_path": "{DOWNLOADS_PATH}/embeddings/v2_ubuntu_w2v_vectors.txt"
- },
- {
- "id": "preproc",
- "class_name": "siamese_preprocessor",
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_dam_transformer/preproc/tok.dict",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_dam_transformer/preproc/tok.dict",
- "num_ranking_samples": 10,
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "max_sequence_length": 50,
- "embedding_dim": 200,
- "add_raw_text": true,
- "fit_on": ["x"],
- "in": ["x"],
- "out": ["x_proc"],
- "tokenizer": {
- "ref": "tok_1",
- "notes": "use defined tokenizer"
- },
- "vocab": {
- "ref": "vocab_1",
- "notes": "use vocab built for tokenized data"
- }
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#word2vec_embedder",
- "vocab": "#vocab_1"
- },
- {
- "in": ["x_proc"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "dam_nn_use_transformer",
- "stack_num": 5,
- "is_positional": true,
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "max_sequence_length": "#preproc.max_sequence_length",
- "embedding_dim": "#word2vec_embedder.dim",
- "emb_matrix": "#embeddings.emb_mat",
- "learning_rate": 1e-3,
- "batch_size": 100,
- "seed": 65,
- "decay_steps": 1000,
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_dam_transformer/model_dam/model",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_dam_transformer/model_dam/model"
- }
- ],
- "out": [
- "y_predicted"
- ]
- },
- "train": {
- "class_name": "nn_trainer",
- "epochs": 8,
- "batch_size": 100,
- "shuffle": true,
- "pytest_max_batches": 2,
- "train_metrics": [],
- "validate_best": true,
- "test_best": true,
- "metrics": [
- "r@1",
- "r@2",
- "r@5",
- "rank_response"
- ],
- "validation_patience": 1,
- "val_every_n_epochs": 1,
- "log_every_n_batches": 100,
- "evaluation_targets": [
- "valid",
- "test"
- ],
- "tensorboard_log_dir": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_dam_transformer/logs_dam/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "NUM_CONTEXT_TURNS": 10
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_mt_word2vec_dam_transformer.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data_clean.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data_clean"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/v2_ubuntu_w2v_vectors.txt.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_word2vec_smn.json b/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_word2vec_smn.json
deleted file mode 100644
index e6ef4cdd5e..0000000000
--- a/deeppavlov/configs/ranking/ranking_ubuntu_v2_mt_word2vec_smn.json
+++ /dev/null
@@ -1,127 +0,0 @@
-{
- "info": "The config is for training or evaluation of SMN on Ubuntu Dialogue Corpus v2 using prepared Word2vec embeddings",
- "dataset_reader": {
- "class_name": "ubuntu_v2_mt_reader",
- "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data_clean",
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "padding": "pre"
- },
- "dataset_iterator": {
- "class_name": "siamese_iterator",
- "shuffle": true,
- "seed": 243
- },
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "split_tokenizer",
- "id": "tok_1"
- },
- {
- "class_name": "simple_vocab",
- "special_tokens": ["", ""],
- "unk_token": "",
- "fit_on": ["x"],
- "id": "vocab_1",
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_smn/vocabs/int_tok.dict",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_smn/vocabs/int_tok.dict"
- },
- {
- "id": "word2vec_embedder",
- "class_name": "glove",
- "dim": 200,
- "load_path": "{DOWNLOADS_PATH}/embeddings/v2_ubuntu_w2v_vectors.txt"
- },
- {
- "id": "preproc",
- "class_name": "siamese_preprocessor",
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_smn/preproc/tok.dict",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_smn/preproc/tok.dict",
- "num_ranking_samples": 10,
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "max_sequence_length": 50,
- "embedding_dim": 200,
- "fit_on": ["x"],
- "in": ["x"],
- "out": ["x_proc"],
- "tokenizer": {
- "ref": "tok_1",
- "notes": "use defined tokenizer"
- },
- "vocab": {
- "ref": "vocab_1",
- "notes": "use vocab built for tokenized data"
- }
- },
- {
- "id": "embeddings",
- "class_name": "emb_mat_assembler",
- "embedder": "#word2vec_embedder",
- "vocab": "#vocab_1"
- },
- {
- "in": ["x_proc"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "smn_nn",
- "num_context_turns": "{NUM_CONTEXT_TURNS}",
- "max_sequence_length": "#preproc.max_sequence_length",
- "embedding_dim": "#word2vec_embedder.dim",
- "emb_matrix": "#embeddings.emb_mat",
- "learning_rate": 1e-3,
- "batch_size": 500,
- "seed": 65,
- "save_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_smn/model_smn/model",
- "load_path": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_smn/model_smn/model"
- }
- ],
- "out": [
- "y_predicted"
- ]
- },
- "train": {
- "class_name": "nn_trainer",
- "epochs": 8,
- "batch_size": 500,
- "pytest_max_batches": 2,
- "train_metrics": [],
- "metrics": [
- "r@1",
- "r@2",
- "r@5",
- "rank_response"
- ],
- "validation_patience": 3,
- "val_every_n_epochs": 1,
- "log_every_n_batches": 100,
- "evaluation_targets": [
- "valid",
- "test"
- ],
- "tensorboard_log_dir": "{MODELS_PATH}/ubuntu_v2_mt_word2vec_smn/logs_smn/"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "NUM_CONTEXT_TURNS": 10
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_mt_word2vec_smn.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data_clean.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data_clean"
- },
- {
- "url": "http://files.deeppavlov.ai/embeddings/v2_ubuntu_w2v_vectors.txt.tar.gz",
- "subdir": "{DOWNLOADS_PATH}/embeddings"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json
deleted file mode 100644
index d24b70d8e4..0000000000
--- a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "typos_kartaslov_reader",
- "data_path": "{DOWNLOADS_PATH}"
- },
- "dataset_iterator": {
- "class_name": "typos_iterator",
- "test_ratio": 0.02
- },
- "chainer":{
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "str_lower",
- "id": "lower",
- "in": ["x"],
- "out": ["x_lower"]
- },
- {
- "class_name": "nltk_moses_tokenizer",
- "id": "tokenizer",
- "in": ["x_lower"],
- "out": ["x_tokens"]
- },
- {
- "ref": "tokenizer",
- "in": ["y"],
- "out": ["y_tokens"]
- },
- {
- "fit_on": ["x_tokens", "y_tokens"],
- "in": ["x_tokens"],
- "out": ["tokens_candidates"],
- "class_name": "spelling_error_model",
- "window": 1,
- "candidates_count": 4,
- "dictionary": {
- "class_name": "russian_words_vocab",
- "data_dir": "{DOWNLOADS_PATH}/vocabs"
- },
- "save_path": "{MODELS_PATH}/error_model/error_model_ru.tsv",
- "load_path": "{MODELS_PATH}/error_model/error_model_ru.tsv"
- },
- {
- "class_name": "kenlm_elector",
- "in": ["tokens_candidates"],
- "out": ["y_predicted_tokens"],
- "load_path": "{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary"
- },
- {
- "ref": "tokenizer",
- "in": ["y_predicted_tokens"],
- "out": ["y_predicted"]
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "evaluation_targets": [
- "test"
- ],
- "class_name": "fit_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/error_model.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz",
- "subdir": "{DOWNLOADS_PATH}/language_models"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json
deleted file mode 100644
index 46694d2205..0000000000
--- a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json
+++ /dev/null
@@ -1,84 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "typos_kartaslov_reader",
- "data_path": "{DOWNLOADS_PATH}"
- },
- "dataset_iterator": {
- "class_name": "typos_iterator",
- "test_ratio": 0.02
- },
- "chainer":{
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "str_lower",
- "id": "lower",
- "in": ["x"],
- "out": ["x_lower"]
- },
- {
- "class_name": "nltk_moses_tokenizer",
- "id": "tokenizer",
- "in": ["x_lower"],
- "out": ["x_tokens"]
- },
- {
- "ref": "tokenizer",
- "in": ["y"],
- "out": ["y_tokens"]
- },
- {
- "fit_on": ["x_tokens", "y_tokens"],
- "in": ["x_tokens"],
- "out": ["tokens_candidates"],
- "class_name": "spelling_error_model",
- "window": 1,
- "candidates_count": 4,
- "dictionary": {
- "class_name": "static_dictionary",
- "dictionary_name": "compreno_words",
- "data_dir": "{DOWNLOADS_PATH}/vocabs",
- "raw_dictionary_path": "./compreno_wordforms.txt"
- },
- "save_path": "{MODELS_PATH}/error_model/error_model_ru.tsv",
- "load_path": "{MODELS_PATH}/error_model/error_model_ru.tsv"
- },
- {
- "class_name": "kenlm_elector",
- "in": ["tokens_candidates"],
- "out": ["y_predicted_tokens"],
- "load_path": "{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary"
- },
- {
- "ref": "tokenizer",
- "in": ["y_predicted_tokens"],
- "out": ["y_predicted"]
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "evaluation_targets": [
- "test"
- ],
- "class_name": "fit_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/error_model.tar.gz",
- "subdir": "{MODELS_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz",
- "subdir": "{DOWNLOADS_PATH}/language_models"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_nolm.json b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_nolm.json
deleted file mode 100644
index 6aa7de9c85..0000000000
--- a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_nolm.json
+++ /dev/null
@@ -1,77 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "typos_kartaslov_reader",
- "data_path": "{DOWNLOADS_PATH}"
- },
- "dataset_iterator": {
- "class_name": "typos_iterator",
- "test_ratio": 0.02
- },
- "chainer":{
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- {
- "class_name": "str_lower",
- "id": "lower",
- "in": ["x"],
- "out": ["x_lower"]
- },
- {
- "class_name": "nltk_moses_tokenizer",
- "id": "tokenizer",
- "in": ["x_lower"],
- "out": ["x_tokens"]
- },
- {
- "ref": "tokenizer",
- "in": ["y"],
- "out": ["y_tokens"]
- },
- {
- "fit_on": ["x_tokens", "y_tokens"],
- "in": ["x_tokens"],
- "out": ["tokens_candidates"],
- "class_name": "spelling_error_model",
- "window": 1,
- "candidates_count": 1,
- "dictionary": {
- "class_name": "russian_words_vocab",
- "data_dir": "{DOWNLOADS_PATH}/vocabs"
- },
- "save_path": "{MODELS_PATH}/error_model/error_model_ru.tsv",
- "load_path": "{MODELS_PATH}/error_model/error_model_ru.tsv"
- },
- {
- "class_name": "top1_elector",
- "in": ["tokens_candidates"],
- "out": ["y_predicted_tokens"]
- },
- {
- "ref": "tokenizer",
- "in": ["y_predicted_tokens"],
- "out": ["y_predicted"]
- }
- ],
- "out": ["y_predicted"]
- },
- "train": {
- "evaluation_targets": [
- "test"
- ],
- "class_name": "fit_trainer"
- },
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/error_model.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
\ No newline at end of file
diff --git a/deeppavlov/configs/squad/squad_bert_uncased.json b/deeppavlov/configs/squad/squad_bert_uncased.json
deleted file mode 100644
index 5542458965..0000000000
--- a/deeppavlov/configs/squad/squad_bert_uncased.json
+++ /dev/null
@@ -1,103 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "squad_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/squad/"
- },
- "dataset_iterator": {
- "class_name": "squad_iterator",
- "seed": 1337,
- "shuffle": true
- },
- "chainer": {
- "in": ["context_raw", "question_raw"],
- "in_y": ["ans_raw", "ans_raw_start"],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "id": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": "{lowercase}",
- "max_seq_length": 384,
- "in": ["question_raw", "context_raw"],
- "out": ["bert_features"]
- },
- {
- "class_name": "squad_bert_mapping",
- "do_lower_case": "{lowercase}",
- "in": ["context_raw", "bert_features"],
- "out": ["subtok2chars", "char2subtoks"]
- },
- {
- "class_name": "squad_bert_ans_preprocessor",
- "do_lower_case": "{lowercase}",
- "in": ["ans_raw", "ans_raw_start","char2subtoks"],
- "out": ["ans", "ans_start", "ans_end"]
- },
- {
- "class_name": "squad_bert_model",
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODELS_PATH}/squad_bert/uncased_model",
- "load_path": "{MODELS_PATH}/squad_bert/uncased_model",
- "keep_prob": 0.5,
- "learning_rate": 2e-05,
- "learning_rate_drop_patience": 2,
- "learning_rate_drop_div": 2.0,
- "in": ["bert_features"],
- "in_y": ["ans_start", "ans_end"],
- "out": ["ans_start_predicted", "ans_end_predicted", "logits"]
- },
- {
- "class_name": "squad_bert_ans_postprocessor",
- "in": ["ans_start_predicted", "ans_end_predicted", "context_raw", "bert_features", "subtok2chars"],
- "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"]
- }
- ],
- "out": ["ans_predicted", "ans_start_predicted", "logits"]
- },
- "train": {
- "show_examples": false,
- "test_best": false,
- "validate_best": true,
- "log_every_n_batches": 250,
- "val_every_n_batches": 500,
- "batch_size": 10,
- "pytest_max_batches": 2,
- "pytest_batch_size": 5,
- "validation_patience": 10,
- "metrics": [
- {
- "name": "squad_v1_f1",
- "inputs": ["ans", "ans_predicted"]
- },
- {
- "name": "squad_v1_em",
- "inputs": ["ans", "ans_predicted"]
- },
- {
- "name": "squad_v2_f1",
- "inputs": ["ans", "ans_predicted"]
- },
- {
- "name": "squad_v2_em",
- "inputs": ["ans", "ans_predicted"]
- }
- ],
- "tensorboard_log_dir": "{MODELS_PATH}/squad_bert/uncased_logs"
- },
- "metadata": {
- "variables": {
- "lowercase": true,
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- }
- ]
- }
-}
-
diff --git a/deeppavlov/configs/squad/squad_ru_bert.json b/deeppavlov/configs/squad/squad_ru_bert.json
index 7b105b47ef..029777626e 100644
--- a/deeppavlov/configs/squad/squad_ru_bert.json
+++ b/deeppavlov/configs/squad/squad_ru_bert.json
@@ -11,98 +11,165 @@
"shuffle": true
},
"chainer": {
- "in": ["context_raw", "question_raw"],
- "in_y": ["ans_raw", "ans_raw_start"],
+ "in": [
+ "context_raw",
+ "question_raw"
+ ],
+ "in_y": [
+ "ans_raw",
+ "ans_raw_start"
+ ],
"pipe": [
{
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": "{lowercase}",
+ "class_name": "torch_squad_transformers_preprocessor",
+ "vocab_file": "{TRANSFORMER}",
+ "do_lower_case": "{LOWERCASE}",
"max_seq_length": 384,
- "in": ["question_raw", "context_raw"],
- "out": ["bert_features"]
+ "return_tokens": true,
+ "in": [
+ "question_raw",
+ "context_raw"
+ ],
+ "out": [
+ "bert_features",
+ "subtokens"
+ ]
},
{
"class_name": "squad_bert_mapping",
- "do_lower_case": "{lowercase}",
- "in": ["context_raw", "bert_features"],
- "out": ["subtok2chars", "char2subtoks"]
+ "do_lower_case": "{LOWERCASE}",
+ "in": [
+ "context_raw",
+ "bert_features",
+ "subtokens"
+ ],
+ "out": [
+ "subtok2chars",
+ "char2subtoks"
+ ]
},
{
"class_name": "squad_bert_ans_preprocessor",
- "do_lower_case": "{lowercase}",
- "in": ["ans_raw", "ans_raw_start","char2subtoks"],
- "out": ["ans", "ans_start", "ans_end"]
+ "do_lower_case": "{LOWERCASE}",
+ "in": [
+ "ans_raw",
+ "ans_raw_start",
+ "char2subtoks"
+ ],
+ "out": [
+ "ans",
+ "ans_start",
+ "ans_end"
+ ]
},
{
- "class_name": "squad_bert_model",
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODELS_PATH}/squad_ru_bert/model_multi",
- "load_path": "{MODELS_PATH}/squad_ru_bert/model_multi",
- "keep_prob": 0.5,
- "learning_rate": 2e-05,
+ "class_name": "torch_transformers_squad",
+ "pretrained_bert": "{TRANSFORMER}",
+ "save_path": "{MODEL_PATH}/model",
+ "load_path": "{MODEL_PATH}/model",
+ "optimizer": "AdamW",
+ "optimizer_parameters": {
+ "lr": 2e-05,
+ "weight_decay": 0.01,
+ "betas": [
+ 0.9,
+ 0.999
+ ],
+ "eps": 1e-06
+ },
"learning_rate_drop_patience": 3,
"learning_rate_drop_div": 2.0,
- "in": ["bert_features"],
- "in_y": ["ans_start", "ans_end"],
- "out": ["ans_start_predicted", "ans_end_predicted", "logits", "score"]
+ "in": [
+ "bert_features"
+ ],
+ "in_y": [
+ "ans_start",
+ "ans_end"
+ ],
+ "out": [
+ "ans_start_predicted",
+ "ans_end_predicted",
+ "logits"
+ ]
},
{
"class_name": "squad_bert_ans_postprocessor",
- "in": ["ans_start_predicted", "ans_end_predicted", "context_raw", "bert_features", "subtok2chars"],
- "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"]
+ "in": [
+ "ans_start_predicted",
+ "ans_end_predicted",
+ "context_raw",
+ "bert_features",
+ "subtok2chars",
+ "subtokens"
+ ],
+ "out": [
+ "ans_predicted",
+ "ans_start_predicted",
+ "ans_end_predicted"
+ ]
}
],
- "out": ["ans_predicted", "ans_start_predicted", "logits"]
+ "out": [
+ "ans_predicted",
+ "ans_start_predicted",
+ "logits"
+ ]
},
"train": {
"show_examples": false,
- "test_best": false,
- "validate_best": true,
+ "evaluation_targets": [
+ "valid"
+ ],
"log_every_n_batches": 250,
"val_every_n_batches": 500,
"batch_size": 10,
- "pytest_max_batches": 2,
- "pytest_batch_size": 5,
"validation_patience": 10,
"metrics": [
{
- "name": "squad_v2_f1",
- "inputs": ["ans", "ans_predicted"]
+ "name": "squad_v1_f1",
+ "inputs": [
+ "ans",
+ "ans_predicted"
+ ]
},
{
- "name": "squad_v2_em",
- "inputs": ["ans", "ans_predicted"]
+ "name": "squad_v1_em",
+ "inputs": [
+ "ans",
+ "ans_predicted"
+ ]
},
{
- "name": "squad_v1_f1",
- "inputs": ["ans", "ans_predicted"]
+ "name": "squad_v2_f1",
+ "inputs": [
+ "ans",
+ "ans_predicted"
+ ]
},
{
- "name": "squad_v1_em",
- "inputs": ["ans", "ans_predicted"]
+ "name": "squad_v2_em",
+ "inputs": [
+ "ans",
+ "ans_predicted"
+ ]
}
],
- "tensorboard_log_dir": "{MODELS_PATH}/squad_ru_bert/logs"
+ "class_name": "torch_trainer"
},
"metadata": {
"variables": {
- "lowercase": false,
+ "LOWERCASE": false,
+ "TRANSFORMER": "DeepPavlov/rubert-base-cased",
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
+ "MODELS_PATH": "{ROOT_PATH}/models",
+ "MODEL_PATH": "{MODELS_PATH}/squad_ru_torch_bert/{TRANSFORMER}"
},
"download": [
{
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_bert.tar.gz",
+ "url": "http://files.deeppavlov.ai/v1/squad/squad_ru_torch_bert.tar.gz",
"subdir": "{MODELS_PATH}"
}
- ]
+ ]
}
}
-
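The hunk above moves `squad_ru_bert` from the TensorFlow `squad_bert_model` component to the Torch-based `torch_transformers_squad` with an AdamW optimizer and the HuggingFace `DeepPavlov/rubert-base-cased` backbone. A minimal usage sketch of the updated pipeline, assuming the standard `build_model` API and that the archive in the config's `download` section is reachable:

    from deeppavlov import build_model

    # download=True fetches the fine-tuned weights listed in the
    # "download" section plus the rubert-base-cased transformer.
    model = build_model('squad_ru_bert', download=True)

    # Per the chainer "in"/"out" fields: (context, question) batches in,
    # (answer, answer start position, logits) out.
    answers, starts, logits = model(
        ['DeepPavlov is a conversational AI library.'],
        ['What is DeepPavlov?'])
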
diff --git a/deeppavlov/configs/squad/squad_ru_bert_infer.json b/deeppavlov/configs/squad/squad_ru_bert_infer.json
index 83cc2cdd68..1dcb8e5d2e 100644
--- a/deeppavlov/configs/squad/squad_ru_bert_infer.json
+++ b/deeppavlov/configs/squad/squad_ru_bert_infer.json
@@ -11,68 +11,87 @@
"shuffle": true
},
"chainer": {
- "in": ["context_raw", "question_raw"],
- "in_y": ["ans_raw", "ans_raw_start"],
+ "in": [
+ "context_raw",
+ "question_raw"
+ ],
+ "in_y": [
+ "ans_raw",
+ "ans_raw_start"
+ ],
"pipe": [
{
- "class_name": "squad_bert_infer",
- "lang": "ru",
+ "class_name": "torch_transformers_squad_infer",
"batch_size": 10,
"squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_bert.json",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": false,
- "max_seq_length": 512,
+ "vocab_file": "{TRANSFORMER}",
+ "do_lower_case": "{LOWERCASE}",
+ "max_seq_length": 384,
"in": ["context_raw", "question_raw"],
- "out": ["ans_predicted", "ans_start_predicted", "logits", "score"]
+ "out": ["ans_predicted", "ans_start_predicted", "logits"]
}
],
- "out": ["ans_predicted", "ans_start_predicted", "logits"]
+ "out": [
+ "ans_predicted",
+ "ans_start_predicted",
+ "logits"
+ ]
},
"train": {
"show_examples": false,
- "test_best": false,
- "validate_best": true,
+ "evaluation_targets": [
+ "valid"
+ ],
"log_every_n_batches": 250,
"val_every_n_batches": 500,
"batch_size": 10,
- "pytest_max_batches": 2,
"validation_patience": 10,
"metrics": [
{
"name": "squad_v1_f1",
- "inputs": ["ans_raw", "ans_predicted"]
+ "inputs": [
+ "ans_raw",
+ "ans_predicted"
+ ]
},
{
"name": "squad_v1_em",
- "inputs": ["ans_raw", "ans_predicted"]
+ "inputs": [
+ "ans_raw",
+ "ans_predicted"
+ ]
},
{
"name": "squad_v2_f1",
- "inputs": ["ans_raw", "ans_predicted"]
+ "inputs": [
+ "ans_raw",
+ "ans_predicted"
+ ]
},
{
"name": "squad_v2_em",
- "inputs": ["ans_raw", "ans_predicted"]
+ "inputs": [
+ "ans_raw",
+ "ans_predicted"
+ ]
}
]
},
"metadata": {
"variables": {
+ "LOWERCASE": false,
+ "TRANSFORMER": "DeepPavlov/rubert-base-cased",
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
+ "MODEL_PATH": "{MODELS_PATH}/squad_ru_torch_bert/{TRANSFORMER}",
"CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
},
"download": [
{
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_bert.tar.gz",
+ "url": "http://files.deeppavlov.ai/v1/squad/squad_ru_torch_bert.tar.gz",
"subdir": "{MODELS_PATH}"
}
- ]
+ ]
}
}
-
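`squad_ru_bert_infer` wraps the trained pipeline above through `torch_transformers_squad_infer` for standalone batched inference, and its metrics are computed against `ans_raw` rather than the preprocessed answers. A quick evaluation sketch, assuming the SberSQuAD data referenced by the parent config downloads cleanly:

    from deeppavlov import evaluate_model

    # Runs the "valid" evaluation target declared in the train section;
    # reports squad_v1/v2 F1 and EM.
    metrics = evaluate_model('squad_ru_bert_infer', download=True)
    print(metrics)
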
diff --git a/deeppavlov/configs/squad/squad_ru_torch_bert.json b/deeppavlov/configs/squad/squad_ru_torch_bert.json
deleted file mode 100644
index 029777626e..0000000000
--- a/deeppavlov/configs/squad/squad_ru_torch_bert.json
+++ /dev/null
@@ -1,175 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "squad_dataset_reader",
- "dataset": "SberSQuADClean",
- "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
- "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
- },
- "dataset_iterator": {
- "class_name": "squad_iterator",
- "seed": 1337,
- "shuffle": true
- },
- "chainer": {
- "in": [
- "context_raw",
- "question_raw"
- ],
- "in_y": [
- "ans_raw",
- "ans_raw_start"
- ],
- "pipe": [
- {
- "class_name": "torch_squad_transformers_preprocessor",
- "vocab_file": "{TRANSFORMER}",
- "do_lower_case": "{LOWERCASE}",
- "max_seq_length": 384,
- "return_tokens": true,
- "in": [
- "question_raw",
- "context_raw"
- ],
- "out": [
- "bert_features",
- "subtokens"
- ]
- },
- {
- "class_name": "squad_bert_mapping",
- "do_lower_case": "{LOWERCASE}",
- "in": [
- "context_raw",
- "bert_features",
- "subtokens"
- ],
- "out": [
- "subtok2chars",
- "char2subtoks"
- ]
- },
- {
- "class_name": "squad_bert_ans_preprocessor",
- "do_lower_case": "{LOWERCASE}",
- "in": [
- "ans_raw",
- "ans_raw_start",
- "char2subtoks"
- ],
- "out": [
- "ans",
- "ans_start",
- "ans_end"
- ]
- },
- {
- "class_name": "torch_transformers_squad",
- "pretrained_bert": "{TRANSFORMER}",
- "save_path": "{MODEL_PATH}/model",
- "load_path": "{MODEL_PATH}/model",
- "optimizer": "AdamW",
- "optimizer_parameters": {
- "lr": 2e-05,
- "weight_decay": 0.01,
- "betas": [
- 0.9,
- 0.999
- ],
- "eps": 1e-06
- },
- "learning_rate_drop_patience": 3,
- "learning_rate_drop_div": 2.0,
- "in": [
- "bert_features"
- ],
- "in_y": [
- "ans_start",
- "ans_end"
- ],
- "out": [
- "ans_start_predicted",
- "ans_end_predicted",
- "logits"
- ]
- },
- {
- "class_name": "squad_bert_ans_postprocessor",
- "in": [
- "ans_start_predicted",
- "ans_end_predicted",
- "context_raw",
- "bert_features",
- "subtok2chars",
- "subtokens"
- ],
- "out": [
- "ans_predicted",
- "ans_start_predicted",
- "ans_end_predicted"
- ]
- }
- ],
- "out": [
- "ans_predicted",
- "ans_start_predicted",
- "logits"
- ]
- },
- "train": {
- "show_examples": false,
- "evaluation_targets": [
- "valid"
- ],
- "log_every_n_batches": 250,
- "val_every_n_batches": 500,
- "batch_size": 10,
- "validation_patience": 10,
- "metrics": [
- {
- "name": "squad_v1_f1",
- "inputs": [
- "ans",
- "ans_predicted"
- ]
- },
- {
- "name": "squad_v1_em",
- "inputs": [
- "ans",
- "ans_predicted"
- ]
- },
- {
- "name": "squad_v2_f1",
- "inputs": [
- "ans",
- "ans_predicted"
- ]
- },
- {
- "name": "squad_v2_em",
- "inputs": [
- "ans",
- "ans_predicted"
- ]
- }
- ],
- "class_name": "torch_trainer"
- },
- "metadata": {
- "variables": {
- "LOWERCASE": false,
- "TRANSFORMER": "DeepPavlov/rubert-base-cased",
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/squad_ru_torch_bert/{TRANSFORMER}"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/v1/squad/squad_ru_torch_bert.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
diff --git a/deeppavlov/configs/squad/squad_zh_bert_mult.json b/deeppavlov/configs/squad/squad_zh_bert_mult.json
deleted file mode 100644
index 50cac7569f..0000000000
--- a/deeppavlov/configs/squad/squad_zh_bert_mult.json
+++ /dev/null
@@ -1,118 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "squad_dataset_reader",
- "dataset": "SQuAD",
- "url": "http://files.deeppavlov.ai/datasets/DRCD.tar.gz",
- "data_path": "{DOWNLOADS_PATH}/DRCD_train/"
- },
- "dataset_iterator": {
- "class_name": "squad_iterator",
- "seed": 1337,
- "shuffle": true
- },
- "chainer": {
- "in": ["context_raw", "question_raw"],
- "in_y": ["ans_raw", "ans_raw_start"],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": "{lowercase}",
- "max_seq_length": 384,
- "in": ["question_raw", "context_raw"],
- "out": ["bert_features"]
- },
- {
- "class_name": "squad_bert_mapping",
- "do_lower_case": "{lowercase}",
- "in": ["context_raw", "bert_features"],
- "out": ["subtok2chars", "char2subtoks"]
- },
- {
- "class_name": "squad_bert_ans_preprocessor",
- "do_lower_case": "{lowercase}",
- "in": ["ans_raw", "ans_raw_start","char2subtoks"],
- "out": ["ans", "ans_start", "ans_end"]
- },
- {
- "class_name": "squad_bert_model",
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODELS_PATH}/squad_zh_bert/model_multi",
- "load_path": "{MODELS_PATH}/squad_zh_bert/model_multi",
- "keep_prob": 0.5,
- "learning_rate": 2e-05,
- "learning_rate_drop_patience": 3,
- "learning_rate_drop_div": 2.0,
- "in": ["bert_features"],
- "in_y": ["ans_start", "ans_end"],
- "out": ["ans_start_predicted", "ans_end_predicted", "logits", "score"]
- },
- {
- "class_name": "squad_bert_ans_postprocessor",
- "in": ["ans_start_predicted", "ans_end_predicted", "context_raw", "bert_features", "subtok2chars"],
- "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"]
- },
- {
- "in": "ans",
- "out": "ans_tok",
- "class_name": "jieba_tokenizer"
- },
- {
- "in": "ans_predicted",
- "out": "ans_predicted_tok",
- "class_name": "jieba_tokenizer"
- }
- ],
- "out": ["ans_predicted", "ans_start_predicted", "logits"]
- },
- "train": {
- "show_examples": false,
- "test_best": false,
- "validate_best": true,
- "log_every_n_batches": 250,
- "val_every_n_batches": 500,
- "batch_size": 8,
- "pytest_max_batches": 2,
- "pytest_batch_size": 5,
- "validation_patience": 10,
- "metrics": [
- {
- "name": "squad_v2_f1",
- "inputs": ["ans_tok", "ans_predicted_tok"]
- },
- {
- "name": "squad_v2_em",
- "inputs": ["ans_tok", "ans_predicted_tok"]
- },
- {
- "name": "squad_v1_f1",
- "inputs": ["ans_tok", "ans_predicted_tok"]
- },
- {
- "name": "squad_v1_em",
- "inputs": ["ans_tok", "ans_predicted_tok"]
- }
- ],
- "tensorboard_log_dir": "{MODELS_PATH}/squad_zh_bert/logs"
- },
- "metadata": {
- "variables": {
- "lowercase": false,
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_zh.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
-
diff --git a/deeppavlov/configs/squad/squad_zh_bert_zh.json b/deeppavlov/configs/squad/squad_zh_bert_zh.json
deleted file mode 100644
index 5864236cf9..0000000000
--- a/deeppavlov/configs/squad/squad_zh_bert_zh.json
+++ /dev/null
@@ -1,118 +0,0 @@
-{
- "dataset_reader": {
- "class_name": "squad_dataset_reader",
- "dataset": "SQuAD",
- "url": "http://files.deeppavlov.ai/datasets/DRCD.tar.gz",
- "data_path": "{DOWNLOADS_PATH}/DRCD_train/"
- },
- "dataset_iterator": {
- "class_name": "squad_iterator",
- "seed": 1337,
- "shuffle": true
- },
- "chainer": {
- "in": ["context_raw", "question_raw"],
- "in_y": ["ans_raw", "ans_raw_start"],
- "pipe": [
- {
- "class_name": "bert_preprocessor",
- "vocab_file": "{DOWNLOADS_PATH}/bert_models/chinese_L-12_H-768_A-12/vocab.txt",
- "do_lower_case": "{lowercase}",
- "max_seq_length": 384,
- "in": ["question_raw", "context_raw"],
- "out": ["bert_features"]
- },
- {
- "class_name": "squad_bert_mapping",
- "do_lower_case": "{lowercase}",
- "in": ["context_raw", "bert_features"],
- "out": ["subtok2chars", "char2subtoks"]
- },
- {
- "class_name": "squad_bert_ans_preprocessor",
- "do_lower_case": "{lowercase}",
- "in": ["ans_raw", "ans_raw_start","char2subtoks"],
- "out": ["ans", "ans_start", "ans_end"]
- },
- {
- "class_name": "squad_bert_model",
- "bert_config_file": "{DOWNLOADS_PATH}/bert_models/chinese_L-12_H-768_A-12/bert_config.json",
- "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/chinese_L-12_H-768_A-12/bert_model.ckpt",
- "save_path": "{MODELS_PATH}/squad_zh_bert/model_zh",
- "load_path": "{MODELS_PATH}/squad_zh_bert/model_zh",
- "keep_prob": 0.5,
- "learning_rate": 2e-05,
- "learning_rate_drop_patience": 3,
- "learning_rate_drop_div": 2.0,
- "in": ["bert_features"],
- "in_y": ["ans_start", "ans_end"],
- "out": ["ans_start_predicted", "ans_end_predicted", "logits", "score"]
- },
- {
- "class_name": "squad_bert_ans_postprocessor",
- "in": ["ans_start_predicted", "ans_end_predicted", "context_raw", "bert_features", "subtok2chars"],
- "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"]
- },
- {
- "in": "ans",
- "out": "ans_tok",
- "class_name": "jieba_tokenizer"
- },
- {
- "in": "ans_predicted",
- "out": "ans_predicted_tok",
- "class_name": "jieba_tokenizer"
- }
- ],
- "out": ["ans_predicted", "ans_start_predicted", "logits"]
- },
- "train": {
- "show_examples": false,
- "test_best": false,
- "validate_best": true,
- "log_every_n_batches": 250,
- "val_every_n_batches": 500,
- "batch_size": 8,
- "pytest_max_batches": 2,
- "pytest_batch_size": 5,
- "validation_patience": 10,
- "metrics": [
- {
- "name": "squad_v2_f1",
- "inputs": ["ans_tok", "ans_predicted_tok"]
- },
- {
- "name": "squad_v2_em",
- "inputs": ["ans_tok", "ans_predicted_tok"]
- },
- {
- "name": "squad_v1_f1",
- "inputs": ["ans_tok", "ans_predicted_tok"]
- },
- {
- "name": "squad_v1_em",
- "inputs": ["ans_tok", "ans_predicted_tok"]
- }
- ],
- "tensorboard_log_dir": "{MODELS_PATH}/squad_zh_bert/logs"
- },
- "metadata": {
- "variables": {
- "lowercase": false,
- "ROOT_PATH": "~/.deeppavlov",
- "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
- "MODELS_PATH": "{ROOT_PATH}/models"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/chinese_L-12_H-768_A-12.zip",
- "subdir": "{DOWNLOADS_PATH}/bert_models"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_model_zh_zhbert.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-}
-
diff --git a/deeppavlov/core/common/base.py b/deeppavlov/core/common/base.py
index 91067cae29..e18d548d05 100644
--- a/deeppavlov/core/common/base.py
+++ b/deeppavlov/core/common/base.py
@@ -55,21 +55,6 @@ def __init__(self, x: Optional[Union[str, list]] = None,
out: Names of pipeline inference outputs.
y: Names of additional inputs (targets) for pipeline training and evaluation.
pipe: List of pipeline elements.
-
-
- Example:
- .. code:: python
-
- >>> from deeppavlov.models.nemo.asr import NeMoASR
- >>> from deeppavlov import Element, Model
- >>> asr = NeMoASR(nemo_params_path="~/.deeppavlov/models/nemo/quartznet15x5/quartznet15x5.yaml",
- load_path="~/.deeppavlov/models/nemo/quartznet15x5")
- >>> upper = lambda batch: list(map(str.upper, batch))
- >>> model = Model(x=["speech"],
- out=["upper_text"],
- pipe=[Element(asr, "speech", "text"), Element(upper, "text", "upper_text")])
- >>> model(["8088-284756-0037.wav"])
- ['I WALKED ALONG BRISKLY FOR PERHAPS FIVE MINUTES']
"""
super().__init__(in_x=x, out_params=out, in_y=y)
if pipe is not None:
diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json
index 2ed9d8e217..09c205d70b 100644
--- a/deeppavlov/core/common/registry.json
+++ b/deeppavlov/core/common/registry.json
@@ -2,26 +2,17 @@
"UD_pymorphy_lemmatizer": "deeppavlov.models.morpho_tagger.lemmatizer:UDPymorphyLemmatizer",
"api_requester": "deeppavlov.models.api_requester.api_requester:ApiRequester",
"api_router": "deeppavlov.models.api_requester.api_router:ApiRouter",
- "base64_decode_bytesIO": "deeppavlov.models.nemo.common:ascii_to_bytes_io",
"basic_classification_iterator": "deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator",
"basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader",
"bert_classifier": "deeppavlov.models.bert.bert_classifier:BertClassifierModel",
"bert_ner_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertNerPreprocessor",
"bert_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertPreprocessor",
- "bert_ranker": "deeppavlov.models.bert.bert_ranker:BertRankerModel",
- "bert_ranker_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertRankerPreprocessor",
- "bert_sep_ranker": "deeppavlov.models.bert.bert_ranker:BertSepRankerModel",
- "bert_sep_ranker_predictor": "deeppavlov.models.bert.bert_ranker:BertSepRankerPredictor",
- "bert_sep_ranker_predictor_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertSepRankerPredictorPreprocessor",
- "bert_sep_ranker_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertSepRankerPreprocessor",
"bert_sequence_network": "deeppavlov.models.bert.bert_sequence_tagger:BertSequenceNetwork",
"bert_sequence_tagger": "deeppavlov.models.bert.bert_sequence_tagger:BertSequenceTagger",
"bert_syntax_parser": "deeppavlov.models.syntax_parser.network:BertSyntaxParser",
- "bilstm_gru_nn": "deeppavlov.models.ranking.bilstm_gru_siamese_network:BiLSTMGRUSiameseNetwork",
"bilstm_nn": "deeppavlov.models.ranking.bilstm_siamese_network:BiLSTMSiameseNetwork",
"boolqa_reader": "deeppavlov.dataset_readers.boolqa_reader:BoolqaReader",
"bow": "deeppavlov.models.embedders.bow_embedder:BoWEmbedder",
- "bytesIO_encode_base64": "deeppavlov.models.nemo.common:bytes_io_to_ascii",
"capitalization_featurizer": "deeppavlov.models.preprocessors.capitalization:CapitalizationPreprocessor",
"char_splitter": "deeppavlov.models.preprocessors.char_splitter:CharSplitter",
"char_splitting_lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:CharSplittingLowercasePreprocessor",
@@ -29,7 +20,6 @@
"conll2003_reader": "deeppavlov.dataset_readers.conll2003_reader:Conll2003DatasetReader",
"convert_ids2tags": "deeppavlov.models.preprocessors.ner_preprocessor:ConvertIds2Tags",
"cos_sim_classifier": "deeppavlov.models.classifiers.cos_sim_classifier:CosineSimilarityClassifier",
- "dam_nn_use_transformer": "deeppavlov.models.ranking.deep_attention_matching_network_use_transformer:DAMNetworkUSETransformer",
"data_fitting_iterator": "deeppavlov.core.data.data_fitting_iterator:DataFittingIterator",
"data_learning_iterator": "deeppavlov.core.data.data_learning_iterator:DataLearningIterator",
"dependency_output_prettifier": "deeppavlov.models.morpho_tagger.common:DependencyOutputPrettifier",
@@ -45,7 +35,6 @@
"dstc2_ner_iterator": "deeppavlov.dataset_iterators.dstc2_ner_iterator:Dstc2NerDatasetIterator",
"dstc2_reader": "deeppavlov.dataset_readers.dstc2_reader:DSTC2DatasetReader",
"dstc_slotfilling": "deeppavlov.models.slotfill.slotfill:DstcSlotFillingNetwork",
- "elmo_embedder": "deeppavlov.models.embedders.elmo_embedder:ELMoEmbedder",
"emb_mat_assembler": "deeppavlov.models.preprocessors.assemble_embeddings_matrix:EmbeddingsMatrixAssembler",
"entity_detection_parser": "deeppavlov.models.kbqa.entity_detection_parser:EntityDetectionParser",
"entity_linker": "deeppavlov.models.kbqa.entity_linking:EntityLinker",
@@ -65,7 +54,6 @@
"hybrid_ner_model": "deeppavlov.models.ner.NER_model:HybridNerModel",
"imdb_reader": "deeppavlov.dataset_readers.imdb_reader:ImdbReader",
"input_splitter": "deeppavlov.models.multitask_bert.multitask_bert:InputSplitter",
- "jieba_tokenizer": "deeppavlov.models.tokenizers.jieba_tokenizer:JiebaTokenizer",
"joint_tagger_parser": "deeppavlov.models.syntax_parser.joint:JointTaggerParser",
"kbqa_entity_linker": "deeppavlov.models.kbqa.kbqa_entity_linking:KBEntityLinker",
"kbqa_reader": "deeppavlov.dataset_readers.kbqa_reader:KBQAReader",
@@ -79,10 +67,8 @@
"logit_ranker": "deeppavlov.models.doc_retrieval.logit_ranker:LogitRanker",
"mask": "deeppavlov.models.preprocessors.mask:Mask",
"md_yaml_dialogs_reader": "deeppavlov.dataset_readers.md_yaml_dialogs_reader:MD_YAML_DialogsDatasetReader",
- "morpho_tagger": "deeppavlov.models.morpho_tagger.morpho_tagger:MorphoTagger",
"morphotagger_dataset": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator",
"morphotagger_dataset_reader": "deeppavlov.dataset_readers.morphotagging_dataset_reader:MorphotaggerDatasetReader",
- "mpm_nn": "deeppavlov.models.ranking.mpm_siamese_network:MPMSiameseNetwork",
"mt_bert": "deeppavlov.models.multitask_bert.multitask_bert:MultiTaskBert",
"mt_bert_classification_task": "deeppavlov.models.multitask_bert.multitask_bert:MTBertClassificationTask",
"mt_bert_reuser": "deeppavlov.models.multitask_bert.multitask_bert:MTBertReUser",
@@ -92,14 +78,9 @@
"multi_squad_retr_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadRetrIterator",
"multitask_iterator": "deeppavlov.dataset_iterators.multitask_iterator:MultiTaskIterator",
"multitask_reader": "deeppavlov.dataset_readers.multitask_reader:MultiTaskReader",
- "nemo_asr": "deeppavlov.models.nemo.asr:NeMoASR",
- "nemo_tts": "deeppavlov.models.nemo.tts:NeMoTTS",
"ner": "deeppavlov.models.ner.network:NerNetwork",
- "ner_bio_converter": "deeppavlov.models.ner.bio:BIOMarkupRestorer",
"ner_chunker": "deeppavlov.models.kbqa.entity_linking:NerChunker",
- "ner_few_shot_iterator": "deeppavlov.dataset_iterators.ner_few_shot_iterator:NERFewShotIterator",
"ner_preprocessor": "deeppavlov.models.preprocessors.ner_preprocessor:NerPreprocessor",
- "ner_svm": "deeppavlov.models.ner.svm:SVMTagger",
"ner_vocab": "deeppavlov.models.preprocessors.ner_preprocessor:NerVocab",
"nltk_moses_tokenizer": "deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKMosesTokenizer",
"nltk_tokenizer": "deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer",
@@ -124,7 +105,6 @@
"re_classifier": "deeppavlov.models.relation_extraction.relation_extraction_bert:REBertModel",
"response_base_loader": "deeppavlov.models.preprocessors.response_base_loader:ResponseBaseLoader",
"ru_adj_to_noun": "deeppavlov.models.kbqa.tree_to_sparql:RuAdjToNoun",
- "ru_obscenity_classifier": "deeppavlov.models.classifiers.ru_obscenity_classifier:RuObscenityClassifier",
"ru_sent_tokenizer": "deeppavlov.models.tokenizers.ru_sent_tokenizer:RuSentTokenizer",
"ru_tokenizer": "deeppavlov.models.tokenizers.ru_tokenizer:RussianTokenizer",
"rured_reader": "deeppavlov.dataset_readers.rured_reader:RuREDDatasetReader",
@@ -132,18 +112,12 @@
"sanitizer": "deeppavlov.models.preprocessors.sanitizer:Sanitizer",
"sentseg_restore_sent": "deeppavlov.models.preprocessors.sentseg_preprocessor:SentSegRestoreSent",
"siamese_iterator": "deeppavlov.dataset_iterators.siamese_iterator:SiameseIterator",
- "siamese_predictor": "deeppavlov.models.ranking.siamese_predictor:SiamesePredictor",
- "siamese_preprocessor": "deeppavlov.models.preprocessors.siamese_preprocessor:SiamesePreprocessor",
"siamese_reader": "deeppavlov.dataset_readers.siamese_reader:SiameseReader",
"simple_dstc2_reader": "deeppavlov.dataset_readers.dstc2_reader:SimpleDSTC2DatasetReader",
"simple_vocab": "deeppavlov.core.data.simple_vocab:SimpleVocabulary",
"sklearn_component": "deeppavlov.models.sklearn.sklearn_component:SklearnComponent",
"slotfill_raw": "deeppavlov.models.slotfill.slotfill_raw:SlotFillingComponent",
"slotfill_raw_rasa": "deeppavlov.models.slotfill.slotfill_raw:RASA_SlotFillingComponent",
- "smn_nn": "deeppavlov.models.ranking.sequential_matching_network:SMNNetwork",
- "snips_intents_iterator": "deeppavlov.dataset_iterators.snips_intents_iterator:SnipsIntentIterator",
- "snips_ner_iterator": "deeppavlov.dataset_iterators.snips_ner_iterator:SnipsNerIterator",
- "snips_reader": "deeppavlov.dataset_readers.snips_reader:SnipsReader",
"spelling_error_model": "deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel",
"spelling_levenshtein": "deeppavlov.models.spelling_correction.levenshtein.searcher_component:LevenshteinSearcherComponent",
"split_tokenizer": "deeppavlov.models.tokenizers.split_tokenizer:SplitTokenizer",
diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json
index 7d4f1bc84f..1636093a59 100644
--- a/deeppavlov/core/common/requirements_registry.json
+++ b/deeppavlov/core/common/requirements_registry.json
@@ -1,7 +1,6 @@
{
"UD_pymorphy_lemmatizer": [
- "{DEEPPAVLOV_PATH}/requirements/morpho_tagger.txt",
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
+ "{DEEPPAVLOV_PATH}/requirements/morpho_tagger.txt"
],
"bert_classifier": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt",
@@ -15,30 +14,6 @@
"{DEEPPAVLOV_PATH}/requirements/bert_dp.txt",
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
- "bert_ranker": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt",
- "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt"
- ],
- "bert_ranker_preprocessor": [
- "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt",
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
- ],
- "bert_sep_ranker": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt",
- "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt"
- ],
- "bert_sep_ranker_predictor": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt",
- "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt"
- ],
- "bert_sep_ranker_predictor_preprocessor": [
- "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt",
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
- ],
- "bert_sep_ranker_preprocessor": [
- "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt",
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
- ],
"bert_sequence_network": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt",
"{DEEPPAVLOV_PATH}/requirements/bert_dp.txt"
@@ -54,24 +29,15 @@
"chu_liu_edmonds_transformer": [
"{DEEPPAVLOV_PATH}/requirements/syntax_parser.txt"
],
- "dam_nn_use_transformer": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt",
- "{DEEPPAVLOV_PATH}/requirements/tf-hub.txt"
- ],
"dependency_output_prettifier": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
"dictionary_vectorizer": [
- "{DEEPPAVLOV_PATH}/requirements/morpho_tagger.txt",
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
+ "{DEEPPAVLOV_PATH}/requirements/morpho_tagger.txt"
],
"dstc_slotfilling": [
"{DEEPPAVLOV_PATH}/requirements/rapidfuzz.txt"
],
- "elmo_embedder": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt",
- "{DEEPPAVLOV_PATH}/requirements/tf-hub.txt"
- ],
"fasttext": [
"{DEEPPAVLOV_PATH}/requirements/fasttext.txt"
],
@@ -90,9 +56,6 @@
"{DEEPPAVLOV_PATH}/requirements/tf.txt",
"{DEEPPAVLOV_PATH}/requirements/bert_dp.txt"
],
- "jieba_tokenizer": [
- "{DEEPPAVLOV_PATH}/requirements/jieba.txt"
- ],
"joint_tagger_parser": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
@@ -105,12 +68,6 @@
"lemmatized_output_prettifier": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
- "morpho_tagger": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
- ],
- "mpm_nn": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
- ],
"mt_bert": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt",
"{DEEPPAVLOV_PATH}/requirements/bert_dp.txt"
@@ -131,8 +88,7 @@
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
"pymorphy_vectorizer": [
- "{DEEPPAVLOV_PATH}/requirements/morpho_tagger.txt",
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
+ "{DEEPPAVLOV_PATH}/requirements/morpho_tagger.txt"
],
"rel_ranker": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
@@ -140,12 +96,6 @@
"rel_ranking_infer": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
- "siamese_predictor": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
- ],
- "smn_nn": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
- ],
"squad_bert_infer": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt",
"{DEEPPAVLOV_PATH}/requirements/bert_dp.txt"
@@ -167,9 +117,6 @@
"two_sentences_emb": [
"{DEEPPAVLOV_PATH}/requirements/tf.txt"
],
- "bilstm_gru_nn": [
- "{DEEPPAVLOV_PATH}/requirements/tf.txt"
- ],
"wiki_parser": [
"{DEEPPAVLOV_PATH}/requirements/hdt.txt"
],
@@ -182,18 +129,12 @@
"static_dictionary": [
"{DEEPPAVLOV_PATH}/requirements/lxml.txt"
],
- "base64_decode_bytesIO": [
- "{DEEPPAVLOV_PATH}/requirements/nemo.txt"
- ],
"wikitionary_100K_vocab": [
"{DEEPPAVLOV_PATH}/requirements/lxml.txt"
],
"huggingface_dataset_iterator": [
"{DEEPPAVLOV_PATH}/requirements/datasets.txt"
],
- "bytesIO_encode_base64": [
- "{DEEPPAVLOV_PATH}/requirements/nemo.txt"
- ],
"typos_custom_reader": [
"{DEEPPAVLOV_PATH}/requirements/lxml.txt"
],
@@ -206,10 +147,6 @@
"tree_to_sparql": [
"{DEEPPAVLOV_PATH}/requirements/udapi.txt"
],
- "torch_squad_bert_model": [
- "{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
- "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
- ],
"torch_transformers_preprocessor": [
"{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
"{DEEPPAVLOV_PATH}/requirements/transformers.txt"
@@ -228,7 +165,7 @@
],
"torch_bert_ranker": [
"{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
- "{DEEPPAVLOV_PATH}/requirements/transformers28.txt"
+ "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
],
"torch_transformers_classifier": [
"{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
@@ -251,7 +188,7 @@
],
"torch_bert_ranker_preprocessor": [
"{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
- "{DEEPPAVLOV_PATH}/requirements/transformers28.txt"
+ "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
],
"transformers_bert_preprocessor": [
"{DEEPPAVLOV_PATH}/requirements/transformers.txt"
@@ -262,22 +199,6 @@
"typos_kartaslov_reader": [
"{DEEPPAVLOV_PATH}/requirements/lxml.txt"
],
- "torch_squad_bert_infer": [
- "{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
- "{DEEPPAVLOV_PATH}/requirements/transformers.txt"
- ],
- "nemo_asr": [
- "{DEEPPAVLOV_PATH}/requirements/pytorch14.txt",
- "{DEEPPAVLOV_PATH}/requirements/nemo.txt",
- "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt"
- ],
- "nemo_tts": [
- "{DEEPPAVLOV_PATH}/requirements/pytorch14.txt",
- "{DEEPPAVLOV_PATH}/requirements/nemo.txt",
- "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt",
- "{DEEPPAVLOV_PATH}/requirements/transformers28.txt",
- "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt"
- ],
"spelling_error_model": [
"{DEEPPAVLOV_PATH}/requirements/lxml.txt"
],
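`requirements_registry.json` tells `python -m deeppavlov install <config>` which requirement files each component needs; the edits above drop the TensorFlow pin from the pymorphy-based vectorizers and move the torch BERT rankers from the pinned `transformers28.txt` to the shared `transformers.txt`. The mapping is plain JSON and can be inspected directly (a sketch; the file path is taken from the diff header above):

    import json
    from pathlib import Path

    import deeppavlov

    reg_path = (Path(deeppavlov.__path__[0]) / 'core' / 'common' /
                'requirements_registry.json')
    registry = json.loads(reg_path.read_text())

    # After this change the ranker entry points at the shared pin:
    print(registry['torch_bert_ranker'])
    # ['{DEEPPAVLOV_PATH}/requirements/pytorch16.txt',
    #  '{DEEPPAVLOV_PATH}/requirements/transformers.txt']
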
diff --git a/deeppavlov/dataset_iterators/ner_few_shot_iterator.py b/deeppavlov/dataset_iterators/ner_few_shot_iterator.py
deleted file mode 100644
index 52e1fa38c1..0000000000
--- a/deeppavlov/dataset_iterators/ner_few_shot_iterator.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-from typing import List, Dict, Tuple, Any, Iterator, Optional
-
-import numpy as np
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
-
-
-@register('ner_few_shot_iterator')
-class NERFewShotIterator(DataLearningIterator):
- """Dataset iterator for simulating few-shot Named Entity Recognition setting.
-
- Args:
- data: list of (x, y) pairs for every data type in ``'train'``, ``'valid'`` and ``'test'``
- seed: random seed for data shuffling
- shuffle: whether to shuffle data during batching
- target_tag: the tag of interest. For this tag the few-shot setting will be simulated
- filter_bi: whether to filter BIO markup or not
- n_train_samples: number of training samples in the few shot setting. The validation and the test sets will be
- the same
- remove_not_targets: whether to replace all non target tags with `O` tag or not.
- """
-
- def __init__(self,
- data: Dict[str, List[Tuple[Any, Any]]],
- seed: int = None,
- shuffle: bool = True,
- target_tag: str = None,
- filter_bi: bool = True,
- n_train_samples: int = 20,
- remove_not_targets: bool = True,
- *args, **kwargs) -> None:
- super(NERFewShotIterator, self).__init__(data=data, seed=seed, shuffle=shuffle)
- self.target_tag = target_tag
- self.filter_bi = filter_bi
- self.n_train_samples = n_train_samples
- self.remove_not_targets = remove_not_targets
- if self.target_tag is None:
- raise RuntimeError('You must provide a target tag to NERFewShotIterator!')
-
- self.n_samples = len(self.train)
-
- if self.remove_not_targets:
- self._remove_not_target_tags()
-
- if self.filter_bi:
- for key in self.data:
- for n, (x, y) in enumerate(self.data[key]):
- self.data[key][n] = [x, [re.sub('(B-|I-)', '', tag) for tag in y]]
-
- self.tag_map = np.zeros(self.n_samples, dtype=bool)
- for n, (toks, tags) in enumerate(self.data['train']):
- if self.filter_bi:
- self.tag_map[n] = any(self.target_tag == tag for tag in tags if len(tag) > 2)
- else:
- self.tag_map[n] = any(self.target_tag == tag[2:] for tag in tags if len(tag) > 2)
-
- self.marked_nums = None
- self.unmarked_nums = None
- self._sample_marked()
-
- def _sample_marked(self):
- np.zeros(len(self.data['train']), dtype=bool)
- n_marked = 0
- self.marked_mask = np.zeros(self.n_samples, dtype=bool)
- while n_marked < self.n_train_samples:
- is_picked = True
- while is_picked:
- n = np.random.randint(self.n_samples)
- if not self.marked_mask[n]:
- is_picked = False
- self.marked_mask[n] = True
- if self.tag_map[n]:
- n_marked += 1
-
- self.marked_nums = np.arange(self.n_samples)[self.marked_mask]
- self.unmarked_nums = np.arange(self.n_samples)[~self.marked_mask]
-
- def _remove_not_target_tags(self):
- if self.remove_not_targets:
- for key in self.data:
- for n, (x, y) in enumerate(self.data[key]):
- tags = []
- for tag in y:
- if tag.endswith('-' + self.target_tag):
- tags.append(tag)
- else:
- tags.append('O')
- self.data[key][n] = [x, tags]
-
- def get_instances(self, data_type: str = 'train') -> Tuple[List[List[str]], List[List[str]]]:
- """Get all data for a selected data type
-
- Args:
- data_type (str): can be either ``'train'``, ``'test'``, ``'valid'`` or ``'all'``
-
- Returns:
- a tuple of all inputs for a data type and all expected outputs for a data type
- """
-
- if data_type == 'train':
- samples = [self.data[data_type][i] for i in self.marked_nums]
- else:
- samples = self.data[data_type][:]
-
- x, y = list(zip(*samples))
-
- return x, y
-
- def gen_batches(self, batch_size: int,
- data_type: str = 'train',
- shuffle: Optional[bool] = None) -> Iterator[Tuple[List[List[str]], List[List[str]]]]:
- x, y = self.get_instances(data_type)
- data_len = len(x)
-
- if data_len == 0:
- return
-
- order = list(range(data_len))
- if shuffle is None and self.shuffle:
- self.random.shuffle(order)
- elif shuffle:
- self.random.shuffle(order)
-
- if batch_size < 0:
- batch_size = data_len
-
- for i in range((data_len - 1) // batch_size + 1):
- yield tuple(zip(*[(x[o], y[o]) for o in order[i * batch_size:(i + 1) * batch_size]]))
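
For reference, a minimal standalone sketch (plain numpy, hypothetical toy data, not the DeepPavlov API) of the sampling policy the removed `_sample_marked` implemented: training examples are drawn at random without replacement until `n_train_samples` of the drawn ones contain the target tag, and only the drawn ("marked") examples are served as the training set.

```python
import numpy as np

def sample_marked(tag_map: np.ndarray, n_train_samples: int, seed: int = 42):
    """Draw random sample indices until n_train_samples of the drawn ones
    contain the target tag (tag_map[i] is True)."""
    rng = np.random.default_rng(seed)
    marked_mask = np.zeros(len(tag_map), dtype=bool)
    n_marked = 0
    # extra guard (not in the original) against exhausting the corpus
    while n_marked < n_train_samples and not marked_mask.all():
        i = rng.integers(len(tag_map))
        if not marked_mask[i]:
            marked_mask[i] = True
            if tag_map[i]:
                n_marked += 1
    return np.flatnonzero(marked_mask), np.flatnonzero(~marked_mask)

# toy corpus: 3 of 10 sentences contain the target tag
tag_map = np.array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=bool)
marked_nums, unmarked_nums = sample_marked(tag_map, n_train_samples=2)
```
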
diff --git a/deeppavlov/dataset_iterators/snips_intents_iterator.py b/deeppavlov/dataset_iterators/snips_intents_iterator.py
deleted file mode 100644
index 2a881634ac..0000000000
--- a/deeppavlov/dataset_iterators/snips_intents_iterator.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright 2019 Alexey Romanov
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from overrides import overrides
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
-
-
-@register('snips_intents_iterator')
-class SnipsIntentIterator(DataLearningIterator):
- @overrides
- def preprocess(self, data, *args, **kwargs):
- result = []
- for query in data:
- text = ''.join(part['text'] for part in query['data'])
- intent = query['intent']
- result.append((text, intent))
- return result
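
To make the removed transformation concrete, here is what `preprocess` did to a single query in the Snips JSON layout (hypothetical sample): the `data` parts are concatenated into plain text and paired with the intent label.

```python
query = {
    "intent": "GetWeather",
    "data": [{"text": "what's the weather in "},
             {"text": "Paris", "entity": "city"}],
}
text = ''.join(part['text'] for part in query['data'])
print((text, query['intent']))
# ("what's the weather in Paris", 'GetWeather')
```
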
diff --git a/deeppavlov/dataset_iterators/snips_ner_iterator.py b/deeppavlov/dataset_iterators/snips_ner_iterator.py
deleted file mode 100644
index 2186ebbaa9..0000000000
--- a/deeppavlov/dataset_iterators/snips_ner_iterator.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2019 Alexey Romanov
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import nltk
-from overrides import overrides
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
-
-
-@register('snips_ner_iterator')
-class SnipsNerIterator(DataLearningIterator):
- @overrides
- def preprocess(self, data, *args, **kwargs):
- result = []
- for query in data:
- query = query['data']
- words = []
- slots = []
- for part in query:
- part_words = nltk.tokenize.wordpunct_tokenize(part['text'])
- entity = part.get('entity', None)
- if entity:
- slots.append('B-' + entity)
- slots += ['I-' + entity] * (len(part_words) - 1)
- else:
- slots += ['O'] * len(part_words)
- words += part_words
-
- result.append((words, slots))
- return result
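
With the same hypothetical query layout, the removed NER iterator turned Snips slot annotations into BIO-tagged token sequences; the core of that conversion, runnable on its own:

```python
import nltk  # wordpunct_tokenize needs no extra downloads

parts = [{"text": "book a table at "},
         {"text": "Le Bernardin", "entity": "restaurant_name"}]
words, slots = [], []
for part in parts:
    part_words = nltk.tokenize.wordpunct_tokenize(part["text"])
    entity = part.get("entity")
    if entity:
        slots += ["B-" + entity] + ["I-" + entity] * (len(part_words) - 1)
    else:
        slots += ["O"] * len(part_words)
    words += part_words

print(list(zip(words, slots)))
# [('book', 'O'), ('a', 'O'), ('table', 'O'), ('at', 'O'),
#  ('Le', 'B-restaurant_name'), ('Bernardin', 'I-restaurant_name')]
```
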
diff --git a/deeppavlov/dataset_readers/snips_reader.py b/deeppavlov/dataset_readers/snips_reader.py
deleted file mode 100644
index 7041df6aa7..0000000000
--- a/deeppavlov/dataset_readers/snips_reader.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright 2019 Alexey Romanov
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-from logging import getLogger
-from pathlib import Path
-from typing import List, Dict, Any, Optional
-
-from overrides import overrides
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.dataset_reader import DatasetReader
-from deeppavlov.core.data.utils import download_decompress, mark_done, is_done
-
-log = getLogger(__name__)
-
-
-@register('snips_reader')
-class SnipsReader(DatasetReader):
- """The class to download and read Snips NLU Benchmark dataset (custom intents section).
-
- See https://github.com/snipsco/nlu-benchmark.
- """
-
- # noinspection PyAttributeOutsideInit
- @overrides
- def read(self, data_path: str, queries_per_intent: Optional[int] = None, test_validate_split: float = 0.5,
- *args, **kwargs) -> \
- Dict[str, List[Dict[str, Any]]]:
- """
- Each query in the output has the following form:
- { 'intent': intent_name,
- 'data': [ { 'text': text, ('entity': slot_name)? } ]
- }
-
- Args:
- data_path: A path to a folder with dataset files.
- queries_per_intent: Number of queries to load for each intent. None to load all.
- If the requested number is greater than available in file, all queries are returned.
- test_validate_split: Proportion of `_validate` files to be used as test dataset (since Snips
- is split into training and validation sets without a separate test set).
- """
- data_path = Path(data_path)
- intents = ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
- 'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent']
-
- if not is_done(data_path):
- url = 'http://files.deeppavlov.ai/datasets/snips.tar.gz'
- log.info('[downloading data from {} to {}]'.format(url, data_path))
- download_decompress(url, data_path)
- mark_done(data_path)
-
- use_full_file = queries_per_intent is None or queries_per_intent > 70
- training_data = []
- validation_data = []
- test_data = []
-
- for intent in intents:
- intent_path = data_path / intent
- train_file_name = f"train_{intent}{'_full' if use_full_file else ''}.json"
- validate_file_name = f"validate_{intent}.json"
-
- train_queries = self._load_file(intent_path / train_file_name, intent, queries_per_intent)
- validate_queries = self._load_file(intent_path / validate_file_name, intent, queries_per_intent)
- num_test_queries = round(len(validate_queries) * test_validate_split)
-
- training_data.extend(train_queries)
- validation_data.extend(validate_queries[num_test_queries:])
- test_data.extend(validate_queries[:num_test_queries])
-
- return {'train': training_data, 'valid': validation_data, 'test': test_data}
-
- @staticmethod
- def _load_file(path: Path, intent: str, num_queries: Optional[int]):
- with path.open(encoding='latin_1') as f:
- data = json.load(f)
-
- # restrict number of queries
- queries = data[intent][:num_queries]
- for query in queries:
- query['intent'] = intent
- return queries
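
The split arithmetic of the removed reader in isolation (stand-in numbers): Snips ships only train and validate files, so each intent's validate file is divided between the test and validation sets.

```python
validate_queries = list(range(100))  # stand-in for queries loaded from a validate_*.json file
test_validate_split = 0.5            # the reader's default

num_test_queries = round(len(validate_queries) * test_validate_split)  # 50
test_data = validate_queries[:num_test_queries]   # first half goes to 'test'
valid_data = validate_queries[num_test_queries:]  # second half stays in 'valid'
```
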
diff --git a/deeppavlov/models/bert/bert_ranker.py b/deeppavlov/models/bert/bert_ranker.py
deleted file mode 100644
index c4d26be4ae..0000000000
--- a/deeppavlov/models/bert/bert_ranker.py
+++ /dev/null
@@ -1,467 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-from collections import OrderedDict
-from logging import getLogger
-from operator import itemgetter
-from typing import List, Dict, Union
-
-import numpy as np
-import tensorflow as tf
-from bert_dp.modeling import BertConfig, BertModel
-from bert_dp.optimization import AdamWeightDecayOptimizer
-from bert_dp.preprocessing import InputFeatures
-
-from deeppavlov.core.commands.utils import expand_path
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.models.tf_model import LRScheduledTFModel
-from deeppavlov.models.bert.bert_classifier import BertClassifierModel
-
-logger = getLogger(__name__)
-
-
-@register('bert_ranker')
-class BertRankerModel(BertClassifierModel):
- """BERT-based model for interaction-based text ranking.
-
- Linear transformation is trained over the BERT pooled output from [CLS] token.
- Predicted probabilities of classes are used as a similarity measure for ranking.
-
- Args:
- bert_config_file: path to Bert configuration file
- n_classes: number of classes
- keep_prob: dropout keep_prob for non-Bert layers
- return_probas: set True if class probabilities are returned instead of the most probable label
- """
-
- def __init__(self, bert_config_file, n_classes=2, keep_prob=0.9, return_probas=True, **kwargs) -> None:
- super().__init__(bert_config_file=bert_config_file, n_classes=n_classes,
- keep_prob=keep_prob, return_probas=return_probas, **kwargs)
-
- def train_on_batch(self, features_li: List[List[InputFeatures]], y: Union[List[int], List[List[int]]]) -> Dict:
- """Train the model on the given batch.
-
- Args:
- features_li: list with the single element containing the batch of InputFeatures
- y: batch of labels (class id or one-hot encoding)
-
- Returns:
- dict with loss and learning rate values
- """
-
- features = features_li[0]
- input_ids = [f.input_ids for f in features]
- input_masks = [f.input_mask for f in features]
- input_type_ids = [f.input_type_ids for f in features]
-
- feed_dict = self._build_feed_dict(input_ids, input_masks, input_type_ids, y)
-
- _, loss = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
- return {'loss': loss, 'learning_rate': feed_dict[self.learning_rate_ph]}
-
- def __call__(self, features_li: List[List[InputFeatures]]) -> Union[List[int], List[List[float]]]:
- """Calculate scores for the given context over candidate responses.
-
- Args:
- features_li: list of elements where each element contains the batch of features
- for contexts with particular response candidates
-
- Returns:
- predicted scores for contexts over response candidates
- """
-
- if len(features_li) == 1 and len(features_li[0]) == 1:
- msg = "It is not intended to use the {} in the interact mode.".format(self.__class__)
- logger.error(msg)
- return [msg]
-
- predictions = []
- for features in features_li:
- input_ids = [f.input_ids for f in features]
- input_masks = [f.input_mask for f in features]
- input_type_ids = [f.input_type_ids for f in features]
-
- feed_dict = self._build_feed_dict(input_ids, input_masks, input_type_ids)
- if not self.return_probas:
- pred = self.sess.run(self.y_predictions, feed_dict=feed_dict)
- else:
- pred = self.sess.run(self.y_probas, feed_dict=feed_dict)
- predictions.append(pred[:, 1])
- if len(features_li) == 1:
- predictions = predictions[0]
- else:
- predictions = np.hstack([np.expand_dims(el, 1) for el in predictions])
- return predictions
-
-
-@register('bert_sep_ranker')
-class BertSepRankerModel(LRScheduledTFModel):
- """BERT-based model for representation-based text ranking.
-
- BERT pooled output from [CLS] token is used to get a separate representation of a context and a response.
- Similarity measure is calculated as cosine similarity between these representations.
-
- Args:
- bert_config_file: path to Bert configuration file
- keep_prob: dropout keep_prob for non-Bert layers
- attention_probs_keep_prob: keep_prob for Bert self-attention layers
- hidden_keep_prob: keep_prob for Bert hidden layers
- optimizer: name of tf.train.* optimizer or None for ``AdamWeightDecayOptimizer``
- weight_decay_rate: L2 weight decay for ``AdamWeightDecayOptimizer``
- pretrained_bert: pretrained Bert checkpoint
- min_learning_rate: min value of learning rate if learning rate decay is used
- """
-
- def __init__(self, bert_config_file, keep_prob=0.9,
- attention_probs_keep_prob=None, hidden_keep_prob=None,
- optimizer=None, weight_decay_rate=0.01,
- pretrained_bert=None, min_learning_rate=1e-06, **kwargs) -> None:
- super().__init__(**kwargs)
-
- self.min_learning_rate = min_learning_rate
- self.keep_prob = keep_prob
- self.optimizer = optimizer
- self.weight_decay_rate = weight_decay_rate
-
- self.bert_config = BertConfig.from_json_file(str(expand_path(bert_config_file)))
-
- if attention_probs_keep_prob is not None:
- self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
- if hidden_keep_prob is not None:
- self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob
-
- self.sess_config = tf.ConfigProto(allow_soft_placement=True)
- self.sess_config.gpu_options.allow_growth = True
- self.sess = tf.Session(config=self.sess_config)
-
- self._init_graph()
-
- self._init_optimizer()
-
- if pretrained_bert is not None:
- pretrained_bert = str(expand_path(pretrained_bert))
-
- if tf.train.checkpoint_exists(pretrained_bert) \
- and not (self.load_path and tf.train.checkpoint_exists(str(self.load_path.resolve()))):
- logger.info('[initializing model with Bert from {}]'.format(pretrained_bert))
- # Exclude optimizer and classification variables from saved variables
- var_list = self._get_saveable_variables(
- exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'output_weights', 'output_bias'))
- assignment_map = self.get_variables_to_restore(var_list, pretrained_bert)
- tf.train.init_from_checkpoint(pretrained_bert, assignment_map)
-
- self.sess.run(tf.global_variables_initializer())
-
- if self.load_path is not None:
- self.load()
-
- @classmethod
- def get_variables_to_restore(cls, tvars, init_checkpoint):
- """Determine correspondence of checkpoint variables to current variables."""
-
- assignment_map = OrderedDict()
- graph_names = []
- for var in tvars:
- name = var.name
- m = re.match("^(.*):\\d+$", name)
- if m is not None:
- name = m.group(1)
- graph_names.append(name)
- ckpt_names = [el[0] for el in tf.train.list_variables(init_checkpoint)]
- for u in ckpt_names:
- for v in graph_names:
- if u in v:
- assignment_map[u] = v
- return assignment_map
-
- def _init_graph(self):
- self._init_placeholders()
-
- with tf.variable_scope("model"):
- model_a = BertModel(
- config=self.bert_config,
- is_training=self.is_train_ph,
- input_ids=self.input_ids_a_ph,
- input_mask=self.input_masks_a_ph,
- token_type_ids=self.token_types_a_ph,
- use_one_hot_embeddings=False)
-
- with tf.variable_scope("model", reuse=True):
- model_b = BertModel(
- config=self.bert_config,
- is_training=self.is_train_ph,
- input_ids=self.input_ids_b_ph,
- input_mask=self.input_masks_b_ph,
- token_type_ids=self.token_types_b_ph,
- use_one_hot_embeddings=False)
-
- output_layer_a = model_a.get_pooled_output()
- output_layer_b = model_b.get_pooled_output()
-
- with tf.variable_scope("loss"):
- output_layer_a = tf.nn.dropout(output_layer_a, keep_prob=self.keep_prob_ph)
- output_layer_b = tf.nn.dropout(output_layer_b, keep_prob=self.keep_prob_ph)
- output_layer_a = tf.nn.l2_normalize(output_layer_a, axis=1)
- output_layer_b = tf.nn.l2_normalize(output_layer_b, axis=1)
- embeddings = tf.concat([output_layer_a, output_layer_b], axis=0)
- labels = tf.concat([self.y_ph, self.y_ph], axis=0)
- self.loss = tf.contrib.losses.metric_learning.triplet_semihard_loss(labels, embeddings)
- logits = tf.multiply(output_layer_a, output_layer_b)
- self.y_probas = tf.reduce_sum(logits, 1)
- self.pooled_out = output_layer_a
-
- def _init_placeholders(self):
- self.input_ids_a_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='ids_a_ph')
- self.input_masks_a_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='masks_a_ph')
- self.token_types_a_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='token_a_types_ph')
- self.input_ids_b_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='ids_b_ph')
- self.input_masks_b_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='masks_b_ph')
- self.token_types_b_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='token_types_b_ph')
- self.y_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='y_ph')
- self.learning_rate_ph = tf.placeholder_with_default(0.0, shape=[], name='learning_rate_ph')
- self.keep_prob_ph = tf.placeholder_with_default(1.0, shape=[], name='keep_prob_ph')
- self.is_train_ph = tf.placeholder_with_default(False, shape=[], name='is_train_ph')
-
- def _init_optimizer(self):
- with tf.variable_scope('Optimizer'):
- self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32,
- initializer=tf.constant_initializer(0), trainable=False)
- # default optimizer for Bert is Adam with fixed L2 regularization
- if self.optimizer is None:
-
- self.train_op = self.get_train_op(self.loss, learning_rate=self.learning_rate_ph,
- optimizer=AdamWeightDecayOptimizer,
- weight_decay_rate=self.weight_decay_rate,
- beta_1=0.9,
- beta_2=0.999,
- epsilon=1e-6,
- exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]
- )
- else:
- self.train_op = self.get_train_op(self.loss, learning_rate=self.learning_rate_ph)
-
- if self.optimizer is None:
- new_global_step = self.global_step + 1
- self.train_op = tf.group(self.train_op, [self.global_step.assign(new_global_step)])
-
- def _build_feed_dict(self, input_ids_a, input_masks_a, token_types_a,
- input_ids_b, input_masks_b, token_types_b, y=None):
- feed_dict = {
- self.input_ids_a_ph: input_ids_a,
- self.input_masks_a_ph: input_masks_a,
- self.token_types_a_ph: token_types_a,
- self.input_ids_b_ph: input_ids_b,
- self.input_masks_b_ph: input_masks_b,
- self.token_types_b_ph: token_types_b,
- }
- if y is not None:
- feed_dict.update({
- self.y_ph: y,
- self.learning_rate_ph: max(self.get_learning_rate(), self.min_learning_rate),
- self.keep_prob_ph: self.keep_prob,
- self.is_train_ph: True,
- })
-
- return feed_dict
-
- def train_on_batch(self, features_li: List[List[InputFeatures]], y: Union[List[int], List[List[int]]]) -> Dict:
- """Train the model on the given batch.
-
- Args:
- features_li: list with two elements, one containing the batch of context features
- and the other containing the batch of response features
- y: batch of labels (class id or one-hot encoding)
-
- Returns:
- dict with loss and learning rate values
- """
-
- input_ids_a = [f.input_ids for f in features_li[0]]
- input_masks_a = [f.input_mask for f in features_li[0]]
- input_type_ids_a = [f.input_type_ids for f in features_li[0]]
- input_ids_b = [f.input_ids for f in features_li[1]]
- input_masks_b = [f.input_mask for f in features_li[1]]
- input_type_ids_b = [f.input_type_ids for f in features_li[1]]
-
- feed_dict = self._build_feed_dict(input_ids_a, input_masks_a, input_type_ids_a,
- input_ids_b, input_masks_b, input_type_ids_b, y)
-
- _, loss = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
- return {'loss': loss, 'learning_rate': feed_dict[self.learning_rate_ph]}
-
- def __call__(self, features_li: List[List[InputFeatures]]) -> Union[List[int], List[List[float]]]:
- """Calculate scores for the given context over candidate responses.
-
- Args:
- features_li: list of elements where the first element represents the context batch of features
- and the rest of elements represent response candidates batches of features
-
- Returns:
- predicted scores for contexts over response candidates
- """
-
- if len(features_li) == 1 and len(features_li[0]) == 1:
- msg = "It is not intended to use the {} in the interact mode.".format(self.__class__)
- logger.error(msg)
- return [msg]
-
- predictions = []
- input_ids_a = [f.input_ids for f in features_li[0]]
- input_masks_a = [f.input_mask for f in features_li[0]]
- input_type_ids_a = [f.input_type_ids for f in features_li[0]]
- for features in features_li[1:]:
- input_ids_b = [f.input_ids for f in features]
- input_masks_b = [f.input_mask for f in features]
- input_type_ids_b = [f.input_type_ids for f in features]
-
- feed_dict = self._build_feed_dict(input_ids_a, input_masks_a, input_type_ids_a,
- input_ids_b, input_masks_b, input_type_ids_b)
- pred = self.sess.run(self.y_probas, feed_dict=feed_dict)
- predictions.append(pred)
- if len(features_li) == 1:
- predictions = predictions[0]
- else:
- predictions = np.hstack([np.expand_dims(el, 1) for el in predictions])
- return predictions
-
-
-@register('bert_sep_ranker_predictor')
-class BertSepRankerPredictor(BertSepRankerModel):
-    """BERT-based model for ranking and retrieving a text response.
-
- BERT pooled output from [CLS] token is used to get a separate representation of a context and a response.
- A similarity score is calculated as cosine similarity between these representations.
- Based on this similarity score the text response is retrieved provided some base
- with possible responses (and corresponding contexts).
-    Response contexts are additionally used to get the best possible retrieval result from the base.
-
- Args:
- bert_config_file: path to Bert configuration file
- interact_mode: mode setting a policy to retrieve the response from the base
- batch_size: batch size for building response (and context) vectors over the base
- keep_prob: dropout keep_prob for non-Bert layers
- resps: list of strings containing the base of text responses
-        resp_vecs: BERT vector representations of ``resps``; if ``None``, they will be built
- resp_features: features of ``resps`` to build their BERT vector representations
- conts: list of strings containing the base of text contexts
-        cont_vecs: BERT vector representations of ``conts``; if ``None``, they will be built
- cont_features: features of ``conts`` to build their BERT vector representations
- """
-
- def __init__(self, bert_config_file, interact_mode=0, batch_size=32,
- resps=None, resp_features=None, resp_vecs=None,
- conts=None, cont_features=None, cont_vecs=None, **kwargs) -> None:
- super().__init__(bert_config_file=bert_config_file,
- **kwargs)
-
- self.interact_mode = interact_mode
- self.batch_size = batch_size
- self.resps = resps
- self.resp_vecs = resp_vecs
- self.resp_features = resp_features
- self.conts = conts
- self.cont_vecs = cont_vecs
- self.cont_features = cont_features
-
- if self.resps is not None and self.resp_vecs is None:
- logger.info("Building BERT vector representations for the response base...")
- self.resp_features = [resp_features[0][i * self.batch_size: (i + 1) * self.batch_size]
- for i in range(len(resp_features[0]) // batch_size + 1)]
- self.resp_vecs = self._get_predictions(self.resp_features)
- self.resp_vecs /= np.linalg.norm(self.resp_vecs, axis=1, keepdims=True)
- np.save(self.save_path / "resp_vecs", self.resp_vecs)
-
- if self.conts is not None and self.cont_vecs is None:
- logger.info("Building BERT vector representations for the context base...")
- self.cont_features = [cont_features[0][i * self.batch_size: (i + 1) * self.batch_size]
- for i in range(len(cont_features[0]) // batch_size + 1)]
- self.cont_vecs = self._get_predictions(self.cont_features)
- self.cont_vecs /= np.linalg.norm(self.cont_vecs, axis=1, keepdims=True)
-            np.save(self.save_path / "cont_vecs", self.cont_vecs)
-
- def train_on_batch(self, features, y):
- pass
-
- def __call__(self, features_li):
- """Get the context vector representation and retrieve the text response from the database.
-
- Uses cosine similarity scores over vectors of responses (and corresponding contexts) from the base.
- Based on these scores retrieves the text response from the base.
-
- Args:
- features_li: list of elements where elements represent context batches of features
-
- Returns:
- text response with the highest similarity score and its similarity score from the response base
- """
-
- pred = self._get_predictions(features_li)
- return self._retrieve_db_response(pred)
-
- def _get_predictions(self, features_li):
- """Get BERT vector representations for a list of feature batches."""
-
- pred = []
- for features in features_li:
- input_ids = [f.input_ids for f in features]
- input_masks = [f.input_mask for f in features]
- input_type_ids = [f.input_type_ids for f in features]
- feed_dict = self._build_feed_dict(input_ids, input_masks, input_type_ids,
- input_ids, input_masks, input_type_ids)
- p = self.sess.run(self.pooled_out, feed_dict=feed_dict)
- if len(p.shape) == 1:
- p = np.expand_dims(p, 0)
- p /= np.linalg.norm(p, axis=1, keepdims=True)
- pred.append(p)
- return np.vstack(pred)
-
- def _retrieve_db_response(self, ctx_vec):
- """Retrieve a text response from the base based on the policy determined by ``interact_mode``.
-
- Uses cosine similarity scores over vectors of responses (and corresponding contexts) from the base.
- """
-
- bs = ctx_vec.shape[0]
- if self.interact_mode == 0:
- s = ctx_vec @ self.resp_vecs.T
- ids = np.argmax(s, 1)
- rsp = [[self.resps[ids[i]] for i in range(bs)], [s[i][ids[i]] for i in range(bs)]]
- if self.interact_mode == 1:
- sr = (ctx_vec @ self.resp_vecs.T + 1) / 2
- sc = (ctx_vec @ self.cont_vecs.T + 1) / 2
- ids = np.argsort(sr, 1)[:, -10:]
- sc = [sc[i, ids[i]] for i in range(bs)]
- ids = [sorted(zip(ids[i], sc[i]), key=itemgetter(1), reverse=True) for i in range(bs)]
- sc = [list(map(lambda x: x[1], ids[i])) for i in range(bs)]
- ids = [list(map(lambda x: x[0], ids[i])) for i in range(bs)]
- rsp = [[self.resps[ids[i][0]] for i in range(bs)], [float(sc[i][0]) for i in range(bs)]]
- if self.interact_mode == 2:
- sr = (ctx_vec @ self.resp_vecs.T + 1) / 2
- sc = (ctx_vec @ self.cont_vecs.T + 1) / 2
- ids = np.argsort(sc, 1)[:, -10:]
- sr = [sr[i, ids[i]] for i in range(bs)]
- ids = [sorted(zip(ids[i], sr[i]), key=itemgetter(1), reverse=True) for i in range(bs)]
- sr = [list(map(lambda x: x[1], ids[i])) for i in range(bs)]
- ids = [list(map(lambda x: x[0], ids[i])) for i in range(bs)]
- rsp = [[self.resps[ids[i][0]] for i in range(bs)], [float(sr[i][0]) for i in range(bs)]]
- if self.interact_mode == 3:
- sr = (ctx_vec @ self.resp_vecs.T + 1) / 2
- sc = (ctx_vec @ self.cont_vecs.T + 1) / 2
- s = (sr + sc) / 2
- ids = np.argmax(s, 1)
- rsp = [[self.resps[ids[i]] for i in range(bs)], [float(s[i][ids[i]]) for i in range(bs)]]
-        # remove special tokens if they are present
- rsp = [[el.replace('__eou__', '').replace('__eot__', '').strip() for el in rsp[0]], rsp[1]]
- return rsp
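
A minimal numpy sketch of the `interact_mode == 0` branch of `_retrieve_db_response` (hypothetical vectors, independent of the class above): with L2-normalized vectors the dot product is cosine similarity, and the argmax response is returned together with its score.

```python
import numpy as np

rng = np.random.default_rng(0)
resps = ["hi there", "see you later", "no idea"]

resp_vecs = rng.normal(size=(len(resps), 8))  # stand-in BERT vectors
resp_vecs /= np.linalg.norm(resp_vecs, axis=1, keepdims=True)

ctx_vec = rng.normal(size=(1, 8))
ctx_vec /= np.linalg.norm(ctx_vec, axis=1, keepdims=True)

s = ctx_vec @ resp_vecs.T  # cosine similarities, shape (1, 3)
ids = np.argmax(s, axis=1)
print(resps[ids[0]], float(s[0, ids[0]]))
```
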
diff --git a/deeppavlov/models/classifiers/ru_obscenity_classifier.py b/deeppavlov/models/classifiers/ru_obscenity_classifier.py
deleted file mode 100644
index 6c17ae2ae8..0000000000
--- a/deeppavlov/models/classifiers/ru_obscenity_classifier.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import re
-from logging import getLogger
-from pathlib import Path
-from typing import List, Union
-
-import pymorphy2
-
-from deeppavlov.core.commands.utils import expand_path
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.models.estimator import Component
-
-log = getLogger(__name__)
-
-
-@register("ru_obscenity_classifier")
-class RuObscenityClassifier(Component):
-    """Rule-based model that decides whether a sentence is obscene or not,
-    for the Russian language
-
- Args:
- data_path: a directory where the required files are stored.
-            The following files are required:
-            -'obscenity_words.json' — a list of obscene words
-            -'obscenity_words_exception.json' — a list of words that are not obscene,
-            but that the algorithm detects as obscene (used to fix such cases)
-            -'obscenity_words_extended.json' — a list of obscene words,
-            to which the user can add additional ones
-
- Attributes:
-        obscenity_words: list of Russian obscene words
-        obscenity_words_extended: user-extended list of Russian obscene words
-        obscenity_words_exception: list of words that the model mistakenly detects as obscene
-        regexp: regular expression that finds various obscene words
-        regexp2: regular expression that finds various obscene words
-        morph: pymorphy2.MorphAnalyzer object
-        word_pattern: regular expression that finds words in a text
- """
-
- def _get_patterns(self):
- PATTERN_1 = r''.join((
- r'\w{0,5}[хx]([хx\s\!@#\$%\^&*+-\|\/]{0,6})',
- r'[уy]([уy\s\!@#\$%\^&*+-\|\/]{0,6})[ёiлeеюийя]\w{0,7}|\w{0,6}[пp]',
- r'([пp\s\!@#\$%\^&*+-\|\/]{0,6})[iие]([iие\s\!@#\$%\^&*+-\|\/]{0,6})',
- r'[3зс]([3зс\s\!@#\$%\^&*+-\|\/]{0,6})[дd]\w{0,10}|[сcs][уy]',
- r'([уy\!@#\$%\^&*+-\|\/]{0,6})[4чkк]\w{1,3}|\w{0,4}[bб]',
- r'([bб\s\!@#\$%\^&*+-\|\/]{0,6})[lл]([lл\s\!@#\$%\^&*+-\|\/]{0,6})',
- r'[yя]\w{0,10}|\w{0,8}[её][bб][лске@eыиаa][наи@йвл]\w{0,8}|\w{0,4}[еe]',
- r'([еe\s\!@#\$%\^&*+-\|\/]{0,6})[бb]([бb\s\!@#\$%\^&*+-\|\/]{0,6})',
- r'[uу]([uу\s\!@#\$%\^&*+-\|\/]{0,6})[н4ч]\w{0,4}|\w{0,4}[еeё]',
- r'([еeё\s\!@#\$%\^&*+-\|\/]{0,6})[бb]([бb\s\!@#\$%\^&*+-\|\/]{0,6})',
- r'[нn]([нn\s\!@#\$%\^&*+-\|\/]{0,6})[уy]\w{0,4}|\w{0,4}[еe]',
- r'([еe\s\!@#\$%\^&*+-\|\/]{0,6})[бb]([бb\s\!@#\$%\^&*+-\|\/]{0,6})',
- r'[оoаa@]([оoаa@\s\!@#\$%\^&*+-\|\/]{0,6})[тnнt]\w{0,4}|\w{0,10}[ё]',
- r'([ё\!@#\$%\^&*+-\|\/]{0,6})[б]\w{0,6}|\w{0,4}[pп]',
- r'([pп\s\!@#\$%\^&*+-\|\/]{0,6})[иeеi]([иeеi\s\!@#\$%\^&*+-\|\/]{0,6})',
- r'[дd]([дd\s\!@#\$%\^&*+-\|\/]{0,6})[oоаa@еeиi]',
- r'([oоаa@еeиi\s\!@#\$%\^&*+-\|\/]{0,6})[рr]\w{0,12}',
- ))
-
- PATTERN_2 = r'|'.join((
- r"(\b[сs]{1}[сsц]{0,1}[uуy](?:[ч4]{0,1}[иаakк][^ц])\w*\b)",
- r"(\b(?!пло|стра|[тл]и)(\w(?!(у|пло)))*[хx][уy](й|йа|[еeё]|и|я|ли|ю)(?!га)\w*\b)",
- r"(\b(п[oо]|[нз][аa])*[хx][eе][рp]\w*\b)",
- r"(\b[мm][уy][дd]([аa][кk]|[oо]|и)\w*\b)",
- r"(\b\w*д[рp](?:[oо][ч4]|[аa][ч4])(?!л)\w*\b)",
-            r"(\b(?!(?:кило)?[тм]ет)(?!смо)[а-яa-z]*(?",
-        ))
-        return PATTERN_1, PATTERN_2
-
-    def __init__(self, data_path: Union[str, Path]) -> None:
- log.info(f"Initializing `{self.__class__.__name__}`")
-
- data_path = expand_path(data_path)
- with open(data_path / 'obscenity_words.json', encoding="utf-8") as f:
- self.obscenity_words = set(json.load(f))
- with open(data_path / 'obscenity_words_exception.json', encoding="utf-8") as f:
- self.obscenity_words_exception = set(json.load(f))
- if (data_path / 'obscenity_words_extended.json').exists():
- with open(data_path / 'obscenity_words_extended.json', encoding="utf-8") as f:
- self.obscenity_words_extended = set(json.load(f))
- self.obscenity_words.update(self.obscenity_words_extended)
-
- PATTERN_1, PATTERN_2 = self._get_patterns()
- self.regexp = re.compile(PATTERN_1, re.U | re.I)
- self.regexp2 = re.compile(PATTERN_2, re.U | re.I)
- self.morph = pymorphy2.MorphAnalyzer()
- self.word_pattern = re.compile(r'[А-яЁё]+')
-
- def _check_obscenity(self, text: str) -> bool:
- for word in self.word_pattern.findall(text):
- if len(word) < 3:
- continue
- word = word.lower()
-            word = word.replace('ё', 'е')
- normal_word = self.morph.parse(word)[0].normal_form
- if normal_word in self.obscenity_words_exception \
- or word in self.obscenity_words_exception:
- continue
- if normal_word in self.obscenity_words \
- or word in self.obscenity_words \
- or bool(self.regexp.findall(normal_word)) \
- or bool(self.regexp.findall(word)) \
- or bool(self.regexp2.findall(normal_word)) \
- or bool(self.regexp2.findall(word)):
- return True
- return False
-
- def __call__(self, texts: List[str]) -> List[bool]:
-        """Decide whether each text is obscene or not
-
-        Args:
-            texts: list of texts for which to decide whether they are obscene
-
-        Returns:
-            list of bool: True for an obscene text, False otherwise
- """
- decisions = list(map(self._check_obscenity, texts))
- return decisions
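
A usage sketch for the removed component, assuming the required JSON word lists are available under a local `./ru_obscenity_data` directory (hypothetical path):

```python
clf = RuObscenityClassifier(data_path='./ru_obscenity_data')
print(clf(['совершенно нейтральное предложение']))  # -> [False]
```
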
diff --git a/deeppavlov/models/doc_retrieval/pop_ranker.py b/deeppavlov/models/doc_retrieval/pop_ranker.py
index 280805dc48..f27938e811 100644
--- a/deeppavlov/models/doc_retrieval/pop_ranker.py
+++ b/deeppavlov/models/doc_retrieval/pop_ranker.py
@@ -17,7 +17,7 @@
from typing import List, Any, Tuple
import numpy as np
-from sklearn.externals import joblib
+import joblib
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import read_json
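
`sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23, hence the switch to the standalone `joblib` package; loading a pickled model is otherwise unchanged:

```python
import joblib  # pip install joblib

model = joblib.load('pop_ranker.pkl')  # hypothetical path to a pickled ranker
```
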
diff --git a/deeppavlov/models/embedders/elmo_embedder.py b/deeppavlov/models/embedders/elmo_embedder.py
deleted file mode 100644
index 09990ce648..0000000000
--- a/deeppavlov/models/embedders/elmo_embedder.py
+++ /dev/null
@@ -1,314 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-from logging import getLogger
-from typing import Iterator, List, Union, Optional
-
-import numpy as np
-import tensorflow as tf
-import tensorflow_hub as hub
-from overrides import overrides
-
-from deeppavlov.core.commands.utils import expand_path
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.utils import zero_pad, chunk_generator
-from deeppavlov.core.models.component import Component
-from deeppavlov.core.models.tf_backend import TfModelMeta
-
-log = getLogger(__name__)
-
-
-@register('elmo_embedder')
-class ELMoEmbedder(Component, metaclass=TfModelMeta):
- """
- ``ELMo`` (Embeddings from Language Models) representations are pre-trained contextual representations from
-    large-scale bidirectional language models. See the paper `Deep contextualized word representations
-    <https://arxiv.org/abs/1802.05365>`__ for more information about the algorithm and a detailed analysis.
-
- Parameters:
- spec: A ``ModuleSpec`` defining the Module to instantiate or a path where to load a ``ModuleSpec`` from via
-            ``tensorflow_hub.load_module_spec`` by using `TensorFlow Hub <https://www.tensorflow.org/hub>`__.
-        elmo_output_names: A list of ELMo outputs to return. You can use a combination of
- ``["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"]`` and you can use separately ``["default"]``.
-
- Where,
-
- * ``word_emb`` - CNN embedding (default dim 512)
-            * ``lstm_outputs*`` - outputs of lstm (default dim 1024)
- * ``elmo`` - weighted sum of cnn and lstm outputs (default dim 1024)
- * ``default`` - mean ``elmo`` vector for sentence (default dim 1024)
-
-            See `TensorFlow Hub <https://www.tensorflow.org/hub>`__ for more information about it.
- dim: Can be used for output embeddings dimensionality reduction if elmo_output_names != ['default']
- pad_zero: Whether to use pad samples or not.
- concat_last_axis: A boolean that enables/disables last axis concatenation. It is not used for
- ``elmo_output_names = ["default"]``.
-        max_token: The maximum number of tokens per batch line.
- mini_batch_size: It is used to reduce the memory requirements of the device.
-
-
- If some required packages are missing, install all the requirements by running in command line:
-
- .. code:: bash
-
-        python -m deeppavlov install <config_path>
-
-    where ``<config_path>`` is a path to one of the :config:`provided config files `
-    or its name without an extension, for example:
-
- .. code:: bash
-
- python -m deeppavlov install elmo_ru-news
-
- Examples:
- >>> from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
- >>> elmo = ELMoEmbedder("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz")
- >>> elmo([['вопрос', 'жизни', 'Вселенной', 'и', 'вообще', 'всего'], ['42']])
- array([[ 0.00719104, 0.08544601, -0.07179783, ..., 0.10879009,
- -0.18630421, -0.2189409 ],
- [ 0.16325025, -0.04736076, 0.12354863, ..., -0.1889013 ,
- 0.04972512, 0.83029324]], dtype=float32)
-
-    You can use ELMo models from DeepPavlov as a usual `TensorFlow Hub Module
-    <https://www.tensorflow.org/hub>`_.
-
- >>> import tensorflow as tf
- >>> import tensorflow_hub as hub
- >>> elmo = hub.Module("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz",
- trainable=True)
- >>> sess = tf.Session()
- >>> sess.run(tf.global_variables_initializer())
- >>> embeddings = elmo(["это предложение", "word"], signature="default", as_dict=True)["elmo"]
- >>> sess.run(embeddings)
- array([[[ 0.05817392, 0.22493343, -0.19202903, ..., -0.14448944,
- -0.12425567, 1.0148407 ],
- [ 0.53596294, 0.2868537 , 0.28028542, ..., -0.08028372,
- 0.49089077, 0.75939953]],
- [[ 0.3433637 , 1.0031182 , -0.1597258 , ..., 1.2442509 ,
- 0.61029315, 0.43388373],
- [ 0.05370751, 0.02260921, 0.01074906, ..., 0.08748816,
- -0.0066415 , -0.01344293]]], dtype=float32)
-
- TensorFlow Hub module also supports tokenized sentences in the following format.
-
- >>> tokens_input = [["мама", "мыла", "раму"], ["рама", "", ""]]
- >>> tokens_length = [3, 1]
- >>> embeddings = elmo(
- inputs={
- "tokens": tokens_input,
- "sequence_len": tokens_length
- },
- signature="tokens",
- as_dict=True)["elmo"]
- >>> sess.run(embeddings)
- array([[[ 0.6040001 , -0.16130011, 0.56478846, ..., -0.00376141,
- -0.03820051, 0.26321286],
- [ 0.01834148, 0.17055789, 0.5311495 , ..., -0.5675535 ,
- 0.62669843, -0.05939034],
- [ 0.3242596 , 0.17909613, 0.01657108, ..., 0.1866098 ,
- 0.7392496 , 0.08285746]],
- [[ 1.1322289 , 0.19077688, -0.17811403, ..., 0.42973226,
- 0.23391506, -0.01294377],
- [ 0.05370751, 0.02260921, 0.01074906, ..., 0.08748816,
- -0.0066415 , -0.01344293],
- [ 0.05370751, 0.02260921, 0.01074906, ..., 0.08748816,
- -0.0066415 , -0.01344293]]], dtype=float32)
-
-    You can also get ``hub.text_embedding_column``, as described in the TensorFlow Hub
-    documentation.
-
-
- """
-
- def __init__(self, spec: str, elmo_output_names: Optional[List] = None,
- dim: Optional[int] = None, pad_zero: bool = False,
- concat_last_axis: bool = True, max_token: Optional[int] = None,
- mini_batch_size: int = 32, **kwargs) -> None:
-
- self.spec = spec if '://' in spec else str(expand_path(spec))
-
- self.elmo_output_dims = {'word_emb': 512,
- 'lstm_outputs1': 1024,
- 'lstm_outputs2': 1024,
- 'elmo': 1024,
- 'default': 1024}
- elmo_output_names = elmo_output_names or ['default']
- self.elmo_output_names = elmo_output_names
- elmo_output_names_set = set(self.elmo_output_names)
- if elmo_output_names_set - set(self.elmo_output_dims.keys()):
- log.error(f'Incorrect elmo_output_names = {elmo_output_names} . You can use either ["default"] or some of'
- '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"]')
- sys.exit(1)
-
- if elmo_output_names_set - {'default'} and elmo_output_names_set - {"word_emb", "lstm_outputs1",
- "lstm_outputs2", "elmo"}:
- log.error('Incompatible conditions: you can use either ["default"] or list of '
- '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"] ')
- sys.exit(1)
-
- self.pad_zero = pad_zero
- self.concat_last_axis = concat_last_axis
- self.max_token = max_token
- self.mini_batch_size = mini_batch_size
- self.elmo_outputs, self.sess, self.tokens_ph, self.tokens_length_ph = self._load()
- self.dim = self._get_dims(self.elmo_output_names, dim, concat_last_axis)
-
- def _get_dims(self, elmo_output_names, in_dim, concat_last_axis):
- dims = [self.elmo_output_dims[elmo_output_name] for elmo_output_name in elmo_output_names]
- if concat_last_axis:
- dims = in_dim if in_dim else sum(dims)
- else:
- if in_dim:
- log.warning(f"[ dim = {in_dim} is not used, because the elmo_output_names has more than one element.]")
- return dims
-
- def _load(self):
- """
-        Load an ELMo TensorFlow Hub Module from ``self.spec``.
-
- Returns:
-            ELMo pre-trained model wrapped in a TensorFlow Hub Module.
- """
- elmo_module = hub.Module(self.spec, trainable=False)
-
- sess_config = tf.ConfigProto()
- sess_config.gpu_options.allow_growth = True
- sess = tf.Session(config=sess_config)
-
- tokens_ph = tf.placeholder(shape=(None, None), dtype=tf.string, name='tokens')
- tokens_length_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='tokens_length')
-
- elmo_outputs = elmo_module(inputs={"tokens": tokens_ph,
- "sequence_len": tokens_length_ph},
- signature="tokens",
- as_dict=True)
-
- sess.run(tf.global_variables_initializer())
-
- return elmo_outputs, sess, tokens_ph, tokens_length_ph
-
- def _fill_batch(self, batch):
- """
-        Fill the batch with correct values.
-
- Args:
- batch: A list of tokenized text samples.
-
- Returns:
-            A padded batch of tokenized text samples and a list of their token counts.
- """
-
- if not batch:
- empty_vec = np.zeros(self.dim, dtype=np.float32)
- return [empty_vec] if 'default' in self.elmo_output_names else [[empty_vec]]
-
- filled_batch = []
- for batch_line in batch:
- batch_line = batch_line if batch_line else ['']
- filled_batch.append(batch_line)
-
- batch = filled_batch
-
- if self.max_token:
- batch = [batch_line[:self.max_token] for batch_line in batch]
- tokens_length = [len(batch_line) for batch_line in batch]
- tokens_length_max = max(tokens_length)
- batch = [batch_line + [''] * (tokens_length_max - len(batch_line)) for batch_line in batch]
-
- return batch, tokens_length
-
- def _mini_batch_fit(self, batch: List[List[str]], *args, **kwargs) -> Union[List[np.ndarray], np.ndarray]:
- """
- Embed sentences from a batch.
-
- Args:
- batch: A list of tokenized text samples.
-
- Returns:
- A batch of ELMo embeddings.
- """
- batch, tokens_length = self._fill_batch(batch)
-
- elmo_outputs = self.sess.run(self.elmo_outputs,
- feed_dict={self.tokens_ph: batch,
- self.tokens_length_ph: tokens_length})
-
- if 'default' in self.elmo_output_names:
- elmo_output_values = elmo_outputs['default']
- dim0, dim1 = elmo_output_values.shape
- if self.dim != dim1:
- shape = (dim0, self.dim if isinstance(self.dim, int) else self.dim[0])
- elmo_output_values = np.resize(elmo_output_values, shape)
- else:
- elmo_output_values = [elmo_outputs[elmo_output_name] for elmo_output_name in self.elmo_output_names]
- elmo_output_values = np.concatenate(elmo_output_values, axis=-1)
-
- dim0, dim1, dim2 = elmo_output_values.shape
- if self.concat_last_axis and self.dim != dim2:
- shape = (dim0, dim1, self.dim)
- elmo_output_values = np.resize(elmo_output_values, shape)
-
- elmo_output_values = [elmo_output_values_line[:length_line]
- for length_line, elmo_output_values_line in zip(tokens_length, elmo_output_values)]
-
- if not self.concat_last_axis:
- slice_indexes = np.cumsum(self.dim).tolist()[:-1]
- elmo_output_values = [[np.array_split(vec, slice_indexes) for vec in tokens]
- for tokens in elmo_output_values]
-
- return elmo_output_values
-
- @overrides
- def __call__(self, batch: List[List[str]],
- *args, **kwargs) -> Union[List[np.ndarray], np.ndarray]:
- """
- Embed sentences from a batch.
-
- Args:
- batch: A list of tokenized text samples.
-
- Returns:
- A batch of ELMo embeddings.
- """
- if len(batch) > self.mini_batch_size:
- batch_gen = chunk_generator(batch, self.mini_batch_size)
- elmo_output_values = []
- for mini_batch in batch_gen:
- mini_batch_out = self._mini_batch_fit(mini_batch, *args, **kwargs)
- elmo_output_values.extend(mini_batch_out)
- else:
- elmo_output_values = self._mini_batch_fit(batch, *args, **kwargs)
-
- if self.pad_zero:
- elmo_output_values = zero_pad(elmo_output_values)
-
- return elmo_output_values
-
- def __iter__(self) -> Iterator:
- """
-        Iterate over all words from an ELMo model vocabulary.
-        The ELMo model vocabulary consists of ``['<S>', '</S>', '<UNK>']``.
-
-        Returns:
-            An iterator of three elements ``['<S>', '</S>', '<UNK>']``.
-        """
-
-        yield from ['<S>', '</S>', '<UNK>']
-
- def destroy(self):
- if hasattr(self, 'sess'):
- for k in list(self.sess.graph.get_all_collection_keys()):
- self.sess.graph.clear_collection(k)
- super().destroy()
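
The mini-batching in `__call__` above is a plain chunk-and-concatenate loop; a generic, self-contained sketch of the same idea, with a stand-in embedding function in place of the TF session call:

```python
from itertools import islice

def chunk_generator(items, chunk_size):
    """Yield successive chunks of at most chunk_size items."""
    it = iter(items)
    while chunk := list(islice(it, chunk_size)):
        yield chunk

batch = [['мама', 'мыла', 'раму'], ['рама'], ['42']]
outputs = []
for mini_batch in chunk_generator(batch, 2):
    outputs.extend(len(tokens) for tokens in mini_batch)  # stand-in for sess.run(...)
print(outputs)  # [3, 1, 1]
```
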
diff --git a/deeppavlov/models/kbqa/sentence_answer.py b/deeppavlov/models/kbqa/sentence_answer.py
index ea7042c819..d3dabf1614 100644
--- a/deeppavlov/models/kbqa/sentence_answer.py
+++ b/deeppavlov/models/kbqa/sentence_answer.py
@@ -12,13 +12,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import importlib
import re
from logging import getLogger
+import pkg_resources
import spacy
log = getLogger(__name__)
+# en_core_web_sm is installed and used by test_inferring_pretrained_model in the same interpreter session during tests.
+# Spacy checks en_core_web_sm package presence with pkg_resources, but pkg_resources is initialized at interpreter
+# startup, so it doesn't see en_core_web_sm installed after interpreter initialization; that's why we use importlib.reload below.
+
+if 'en-core-web-sm' not in pkg_resources.working_set.by_key.keys():
+ importlib.reload(pkg_resources)
+
+# TODO: move nlp to sentence_answer, sentence_answer to rel_ranking_bert_infer and revise en_core_web_sm requirement
nlp = spacy.load('en_core_web_sm')
pronouns = ["who", "what", "when", "where", "how"]
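
The reload works because `pkg_resources` snapshots the set of installed distributions once, at import time; re-importing the module rebuilds that snapshot, so a package installed later in the same session becomes visible. A standalone illustration of the pattern:

```python
import importlib
import pkg_resources

def distribution_visible(dist_name: str) -> bool:
    return dist_name in pkg_resources.working_set.by_key

# after `pip install en-core-web-sm` ran in this same interpreter session:
if not distribution_visible('en-core-web-sm'):
    importlib.reload(pkg_resources)  # rebuild the working set snapshot
```
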
diff --git a/deeppavlov/models/morpho_tagger/common_tagger.py b/deeppavlov/models/morpho_tagger/common_tagger.py
index dfc7e330aa..572747fc4d 100644
--- a/deeppavlov/models/morpho_tagger/common_tagger.py
+++ b/deeppavlov/models/morpho_tagger/common_tagger.py
@@ -16,7 +16,6 @@
from typing import Union, Optional, Tuple
-from tensorflow.keras import backend as K
import numpy as np
EPS = 1e-15
@@ -35,11 +34,6 @@ def to_one_hot(x, k):
return unit[x]
-def repeat_(x, k):
- tile_factor = [1, k] + [1] * (K.ndim(x) - 1)
- return K.tile(x[:, None, :], tile_factor)
-
-
def make_pos_and_tag(tag: str, sep: str = ",",
return_mode: Optional[str] = None) -> Tuple[str, Union[str, list, dict, tuple]]:
"""
diff --git a/deeppavlov/models/morpho_tagger/morpho_tagger.py b/deeppavlov/models/morpho_tagger/morpho_tagger.py
deleted file mode 100644
index 45a6bb2379..0000000000
--- a/deeppavlov/models/morpho_tagger/morpho_tagger.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from logging import getLogger
-from pathlib import Path
-from typing import List, Optional, Union, Tuple
-
-import numpy as np
-import tensorflow.keras.backend as K
-from tensorflow.keras import Model
-from tensorflow.keras.layers import (Input, Dense, Lambda, Concatenate, Conv2D, Dropout, LSTM, Bidirectional,
- TimeDistributed)
-from tensorflow.keras.optimizers import Nadam
-from tensorflow.keras.regularizers import l2
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.simple_vocab import SimpleVocabulary
-from deeppavlov.core.models.keras_model import KerasModel
-from .cells import Highway
-from .common_tagger import to_one_hot
-
-log = getLogger(__name__)
-
-MAX_WORD_LENGTH = 30
-
-
-@register("morpho_tagger")
-class MorphoTagger(KerasModel):
- """A class for character-based neural morphological tagger
-
- Parameters:
- symbols: character vocabulary
- tags: morphological tags vocabulary
- save_path: the path where model is saved
- load_path: the path from where model is loaded
- mode: usage mode
-
- word_rnn: the type of character-level network (only `cnn` implemented)
- char_embeddings_size: the size of character embeddings
- char_conv_layers: the number of convolutional layers on character level
- char_window_size: the width of convolutional filter (filters).
- It can be a list if several parallel filters are applied, for example, [2, 3, 4, 5].
- char_filters: the number of convolutional filters for each window width.
- It can be a number, a list (when there are several windows of different width
-            on a single convolution layer), a list of lists (if there
-            is more than one convolution layer), or **None**.
- If **None**, a layer with width **width** contains
- min(**char_filter_multiple** * **width**, 200) filters.
-
- char_filter_multiple: the ratio between filters number and window width
- char_highway_layers: the number of highway layers on character level
- conv_dropout: the ratio of dropout between convolutional layers
- highway_dropout: the ratio of dropout between highway layers,
- intermediate_dropout: the ratio of dropout between convolutional
- and highway layers on character level
- lstm_dropout: dropout ratio in word-level LSTM
- word_vectorizers: list of parameters for additional word-level vectorizers,
- for each vectorizer it stores a pair of vectorizer dimension and
- the dimension of the corresponding word embedding
- word_lstm_layers: the number of word-level LSTM layers
- word_lstm_units: hidden dimensions of word-level LSTMs
- word_dropout: the ratio of dropout before word level (it is applied to word embeddings)
- regularizer: l2 regularization parameter
- verbose: the level of verbosity
-
- A subclass of :class:`~deeppavlov.core.models.keras_model.KerasModel`
- """
- def __init__(self,
- symbols: SimpleVocabulary,
- tags: SimpleVocabulary,
- save_path: Optional[Union[str, Path]] = None,
- load_path: Optional[Union[str, Path]] = None,
- mode: str = 'infer',
- word_rnn: str = "cnn",
- char_embeddings_size: int = 16,
- char_conv_layers: int = 1,
- char_window_size: Union[int, List[int]] = 5,
- char_filters: Union[int, List[int]] = None,
- char_filter_multiple: int = 25,
- char_highway_layers: int = 1,
- conv_dropout: float = 0.0,
- highway_dropout: float = 0.0,
- intermediate_dropout: float = 0.0,
- lstm_dropout: float = 0.0,
- word_vectorizers: List[Tuple[int, int]] = None,
- word_lstm_layers: int = 1,
- word_lstm_units: Union[int, List[int]] = 128,
- word_dropout: float = 0.0,
- regularizer: float = None,
- verbose: int = 1, **kwargs):
- # Calls parent constructor. Results in creation of save_folder if it doesn't exist
- super().__init__(save_path=save_path, load_path=load_path, mode=mode, **kwargs)
- self.symbols = symbols
- self.tags = tags
- self.word_rnn = word_rnn
- self.char_embeddings_size = char_embeddings_size
- self.char_conv_layers = char_conv_layers
- self.char_window_size = char_window_size
- self.char_filters = char_filters
- self.char_filter_multiple = char_filter_multiple
- self.char_highway_layers = char_highway_layers
- self.conv_dropout = conv_dropout
- self.highway_dropout = highway_dropout
- self.intermediate_dropout = intermediate_dropout
- self.lstm_dropout = lstm_dropout
- self.word_dropout = word_dropout
- self.word_vectorizers = word_vectorizers # a list of additional vectorizer dimensions
- self.word_lstm_layers = word_lstm_layers
- self.word_lstm_units = word_lstm_units
- self.regularizer = regularizer
- self.verbose = verbose
- self._initialize()
- self.model_ = None
- self.build()
-
- # Tries to load the model from model `load_path`, if it is available
- self.load()
-
- def load(self) -> None:
- """
-        Checks the existence of the model file and,
-        if it exists, loads model weights from it
- """
-
- # Checks presence of the model files
- if self.load_path.exists():
- path = str(self.load_path.resolve())
- log.info('[loading model from {}]'.format(path))
- self.model_.load_weights(path)
-
- def save(self) -> None:
- """
- Saves model weights to the save_path, provided in config. The directory is
- already created by super().__init__, which is called in __init__ of this class"""
- path = str(self.save_path.absolute())
- log.info('[saving model to {}]'.format(path))
- self.model_.save_weights(path)
-
- def _initialize(self):
- if isinstance(self.char_window_size, int):
- self.char_window_size = [self.char_window_size]
- if self.char_filters is None or isinstance(self.char_filters, int):
- self.char_filters = [self.char_filters] * len(self.char_window_size)
- if len(self.char_window_size) != len(self.char_filters):
- raise ValueError("There should be the same number of window sizes and filter sizes")
- if isinstance(self.word_lstm_units, int):
- self.word_lstm_units = [self.word_lstm_units] * self.word_lstm_layers
- if len(self.word_lstm_units) != self.word_lstm_layers:
- raise ValueError("There should be the same number of lstm layer units and lstm layers")
- if self.word_vectorizers is None:
- self.word_vectorizers = []
- if self.regularizer is not None:
- self.regularizer = l2(self.regularizer)
- if self.verbose > 0:
- log.info("{} symbols, {} tags in CharacterTagger".format(len(self.symbols), len(self.tags)))
-
- def build(self):
- """Builds the network using Keras.
- """
- word_inputs = Input(shape=(None, MAX_WORD_LENGTH+2), dtype="int32")
- inputs = [word_inputs]
- word_outputs = self._build_word_cnn(word_inputs)
- if len(self.word_vectorizers) > 0:
- additional_word_inputs = [Input(shape=(None, input_dim), dtype="float32")
- for input_dim, dense_dim in self.word_vectorizers]
- inputs.extend(additional_word_inputs)
- additional_word_embeddings = [Dense(dense_dim)(additional_word_inputs[i])
- for i, (_, dense_dim) in enumerate(self.word_vectorizers)]
- word_outputs = Concatenate()([word_outputs] + additional_word_embeddings)
- outputs, lstm_outputs = self._build_basic_network(word_outputs)
- compile_args = {"optimizer": Nadam(lr=0.002, clipnorm=5.0),
- "loss": "categorical_crossentropy", "metrics": ["accuracy"]}
- self.model_ = Model(inputs, outputs)
- self.model_.compile(**compile_args)
- if self.verbose > 0:
- self.model_.summary(print_fn=log.info)
- return self
-
- def _build_word_cnn(self, inputs):
- """Builds word-level network
- """
- inputs = Lambda(K.one_hot, arguments={"num_classes": len(self.symbols)},
- output_shape=lambda x: tuple(x) + (len(self.symbols),))(inputs)
- char_embeddings = Dense(self.char_embeddings_size, use_bias=False)(inputs)
- conv_outputs = []
- self.char_output_dim_ = 0
- for window_size, filters_number in zip(self.char_window_size, self.char_filters):
- curr_output = char_embeddings
- curr_filters_number = (min(self.char_filter_multiple * window_size, 200)
- if filters_number is None else filters_number)
- for _ in range(self.char_conv_layers - 1):
- curr_output = Conv2D(curr_filters_number, (1, window_size),
- padding="same", activation="relu",
- data_format="channels_last")(curr_output)
- if self.conv_dropout > 0.0:
- curr_output = Dropout(self.conv_dropout)(curr_output)
- curr_output = Conv2D(curr_filters_number, (1, window_size),
- padding="same", activation="relu",
- data_format="channels_last")(curr_output)
- conv_outputs.append(curr_output)
- self.char_output_dim_ += curr_filters_number
- if len(conv_outputs) > 1:
- conv_output = Concatenate(axis=-1)(conv_outputs)
- else:
- conv_output = conv_outputs[0]
- highway_input = Lambda(K.max, arguments={"axis": -2})(conv_output)
- if self.intermediate_dropout > 0.0:
- highway_input = Dropout(self.intermediate_dropout)(highway_input)
- for i in range(self.char_highway_layers - 1):
- highway_input = Highway(activation="relu")(highway_input)
- if self.highway_dropout > 0.0:
- highway_input = Dropout(self.highway_dropout)(highway_input)
- highway_output = Highway(activation="relu")(highway_input)
- return highway_output
-
- def _build_basic_network(self, word_outputs):
- """
- Creates the basic network architecture,
- transforming word embeddings to intermediate outputs
- """
- if self.word_dropout > 0.0:
- lstm_outputs = Dropout(self.word_dropout)(word_outputs)
- else:
- lstm_outputs = word_outputs
- for j in range(self.word_lstm_layers-1):
- lstm_outputs = Bidirectional(
- LSTM(self.word_lstm_units[j], return_sequences=True,
- dropout=self.lstm_dropout))(lstm_outputs)
- lstm_outputs = Bidirectional(
- LSTM(self.word_lstm_units[-1], return_sequences=True,
- dropout=self.lstm_dropout))(lstm_outputs)
- pre_outputs = TimeDistributed(
- Dense(len(self.tags), activation="softmax",
- activity_regularizer=self.regularizer),
- name="p")(lstm_outputs)
- return pre_outputs, lstm_outputs
-
- # noinspection PyPep8Naming
- def _transform_batch(self, data, labels=None, transform_to_one_hot=True):
- data, additional_data = data[0], data[1:]
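-        # Pad every sentence in the batch to the length L of the longest one.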
- L = max(len(x) for x in data)
- X = np.array([self._make_sent_vector(x, L) for x in data])
- X = [X] + [np.array(x) for x in additional_data]
- if labels is not None:
- Y = np.array([self._make_tags_vector(y, L) for y in labels])
- if transform_to_one_hot:
- Y = to_one_hot(Y, len(self.tags))
- return X, Y
- else:
- return X
-
- def train_on_batch(self, *args) -> None:
- """Trains the model on a single batch.
-
- Args:
- *args: the list of network inputs. Last element of `args` is the batch of targets,
- all previous elements are training data batches
- """
- # data: List[Iterable], labels: Iterable[list]
- # Args:
- # data: a batch of word sequences
- # labels: a batch of correct tag sequences
- *data, labels = args
- # noinspection PyPep8Naming
- X, Y = self._transform_batch(data, labels)
- self.model_.train_on_batch(X, Y)
-
- # noinspection PyPep8Naming
- def predict_on_batch(self, data: Union[List[np.ndarray], Tuple[np.ndarray]],
- return_indexes: bool = False) -> List[List[str]]:
- """
- Makes predictions on a single batch
-
- Args:
-            data: model inputs for a single batch, data[0] contains input character encodings
-                and is the only element of data for most models. Subsequent elements of data
-                include the outputs of additional vectorizers, e.g., a dictionary-based one.
- return_indexes: whether to return tag indexes in vocabulary or the tags themselves
-
- Returns:
- a batch of label sequences
- """
- X = self._transform_batch(data)
- objects_number, lengths = len(X[0]), [len(elem) for elem in data[0]]
- Y = self.model_.predict_on_batch(X)
- labels = np.argmax(Y, axis=-1)
- answer: List[Optional[List[str]]] = [None] * objects_number
- for i, (elem, length) in enumerate(zip(labels, lengths)):
- elem = elem[:length]
- answer[i] = elem if return_indexes else self.tags.idxs2toks(elem)
- return answer
-
- def __call__(self, *x_batch: np.ndarray, **kwargs) -> Union[List, np.ndarray]:
- """
- Predicts answers on batch elements.
-
- Args:
- x_batch: a batch to predict answers on. It can be either a single array
- for basic model or a sequence of arrays for a complex one (
- :config:`configuration file `
- or its lemmatized version).
- """
- return self.predict_on_batch(x_batch, **kwargs)
-
- def _make_sent_vector(self, sent: List, bucket_length: int = None) -> np.ndarray:
- """Transforms a sentence to Numpy array, which will be the network input.
-
- Args:
- sent: input sentence
- bucket_length: the width of the bucket
-
-        Returns:
-            A 2d array, where answer[i] contains the character indices of the i-th
-            input word, framed by BEGIN and END and padded with PAD.
- """
- bucket_length = bucket_length or len(sent)
- answer = np.zeros(shape=(bucket_length, MAX_WORD_LENGTH+2), dtype=np.int32)
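-        # Each row encodes one word as [BEGIN, character indices..., END, PAD...].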
- for i, word in enumerate(sent):
- answer[i, 0] = self.tags["BEGIN"]
- m = min(len(word), MAX_WORD_LENGTH)
- for j, x in enumerate(word[-m:]):
- answer[i, j+1] = self.symbols[x]
- answer[i, m+1] = self.tags["END"]
- answer[i, m+2:] = self.tags["PAD"]
- return answer
-
- def _make_tags_vector(self, tags, bucket_length=None) -> np.ndarray:
- """Transforms a sentence of tags to Numpy array, which will be the network target.
-
- Args:
- tags: input sentence of tags
- bucket_length: the width of the bucket
-
-        Returns:
-            A 1d array, where answer[i] contains the index of the i-th tag of the input sentence.
- """
- bucket_length = bucket_length or len(tags)
- answer = np.zeros(shape=(bucket_length,), dtype=np.int32)
- for i, tag in enumerate(tags):
- answer[i] = self.tags[tag]
- return answer
diff --git a/deeppavlov/models/nemo/asr.py b/deeppavlov/models/nemo/asr.py
deleted file mode 100644
index 70527adea3..0000000000
--- a/deeppavlov/models/nemo/asr.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from io import BytesIO
-from pathlib import Path
-from typing import List, Optional, Tuple, Union, Dict
-
-import torch
-from nemo.collections.asr import AudioToMelSpectrogramPreprocessor, JasperEncoder, JasperDecoderForCTC, GreedyCTCDecoder
-from nemo.collections.asr.helpers import post_process_predictions
-from nemo.collections.asr.parts.features import WaveformFeaturizer
-from nemo.core.neural_types import AudioSignal, NeuralType, LengthsType
-from nemo.utils.decorators import add_port_docs
-from torch import Tensor
-from torch.utils.data import Dataset, DataLoader
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.models.nemo.common import CustomDataLayerBase, NeMoBase
-
-log = logging.getLogger(__name__)
-
-
-class AudioInferDataset(Dataset):
- def __init__(self, audio_batch: List[Union[str, BytesIO]], sample_rate: int, int_values: bool, trim=False) -> None:
- """Dataset reader for AudioInferDataLayer.
-
- Args:
- audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects.
- sample_rate: Audio files sample rate.
- int_values: If true, load samples as 32-bit integers.
- trim: Trim leading and trailing silence from an audio signal if True.
-
- """
- self.audio_batch = audio_batch
- self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values)
- self.trim = trim
-
- def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
- """Processes audio batch item and extracts features.
-
- Args:
- index: Audio batch item index.
-
- Returns:
- features: Audio file's extracted features tensor.
- features_length: Features length tensor.
-
- """
- sample = self.audio_batch[index]
- features = self.featurizer.process(sample, trim=self.trim)
- features_length = torch.tensor(features.shape[0]).long()
-
- return features, features_length
-
- def __len__(self) -> int:
- return len(self.audio_batch)
-
-
-class AudioInferDataLayer(CustomDataLayerBase):
- """Data Layer for ASR pipeline inference."""
-
- @property
- @add_port_docs()
- def output_ports(self) -> Dict[str, NeuralType]:
- return {
- "audio_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
- "a_sig_length": NeuralType(tuple('B'), LengthsType())
- }
-
- def __init__(self, *,
- audio_batch: List[Union[str, BytesIO]],
- batch_size: int = 32,
- sample_rate: int = 16000,
- int_values: bool = False,
- trim_silence: bool = False,
- **kwargs) -> None:
- """Initializes Data Loader.
-
- Args:
- audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects.
- batch_size: How many samples per batch to load.
-            sample_rate: Target sampling rate for data. Audio files will be resampled to sample_rate
-                if they are not at that rate already.
- int_values: If true, load data as 32-bit integers.
- trim_silence: Trim leading and trailing silence from an audio signal if True.
-
- """
- self._sample_rate = sample_rate
-
- dataset = AudioInferDataset(audio_batch=audio_batch, sample_rate=sample_rate, int_values=int_values,
- trim=trim_silence)
-
- dataloader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=self.seq_collate_fn)
- super(AudioInferDataLayer, self).__init__(dataset, dataloader, **kwargs)
-
- @staticmethod
- def seq_collate_fn(batch: Tuple[Tuple[Tensor], Tuple[Tensor]]) -> Tuple[Optional[Tensor], Optional[Tensor]]:
- """Collates batch of audio signal and audio length, zero pads audio signal.
-
- Args:
- batch: A tuple of tuples of audio signals and signal lengths. This collate function assumes the signals
- are 1d torch tensors (i.e. mono audio).
-
- Returns:
- audio_signal: Zero padded audio signal tensor.
- audio_length: Audio signal length tensor.
-
- """
- _, audio_lengths = zip(*batch)
- max_audio_len = 0
- has_audio = audio_lengths[0] is not None
- if has_audio:
- max_audio_len = max(audio_lengths).item()
-
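-        # Right-pad each mono signal with zeros up to the longest signal in the batch.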
- audio_signal = []
- for sig, sig_len in batch:
- if has_audio:
- sig_len = sig_len.item()
- if sig_len < max_audio_len:
- pad = (0, max_audio_len - sig_len)
- sig = torch.nn.functional.pad(sig, pad)
- audio_signal.append(sig)
-
- if has_audio:
- audio_signal = torch.stack(audio_signal)
- audio_lengths = torch.stack(audio_lengths)
- else:
- audio_signal, audio_lengths = None, None
-
- return audio_signal, audio_lengths
-
-
-@register('nemo_asr')
-class NeMoASR(NeMoBase):
- """ASR model on NeMo modules."""
-
- def __init__(self, load_path: Union[str, Path], nemo_params_path: Union[str, Path], **kwargs) -> None:
- """Initializes NeuralModules for ASR.
-
- Args:
- load_path: Path to a directory with pretrained checkpoints for JasperEncoder and JasperDecoderForCTC.
-            nemo_params_path: Path to a file containing labels and params for AudioToMelSpectrogramPreprocessor,
- JasperEncoder, JasperDecoderForCTC and AudioInferDataLayer.
-
- """
- super(NeMoASR, self).__init__(load_path=load_path, nemo_params_path=nemo_params_path, **kwargs)
-
- self.labels = self.nemo_params['labels']
-
- self.data_preprocessor = AudioToMelSpectrogramPreprocessor(
- **self.nemo_params['AudioToMelSpectrogramPreprocessor']
- )
- self.jasper_encoder = JasperEncoder(**self.nemo_params['JasperEncoder'])
- self.jasper_decoder = JasperDecoderForCTC(num_classes=len(self.labels), **self.nemo_params['JasperDecoder'])
- self.greedy_decoder = GreedyCTCDecoder()
- self.modules_to_restore = [self.jasper_encoder, self.jasper_decoder]
-
- self.load()
-
- def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> List[str]:
- """Transcripts audio batch to text.
-
- Args:
- audio_batch: Batch to be transcribed. Elements could be either paths to audio files or Binary I/O objects.
-
- Returns:
- text_batch: Batch of transcripts.
-
- """
- data_layer = AudioInferDataLayer(audio_batch=audio_batch, **self.nemo_params['AudioToTextDataLayer'])
- audio_signal, audio_signal_len = data_layer()
- processed_signal, processed_signal_len = self.data_preprocessor(input_signal=audio_signal,
- length=audio_signal_len)
- encoded, encoded_len = self.jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)
- log_probs = self.jasper_decoder(encoder_output=encoded)
- predictions = self.greedy_decoder(log_probs=log_probs)
- eval_tensors = [predictions]
- tensors = self.neural_factory.infer(tensors=eval_tensors)
- text_batch = post_process_predictions(tensors[0], self.labels)
-
- return text_batch
diff --git a/deeppavlov/models/nemo/common.py b/deeppavlov/models/nemo/common.py
deleted file mode 100644
index 883483c5d6..0000000000
--- a/deeppavlov/models/nemo/common.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import base64
-from io import BytesIO
-from logging import getLogger
-from pathlib import Path
-from typing import Union
-
-import nemo
-import torch
-from nemo.backends.pytorch import DataLayerNM
-from torch.utils.data import Dataset, DataLoader
-
-from deeppavlov.core.commands.utils import expand_path
-from deeppavlov.core.common.file import read_yaml
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.models.component import Component
-from deeppavlov.core.models.serializable import Serializable
-
-log = getLogger(__name__)
-
-
-@register('base64_decode_bytesIO')
-def ascii_to_bytes_io(batch: Union[str, list]) -> Union[BytesIO, list]:
- """Recursively searches for strings in the input batch and converts them into the base64-encoded bytes wrapped in
- Binary I/O objects.
-
- Args:
- batch: A string or an iterable container with strings at some level of nesting.
-
- Returns:
- The same structure where all strings are converted into the base64-encoded bytes wrapped in Binary I/O objects.
-
- """
- if isinstance(batch, str):
- return BytesIO(base64.decodebytes(batch.encode()))
-
- return list(map(ascii_to_bytes_io, batch))
-
-
-@register('bytesIO_encode_base64')
-def bytes_io_to_ascii(batch: Union[BytesIO, list]) -> Union[str, list]:
- """Recursively searches for Binary I/O objects in the input batch and converts them into ASCII-strings.
-
- Args:
- batch: A BinaryIO object or an iterable container with BinaryIO objects at some level of nesting.
-
- Returns:
-        The same structure where all BinaryIO objects are converted into base64-encoded ASCII strings.
-
- """
- if isinstance(batch, BytesIO):
- return base64.encodebytes(batch.read()).decode('ascii')
-
- return list(map(bytes_io_to_ascii, batch))
-
-
-class NeMoBase(Component, Serializable):
- """Base class for NeMo Chainer's pipeline components."""
-
- def __init__(self, load_path: Union[str, Path], nemo_params_path: Union[str, Path], **kwargs) -> None:
- """Initializes NeuralModuleFactory on CPU or GPU and reads nemo modules params from yaml.
-
- Args:
- load_path: Path to a directory with pretrained checkpoints for NeMo modules.
-            nemo_params_path: Path to a file containing NeMo modules params.
-
- """
- super(NeMoBase, self).__init__(save_path=None, load_path=load_path, **kwargs)
- placement = nemo.core.DeviceType.GPU if torch.cuda.is_available() else nemo.core.DeviceType.CPU
- self.neural_factory = nemo.core.NeuralModuleFactory(placement=placement)
- self.modules_to_restore = []
- self.nemo_params = read_yaml(expand_path(nemo_params_path))
-
- def __call__(self, *args, **kwargs):
- raise NotImplementedError
-
- def load(self) -> None:
- """Loads pretrained checkpoints for modules from self.modules_to_restore list."""
- module_names = [str(module) for module in self.modules_to_restore]
- checkpoints = nemo.utils.get_checkpoint_from_dir(module_names, self.load_path)
- for module, checkpoint in zip(self.modules_to_restore, checkpoints):
- log.info(f'Restoring {module} from {checkpoint}')
- module.restore_from(checkpoint)
-
- def save(self, *args, **kwargs) -> None:
- pass
-
-
-class CustomDataLayerBase(DataLayerNM):
- def __init__(self, dataset: Dataset, dataloader: DataLoader, **kwargs) -> None:
- super(CustomDataLayerBase, self).__init__()
- self._dataset = dataset
- self._dataloader = dataloader
-
- def __len__(self) -> int:
- return len(self._dataset)
-
- @property
- def dataset(self) -> None:
- return None
-
- @property
- def data_iterator(self) -> torch.utils.data.DataLoader:
- return self._dataloader
diff --git a/deeppavlov/models/nemo/tts.py b/deeppavlov/models/nemo/tts.py
deleted file mode 100644
index d31fa0bcfb..0000000000
--- a/deeppavlov/models/nemo/tts.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from functools import partial
-from io import BytesIO
-from logging import getLogger
-from pathlib import Path
-from typing import List, Optional, Tuple, Union, Dict
-
-import torch
-from nemo.collections.asr.parts import collections, parsers
-from nemo.collections.asr.parts.dataset import TranscriptDataset
-from nemo.collections.tts import TextEmbedding, Tacotron2Encoder, Tacotron2DecoderInfer, Tacotron2Postnet
-from nemo.core.neural_types import NeuralType, LabelsType, LengthsType
-from nemo.utils.decorators import add_port_docs
-from nemo.utils.misc import pad_to
-from scipy.io import wavfile
-from torch import Tensor
-
-from deeppavlov.core.commands.utils import expand_path
-from deeppavlov.core.common.registry import register
-from deeppavlov.models.nemo.common import CustomDataLayerBase, NeMoBase
-from deeppavlov.models.nemo.vocoder import WaveGlow, GriffinLim
-
-log = getLogger(__name__)
-
-
-class TextDataset(TranscriptDataset):
- def __init__(self,
- text_batch: List[str],
- labels: List[str],
- bos_id: Optional[int] = None,
- eos_id: Optional[int] = None,
- lowercase: bool = True) -> None:
- """Text dataset reader for TextDataLayer.
-
- Args:
- text_batch: Texts to be used for speech synthesis.
-            labels: List of string labels to use for str2int translation.
-            bos_id: Label position of the beginning-of-string symbol.
-            eos_id: Label position of the end-of-string symbol.
- lowercase: Whether to convert all uppercase characters in a text batch into lowercase characters.
-
- """
- parser = parsers.make_parser(labels, do_lowercase=lowercase)
- self.texts = collections.Text(text_batch, parser)
- self.bos_id = bos_id
- self.eos_id = eos_id
-
-
-class TextDataLayer(CustomDataLayerBase):
- @property
- @add_port_docs()
- def output_ports(self) -> Dict[str, NeuralType]:
- return {
- 'texts': NeuralType(('B', 'T'), LabelsType()),
- "texts_length": NeuralType(tuple('B'), LengthsType())
- }
-
- def __init__(self, *,
- text_batch: List[str],
- labels: List[str],
- batch_size: int = 32,
- bos_id: Optional[int] = None,
- eos_id: Optional[int] = None,
- pad_id: Optional[int] = None,
- **kwargs) -> None:
- """A simple Neural Module for loading text data.
-
- Args:
- text_batch: Texts to be used for speech synthesis.
-            labels: List of string labels to use for str2int translation.
-            batch_size: How many strings per batch to load.
-            bos_id: Label position of the beginning-of-string symbol. If None, it is initialized as `len(labels)`.
-            eos_id: Label position of the end-of-string symbol. If None, it is initialized as `len(labels) + 1`.
-            pad_id: Label position of the pad symbol. If None, it is initialized as `len(labels) + 2`.
-
- """
- len_labels = len(labels)
- if bos_id is None:
- bos_id = len_labels
- if eos_id is None:
- eos_id = len_labels + 1
- if pad_id is None:
- pad_id = len_labels + 2
-
- dataset = TextDataset(text_batch=text_batch, labels=labels, bos_id=bos_id, eos_id=eos_id)
-
- dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size,
- collate_fn=partial(self._collate_fn, pad_id=pad_id))
- super(TextDataLayer, self).__init__(dataset, dataloader, **kwargs)
-
- @staticmethod
- def _collate_fn(batch: Tuple[Tuple[Tensor], Tuple[Tensor]], pad_id: int) -> Tuple[Tensor, Tensor]:
- """Collates batch of texts.
-
- Args:
-            batch: A tuple of tuples of texts and text lengths.
- pad_id: Label position of pad symbol.
-
- Returns:
- texts: Padded texts tensor.
- texts_len: Text lengths tensor.
-
- """
- texts_list, texts_len = zip(*batch)
- max_len = max(texts_len)
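-        # pad_to rounds the maximum length up to a multiple of 8.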
- max_len = pad_to(max_len, 8)
-
- texts = torch.empty(len(texts_list), max_len, dtype=torch.long)
- texts.fill_(pad_id)
-
- for i, text in enumerate(texts_list):
- texts[i].narrow(0, 0, text.size(0)).copy_(text)
-
- if len(texts.shape) != 2:
- raise ValueError(f'Texts in collate function have shape {texts.shape}, should have 2 dimensions.')
-
- return texts, torch.stack(texts_len)
-
-
-@register('nemo_tts')
-class NeMoTTS(NeMoBase):
- """TTS model on NeMo modules."""
- def __init__(self,
- load_path: Union[str, Path],
- nemo_params_path: Union[str, Path],
- vocoder: str = 'waveglow',
- **kwargs) -> None:
- """Initializes NeuralModules for TTS.
-
- Args:
- load_path: Path to a directory with pretrained checkpoints for TextEmbedding, Tacotron2Encoder,
- Tacotron2DecoderInfer, Tacotron2Postnet and, if Waveglow vocoder is selected, WaveGlowInferNM.
-            nemo_params_path: Path to a file containing sample_rate, labels and params for TextEmbedding,
- Tacotron2Encoder, Tacotron2Decoder, Tacotron2Postnet and TranscriptDataLayer.
- vocoder: Vocoder used to convert from spectrograms to audio. Available options: `waveglow` (needs pretrained
- checkpoint) and `griffin-lim`.
-
- """
- super(NeMoTTS, self).__init__(load_path=load_path, nemo_params_path=nemo_params_path, **kwargs)
-
- self.sample_rate = self.nemo_params['sample_rate']
- self.text_embedding = TextEmbedding(
- len(self.nemo_params['labels']) + 3, # + 3 special chars
- **self.nemo_params['TextEmbedding']
- )
- self.t2_enc = Tacotron2Encoder(**self.nemo_params['Tacotron2Encoder'])
- self.t2_dec = Tacotron2DecoderInfer(**self.nemo_params['Tacotron2Decoder'])
- self.t2_postnet = Tacotron2Postnet(**self.nemo_params['Tacotron2Postnet'])
- self.modules_to_restore = [self.text_embedding, self.t2_enc, self.t2_dec, self.t2_postnet]
-
- if vocoder == 'waveglow':
- self.vocoder = WaveGlow(**self.nemo_params['WaveGlowNM'])
- self.modules_to_restore.append(self.vocoder)
- elif vocoder == 'griffin-lim':
- self.vocoder = GriffinLim(**self.nemo_params['GriffinLim'])
- else:
- raise ValueError(f'{vocoder} vocoder is not supported.')
-
- self.load()
-
- def __call__(self,
- text_batch: List[str],
- path_batch: Optional[List[str]] = None) -> Union[List[BytesIO], List[str]]:
- """Creates wav files or file objects with speech.
-
- Args:
- text_batch: Text from which human audible speech should be generated.
-            path_batch: i-th element of `path_batch` is the path where the i-th generated speech file is saved.
-                If the argument isn't specified, the synthesized speech will be stored in Binary I/O objects.
-
- Returns:
- List of Binary I/O objects with generated speech if `path_batch` was not specified, list of paths to files
- with synthesized speech otherwise.
-
- """
- if path_batch is None:
- path_batch = [BytesIO() for _ in text_batch]
- elif len(text_batch) != len(path_batch):
- raise ValueError('Text batch length differs from path batch length.')
- else:
- path_batch = [expand_path(path) for path in path_batch]
-
- data_layer = TextDataLayer(text_batch=text_batch, **self.nemo_params['TranscriptDataLayer'])
- transcript, transcript_len = data_layer()
- transcript_embedded = self.text_embedding(char_phone=transcript)
- transcript_encoded = self.t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len)
- mel_decoder, gate, alignments, mel_len = self.t2_dec(char_phone_encoded=transcript_encoded,
- encoded_length=transcript_len)
- mel_postnet = self.t2_postnet(mel_input=mel_decoder)
- infer_tensors = [self.vocoder(mel_postnet), mel_len]
- evaluated_tensors = self.neural_factory.infer(tensors=infer_tensors)
- synthesized_batch = self.vocoder.get_audio(*evaluated_tensors)
-
- for fout, synthesized_audio in zip(path_batch, synthesized_batch):
- wavfile.write(fout, self.sample_rate, synthesized_audio)
-
- return path_batch
diff --git a/deeppavlov/models/nemo/vocoder.py b/deeppavlov/models/nemo/vocoder.py
deleted file mode 100644
index 3ec918d266..0000000000
--- a/deeppavlov/models/nemo/vocoder.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from logging import getLogger
-from typing import List
-
-import librosa
-import numpy as np
-from nemo.core.neural_types import NmTensor
-from nemo.collections.tts import WaveGlowInferNM
-from numpy import ndarray
-
-log = getLogger(__name__)
-
-
-class BaseVocoder:
- """Class is used to maintain consistency in the construction of the TTS pipeline based on NeMo modules."""
-
- def __call__(self, tensor: NmTensor) -> NmTensor:
- """Should return the tensor after the evaluation of which speech could be synthesized with `get_audio` method"""
- raise NotImplementedError
-
- def get_audio(self, evaluated_tensor: list, mel_len: list):
- """Synthesizes audio from the evaluated tensor constructed by `__call__` method."""
- raise NotImplementedError
-
-
-class WaveGlow(BaseVocoder):
- def __init__(self, *, denoiser_strength: float = 0.0, n_window_stride: int = 160, **kwargs) -> None:
- """Wraps WaveGlowInferNM module.
-
- Args:
- denoiser_strength: Denoiser strength for waveglow.
- n_window_stride: Stride of window for FFT in samples used in model training.
- kwargs: Named arguments for WaveGlowInferNM constructor.
-
- """
- self.waveglow = WaveGlowInferNM(**kwargs)
- self.denoiser_strength = denoiser_strength
- self.n_window_stride = n_window_stride
-
- def __call__(self, mel_postnet: NmTensor) -> NmTensor:
- return self.waveglow(mel_spectrogram=mel_postnet)
-
- def __str__(self):
- return str(self.waveglow)
-
- def restore_from(self, path: str) -> None:
- """Wraps WaveGlowInferNM restore_from method."""
- self.waveglow.restore_from(path)
- if self.denoiser_strength > 0:
- log.info('Setup denoiser for WaveGlow')
- self.waveglow.setup_denoiser()
-
- def get_audio(self, evaluated_audio: list, mel_len: list) -> List[ndarray]:
- """Unpacks audio data from evaluated tensor and denoises it if `denoiser_strength` > 0."""
- audios = []
- for i, batch in enumerate(evaluated_audio):
- audio = batch.cpu().numpy()
- for j, sample in enumerate(audio):
- sample_len = mel_len[i][j] * self.n_window_stride
- sample = sample[:sample_len]
- if self.denoiser_strength > 0:
- sample, _ = self.waveglow.denoise(sample, strength=self.denoiser_strength)
- audios.append(sample)
- return audios
-
-
-class GriffinLim(BaseVocoder):
- def __init__(self, *,
- sample_rate: float = 16000.0,
- n_fft: int = 1024,
- mag_scale: float = 2048.0,
- power: float = 1.2,
- n_iters: int = 50,
- **kwargs) -> None:
- """Uses Griffin Lim algorithm to generate speech from spectrograms.
-
- Args:
- sample_rate: Generated audio data sample rate.
- n_fft: The number of points to use for the FFT.
- mag_scale: Multiplied with the linear spectrogram to avoid audio sounding muted due to mel filter
- normalization.
-            power: The linear spectrogram is raised to this power prior to running the Griffin-Lim algorithm. A power
-                greater than 1 has been shown to improve audio quality.
-            n_iters: Number of iterations for converting magnitude spectrograms to an audio signal.
-
- """
- self.mag_scale = mag_scale
- self.power = power
- self.n_iters = n_iters
- self.n_fft = n_fft
- self.filterbank = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, **kwargs)
-
- def __call__(self, mel_postnet: NmTensor) -> NmTensor:
- return mel_postnet
-
- def get_audio(self, mel_spec: list, mel_len: list) -> List[ndarray]:
- audios = []
- for i, batch in enumerate(mel_spec):
- log_mel = batch.cpu().numpy().transpose(0, 2, 1)
- mel = np.exp(log_mel)
- magnitudes = np.dot(mel, self.filterbank) * self.mag_scale
- for j, sample in enumerate(magnitudes):
- sample = sample[:mel_len[i][j], :]
- audio = self.griffin_lim(sample.T ** self.power)
- audios.append(audio)
- return audios
-
- def griffin_lim(self, magnitudes):
- """Griffin-Lim algorithm to convert magnitude spectrograms to audio signals."""
- phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
- complex_spec = magnitudes * phase
- signal = librosa.istft(complex_spec)
-
- for _ in range(self.n_iters):
- _, phase = librosa.magphase(librosa.stft(signal, n_fft=self.n_fft))
- complex_spec = magnitudes * phase
- signal = librosa.istft(complex_spec)
- return signal
diff --git a/deeppavlov/models/ner/bio.py b/deeppavlov/models/ner/bio.py
deleted file mode 100644
index 7eb75015ed..0000000000
--- a/deeppavlov/models/ner/bio.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from logging import getLogger
-from typing import List
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.models.component import Component
-
-log = getLogger(__name__)
-
-
-@register('ner_bio_converter')
-class BIOMarkupRestorer(Component):
- """Restores BIO markup for tags batch"""
-
- def __init__(self, *args, **kwargs) -> None:
- pass
-
- @staticmethod
- def _convert_to_bio(tags: List[str]) -> List[str]:
- tags_bio = []
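-        # e.g. ['PER', 'PER', 'O', 'LOC'] -> ['B-PER', 'I-PER', 'O', 'B-LOC']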
- for n, tag in enumerate(tags):
- if tag != 'O':
- if n > 0 and tags[n - 1] == tag:
- tag = 'I-' + tag
- else:
- tag = 'B-' + tag
- tags_bio.append(tag)
-
- return tags_bio
-
- def __call__(self, tag_batch: List[List[str]], *args, **kwargs) -> List[List[str]]:
- y = [self._convert_to_bio(sent) for sent in tag_batch]
- return y
diff --git a/deeppavlov/models/ner/svm.py b/deeppavlov/models/ner/svm.py
deleted file mode 100644
index d8eda1538b..0000000000
--- a/deeppavlov/models/ner/svm.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pickle
-from itertools import chain
-from logging import getLogger
-from typing import List, Union
-
-import numpy as np
-from sklearn.svm import SVC
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.models.estimator import Estimator
-
-log = getLogger(__name__)
-
-
-@register('ner_svm')
-class SVMTagger(Estimator):
- """
- ``SVM`` (Support Vector Machines) classifier for tagging sequences
-
- Parameters:
- return_probabilities: whether to return probabilities or predictions
-        kernel: kernel of SVM (RBF works well in most cases)
- seed: seed for SVM initialization
- """
-
- def __init__(self, return_probabilities: bool = False, kernel: str = 'rbf', seed=42, *args, **kwargs) -> None:
- super().__init__(*args, **kwargs)
- self.classifier = None
- self.return_probabilities = return_probabilities
- self._kernel = kernel
- self._seed = seed
-
- self.load()
-
- def fit(self, tokens: List[List[str]], tags: List[List[int]], *args, **kwargs) -> None:
- tokens = list(chain(*tokens))
- tags = list(chain(*tags))
- self.classifier = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
- decision_function_shape='ovr', degree=3, gamma='auto',
- kernel=self._kernel, max_iter=-1, probability=self.return_probabilities,
- random_state=self._seed, shrinking=True, tol=0.001, verbose=False)
- self.classifier.fit(tokens, tags)
-
- def __call__(self, token_vectors_batch: List[List[str]], *args, **kwargs) -> \
- Union[List[List[int]], List[List[np.ndarray]]]:
- lens = [len(utt) for utt in token_vectors_batch]
- token_vectors_list = list(chain(*token_vectors_batch))
- predictions = self.classifier.predict(token_vectors_list)
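-        # Split the flat predictions back into per-utterance lists using the stored lengths.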
- y = []
- cl = 0
- for l in lens:
- y.append(predictions[cl: cl + l])
- cl += l
- return y
-
- def save(self) -> None:
- with self.save_path.open('wb') as f:
- pickle.dump(self.classifier, f, protocol=4)
-
- def serialize(self):
- return pickle.dumps(self.classifier, protocol=4)
-
- def load(self) -> None:
- if self.load_path.exists():
- with self.load_path.open('rb') as f:
- self.classifier = pickle.load(f)
-
- def deserialize(self, data):
- self.classifier = pickle.loads(data)
diff --git a/deeppavlov/models/preprocessors/bert_preprocessor.py b/deeppavlov/models/preprocessors/bert_preprocessor.py
index e60a068193..3cf0c3b211 100644
--- a/deeppavlov/models/preprocessors/bert_preprocessor.py
+++ b/deeppavlov/models/preprocessors/bert_preprocessor.py
@@ -211,114 +211,3 @@ def _ner_bert_tokenize(tokens: List[str],
startofword_markers.append(0)
tags_subword.append('X')
return tokens_subword, startofword_markers, tags_subword
-
-
-@register('bert_ranker_preprocessor')
-class BertRankerPreprocessor(BertPreprocessor):
- """Tokenize text to sub-tokens, encode sub-tokens with their indices, create tokens and segment masks for ranking.
-
- Builds features for a pair of context with each of the response candidates.
- """
-
- def __call__(self, batch: List[List[str]]) -> List[List[InputFeatures]]:
- """Call BERT :func:`bert_dp.preprocessing.convert_examples_to_features` function to tokenize and create masks.
-
- Args:
-            batch: list of elements where the first element is the batch of contexts
-                and the remaining elements are batches of response candidates
-
- Returns:
- list of feature batches with subtokens, subtoken ids, subtoken mask, segment mask.
- """
-
- if isinstance(batch[0], str):
- batch = [batch]
-
- cont_resp_pairs = []
- if len(batch[0]) == 1:
- contexts = batch[0]
- responses_empt = [None] * len(batch)
- cont_resp_pairs.append(zip(contexts, responses_empt))
- else:
- contexts = [el[0] for el in batch]
- for i in range(1, len(batch[0])):
- responses = []
- for el in batch:
- responses.append(el[i])
- cont_resp_pairs.append(zip(contexts, responses))
- examples = []
- for s in cont_resp_pairs:
- ex = [InputExample(unique_id=0, text_a=context, text_b=response) for context, response in s]
- examples.append(ex)
- features = [convert_examples_to_features(el, self.max_seq_length, self.tokenizer) for el in examples]
-
- return features
-
-
-@register('bert_sep_ranker_preprocessor')
-class BertSepRankerPreprocessor(BertPreprocessor):
- """Tokenize text to sub-tokens, encode sub-tokens with their indices, create tokens and segment masks for ranking.
-
- Builds features for a context and for each of the response candidates separately.
- """
-
- def __call__(self, batch: List[List[str]]) -> List[List[InputFeatures]]:
- """Call BERT :func:`bert_dp.preprocessing.convert_examples_to_features` function to tokenize and create masks.
-
- Args:
-            batch: list of elements where the first element is the batch of contexts
-                and the remaining elements are batches of response candidates
-
- Returns:
- list of feature batches with subtokens, subtoken ids, subtoken mask, segment mask
- for the context and each of response candidates separately.
- """
-
- if isinstance(batch[0], str):
- batch = [batch]
-
- samples = []
- for i in range(len(batch[0])):
- s = []
- for el in batch:
- s.append(el[i])
- samples.append(s)
- s_empt = [None] * len(samples[0])
- # TODO: add unique id
- examples = []
- for s in samples:
- ex = [InputExample(unique_id=0, text_a=text_a, text_b=text_b) for text_a, text_b in
- zip(s, s_empt)]
- examples.append(ex)
- features = [convert_examples_to_features(el, self.max_seq_length, self.tokenizer) for el in examples]
-
- return features
-
-
-@register('bert_sep_ranker_predictor_preprocessor')
-class BertSepRankerPredictorPreprocessor(BertSepRankerPreprocessor):
- """Tokenize text to sub-tokens, encode sub-tokens with their indices, create tokens and segment masks for ranking.
-
- Builds features for a context and for each of the response candidates separately.
-    In addition, builds features for a base of text responses (and the corresponding contexts).
-
- Args:
-        resps: list of strings containing the base of text responses
-        resp_vecs: BERT vector representations of ``resps``; if ``None``, features for the response base will be built
-        conts: list of strings containing the base of text contexts
-        cont_vecs: BERT vector representations of ``conts``; if ``None``, features for the context base will be built
- """
-
- def __init__(self,
- resps=None, resp_vecs=None, conts=None, cont_vecs=None, **kwargs) -> None:
- super().__init__(**kwargs)
- self.resp_features = None
- self.cont_features = None
- if resps is not None and resp_vecs is None:
- log.info("Building BERT features for the response base...")
- resp_batch = [[el] for el in resps]
- self.resp_features = self(resp_batch)
- if conts is not None and cont_vecs is None:
- log.info("Building BERT features for the context base...")
- cont_batch = [[el] for el in conts]
- self.cont_features = self(cont_batch)
diff --git a/deeppavlov/models/preprocessors/siamese_preprocessor.py b/deeppavlov/models/preprocessors/siamese_preprocessor.py
deleted file mode 100644
index 9a7a92332e..0000000000
--- a/deeppavlov/models/preprocessors/siamese_preprocessor.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from logging import getLogger
-from typing import List, Union, Iterable, Optional
-
-import numpy as np
-
-from deeppavlov.core.commands.utils import expand_path
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.utils import zero_pad_truncate
-from deeppavlov.core.models.component import Component
-from deeppavlov.core.models.estimator import Estimator
-
-log = getLogger(__name__)
-
-
-@register('siamese_preprocessor')
-class SiamesePreprocessor(Estimator):
- """ Preprocessing of data samples containing text strings to feed them in a siamese network.
-
-    The first ``num_context_turns`` strings in each data sample correspond to the dialogue ``context``
-    and the remaining string(s) in the sample are ``response(s)``.
-
- Args:
- save_path: The parameter is only needed to initialize the base class
- :class:`~deeppavlov.core.models.serializable.Serializable`.
- load_path: The parameter is only needed to initialize the base class
- :class:`~deeppavlov.core.models.serializable.Serializable`.
- max_sequence_length: A maximum length of text sequences in tokens.
- Longer sequences will be truncated and shorter ones will be padded.
- dynamic_batch: Whether to use dynamic batching. If ``True``, the maximum length of a sequence for a batch
- will be equal to the maximum of all sequences lengths from this batch,
- but not higher than ``max_sequence_length``.
-        padding: Padding. Possible values are ``pre`` and ``post``.
-            If set to ``pre`` a sequence will be padded at the beginning.
-            If set to ``post`` it will be padded at the end.
-        truncating: Truncating. Possible values are ``pre`` and ``post``.
-            If set to ``pre`` a sequence will be truncated at the beginning.
-            If set to ``post`` it will be truncated at the end.
- use_matrix: Whether to use a trainable matrix with token (word) embeddings.
- num_context_turns: A number of ``context`` turns in data samples.
-        num_ranking_samples: A number of candidates for ranking, including the positive one.
-        add_raw_text: whether to add raw text sentences to the output data list.
-            Use in conjunction with models that use sentence encoders.
- tokenizer: An instance of one of the :class:`deeppavlov.models.tokenizers`.
- vocab: An instance of :class:`deeppavlov.core.data.simple_vocab.SimpleVocabulary`.
- embedder: an instance of one of the :class:`deeppavlov.models.embedders`.
-        sent_vocab: An instance of :class:`deeppavlov.core.data.simple_vocab.SimpleVocabulary`.
-            It is used to store all ``responses`` and to find the best ``response``
- to the user ``context`` in the ``interact`` mode.
- """
-
- def __init__(self,
- save_path: str = './tok.dict',
- load_path: str = './tok.dict',
- max_sequence_length: int = None,
- dynamic_batch: bool = False,
- padding: str = 'post',
- truncating: str = 'post',
- use_matrix: bool = True,
- num_context_turns: int = 1,
- num_ranking_samples: int = 1,
- add_raw_text: bool = False,
- tokenizer: Component = None,
- vocab: Optional[Estimator] = None,
- embedder: Optional[Component] = None,
- sent_vocab: Optional[Estimator] = None,
- **kwargs):
-
- self.max_sequence_length = max_sequence_length
- self.padding = padding
- self.truncating = truncating
- self.dynamic_batch = dynamic_batch
- self.use_matrix = use_matrix
- self.num_ranking_samples = num_ranking_samples
- self.num_context_turns = num_context_turns
- self.add_raw_text = add_raw_text
- self.tokenizer = tokenizer
- self.embedder = embedder
- self.vocab = vocab
- self.sent_vocab = sent_vocab
- self.save_path = expand_path(save_path).resolve()
- self.load_path = expand_path(load_path).resolve()
-
- super().__init__(load_path=self.load_path, save_path=self.save_path, **kwargs)
-
- def fit(self, x: List[List[str]]) -> None:
- if self.sent_vocab is not None:
- self.sent_vocab.fit([el[self.num_context_turns:] for el in x])
- x_tok = [self.tokenizer(el) for el in x]
- self.vocab.fit([el for x in x_tok for el in x])
-
- def __call__(self, x: Union[List[List[str]], List[str]]) -> Iterable[List[List[np.ndarray]]]:
- if len(x) == 0 or isinstance(x[0], str):
- if len(x) == 1: # interact mode: len(batch) == 1
- x_preproc = [[sent.strip() for sent in x[0].split('&')]] # List[str] -> List[List[str]]
- elif len(x) == 0:
- x_preproc = [['']]
- else:
- x_preproc = [[el] for el in x]
- else:
- x_preproc = [el[:self.num_context_turns + self.num_ranking_samples] for el in x]
- for el in x_preproc:
- x_tok = self.tokenizer(el)
- x_ctok = [y if len(y) != 0 else [''] for y in x_tok]
- if self.use_matrix:
- x_proc = self.vocab(x_ctok)
- else:
- x_proc = self.embedder(x_ctok)
- if self.dynamic_batch:
- msl = min((max([len(y) for el in x_tok for y in el]), self.max_sequence_length))
- else:
- msl = self.max_sequence_length
- x_proc = zero_pad_truncate(x_proc, msl, pad=self.padding, trunc=self.truncating)
- x_proc = list(x_proc)
- if self.add_raw_text:
- x_proc += el # add (self.num_context_turns+self.num_ranking_samples) raw sentences
- yield x_proc
-
- def load(self) -> None:
- pass
-
- def save(self) -> None:
- if self.sent_vocab is not None:
- self.sent_vocab.save()
- self.vocab.save()
diff --git a/deeppavlov/models/ranking/bilstm_gru_siamese_network.py b/deeppavlov/models/ranking/bilstm_gru_siamese_network.py
deleted file mode 100644
index fe243598a9..0000000000
--- a/deeppavlov/models/ranking/bilstm_gru_siamese_network.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from logging import getLogger
-
-from tensorflow.keras import backend as K
-from tensorflow.keras.layers import Input, GlobalMaxPooling1D, Lambda, Dense, GRU
-from tensorflow.keras.models import Model
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.models.ranking.bilstm_siamese_network import BiLSTMSiameseNetwork
-
-log = getLogger(__name__)
-
-
-@register('bilstm_gru_nn')
-class BiLSTMGRUSiameseNetwork(BiLSTMSiameseNetwork):
- """The class implementing a siamese neural network with BiLSTM, GRU and max pooling.
-
- GRU is used to take into account multi-turn dialogue ``context``.
-
- Args:
- len_vocab: A size of the vocabulary to build embedding layer.
- seed: Random seed.
- shared_weights: Whether to use shared weights in the model to encode ``contexts`` and ``responses``.
- embedding_dim: Dimensionality of token (word) embeddings.
- reccurent: A type of the RNN cell. Possible values are ``lstm`` and ``bilstm``.
- hidden_dim: Dimensionality of the hidden state of the RNN cell. If ``reccurent`` equals ``bilstm``
- ``hidden_dim`` should be doubled to get the actual dimensionality.
- max_pooling: Whether to use max-pooling operation to get ``context`` (``response``) vector representation.
- If ``False``, the last hidden state of the RNN will be used.
- triplet_loss: Whether to use a model with triplet loss.
- If ``False``, a model with crossentropy loss will be used.
- margin: A margin parameter for triplet loss. Only required if ``triplet_loss`` is set to ``True``.
-        hard_triplets: Whether to use hard-triplet sampling to train the model,
-            i.e. to choose negative samples close to positive ones.
- If set to ``False`` random sampling will be used.
- Only required if ``triplet_loss`` is set to ``True``.
- """
-
- def create_model(self) -> Model:
- input = []
- if self.use_matrix:
- for i in range(self.num_context_turns + 1):
- input.append(Input(shape=(self.max_sequence_length,)))
- context = input[:self.num_context_turns]
- response = input[-1]
- emb_layer = self.embedding_layer()
- emb_c = [emb_layer(el) for el in context]
- emb_r = emb_layer(response)
- else:
- for i in range(self.num_context_turns + 1):
- input.append(Input(shape=(self.max_sequence_length, self.embedding_dim,)))
- context = input[:self.num_context_turns]
- response = input[-1]
- emb_c = context
- emb_r = response
- lstm_layer = self.lstm_layer()
- lstm_c = [lstm_layer(el) for el in emb_c]
- lstm_r = lstm_layer(emb_r)
- pooling_layer = GlobalMaxPooling1D(name="pooling")
- lstm_c = [pooling_layer(el) for el in lstm_c]
- lstm_r = pooling_layer(lstm_r)
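-        # Stack the pooled per-turn context vectors into a sequence and aggregate
-        # the multi-turn context with a GRU.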
- lstm_c = [Lambda(lambda x: K.expand_dims(x, 1))(el) for el in lstm_c]
- lstm_c = Lambda(lambda x: K.concatenate(x, 1))(lstm_c)
- gru_layer = GRU(2 * self.hidden_dim, name="gru")
- gru_c = gru_layer(lstm_c)
-
- if self.triplet_mode:
- dist = Lambda(self._pairwise_distances)([gru_c, lstm_r])
- else:
- dist = Lambda(self._diff_mult_dist)([gru_c, lstm_r])
- dist = Dense(1, activation='sigmoid', name="score_model")(dist)
- model = Model(context + [response], dist)
- return model
-
- def create_score_model(self) -> Model:
- cr = self.model.inputs
- if self.triplet_mode:
- emb_c = self.model.get_layer("gru").output
- emb_r = self.model.get_layer("pooling").get_output(-1)
- dist_score = Lambda(lambda x: self.euclidian_dist(x), name="score_model")
- score = dist_score([emb_c, emb_r])
- else:
- score = self.model.get_layer("score_model").output
- score = Lambda(lambda x: 1. - K.squeeze(x, -1))(score)
-        score = Lambda(lambda x: 1. - x)(score)
- model = Model(cr, score)
- return model
-
- def create_context_model(self) -> Model:
- m = Model(self.model.inputs[:-1],
- self.model.get_layer("gru").output)
- return m
-
- def create_response_model(self) -> Model:
- m = Model(self.model.inputs[-1],
- self.model.get_layer("pooling").get_output_at(-1))
- return m
diff --git a/deeppavlov/models/ranking/deep_attention_matching_network_use_transformer.py b/deeppavlov/models/ranking/deep_attention_matching_network_use_transformer.py
deleted file mode 100644
index a9dc45ccd0..0000000000
--- a/deeppavlov/models/ranking/deep_attention_matching_network_use_transformer.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# Copyright 2018 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from logging import getLogger
-from typing import List, Dict, Tuple, Optional
-
-import numpy as np
-import tensorflow as tf
-import tensorflow_hub as hub
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.models.ranking.matching_models.dam_utils import layers
-from deeppavlov.models.ranking.matching_models.dam_utils import operations as op
-from deeppavlov.models.ranking.tf_base_matching_model import TensorflowBaseMatchingModel
-
-log = getLogger(__name__)
-
-
-@register('dam_nn_use_transformer')
-class DAMNetworkUSETransformer(TensorflowBaseMatchingModel):
- """
-    Tensorflow implementation of the Deep Attention Matching Network (DAM) [1] improved with USE [2].
-    We call it DAM-USE-T.
-
-    http://aclweb.org/anthology/P18-1103
-
- Based on Tensorflow code: https://github.com/baidu/Dialogue/tree/master/DAM
- We added USE-T [2] as a sentence encoder to the DAM network to achieve state-of-the-art performance on the datasets:
- * Ubuntu Dialogue Corpus v1 (R@1: 0.7929, R@2: 0.8912, R@5: 0.9742)
- * Ubuntu Dialogue Corpus v2 (R@1: 0.7414, R@2: 0.8656, R@5: 0.9731)
-
- References:
- [1]
- ```
- @inproceedings{ ,
- title={Multi-Turn Response Selection for Chatbots with Deep Attention Matching Network},
- author={Xiangyang Zhou, Lu Li, Daxiang Dong, Yi Liu, Ying Chen, Wayne Xin Zhao, Dianhai Yu and Hua Wu},
- booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
- volume={1},
- pages={ -- },
- year={2018}
- }
- ```
- [2] Cer D, Yang Y, Kong S-y, Hua N, Limtiaco N, John RS, et al. 2018. Universal sentence encoder.
- arXiv preprint arXiv:1803.11175 2018.
-
- Args:
- num_context_turns (int): A number of ``context`` turns in data samples.
- max_sequence_length(int): A maximum length of text sequences in tokens.
- Longer sequences will be truncated and shorter ones will be padded.
- learning_rate (float): Initial learning rate.
- emb_matrix (np.ndarray): An embeddings matrix to initialize an embeddings layer of a model.
- trainable_embeddings (bool): Whether train embeddings matrix or not.
- embedding_dim (int): Dimensionality of token (word) embeddings.
-        is_positional (bool): Whether to add sinusoids of different frequencies to the embeddings (positional encoding).
- stack_num (int): Number of stack layers, default is 5.
- seed (int): Random seed.
-        decay_steps (int): Number of steps after which the learning rate is decayed.
- """
-
- def __init__(self,
- embedding_dim: int = 200,
- max_sequence_length: int = 50,
- learning_rate: float = 1e-3,
- emb_matrix: Optional[np.ndarray] = None,
- trainable_embeddings: bool = False,
- is_positional: bool = True,
- stack_num: int = 5,
- seed: int = 65,
- decay_steps: int = 600,
- *args,
- **kwargs):
-
- self.seed = seed
- tf.set_random_seed(self.seed)
-
- self.max_sentence_len = max_sequence_length
- self.word_embedding_size = embedding_dim
- self.trainable = trainable_embeddings
- self.is_positional = is_positional
- self.stack_num = stack_num
- self.learning_rate = learning_rate
- self.emb_matrix = emb_matrix
- self.decay_steps = decay_steps
-
- super(DAMNetworkUSETransformer, self).__init__(*args, **kwargs)
-
- ##############################################################################
- self._init_graph()
- self.sess_config = tf.ConfigProto(allow_soft_placement=True)
- self.sess_config.gpu_options.allow_growth = True
- self.sess = tf.Session(config=self.sess_config)
- self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
- ##############################################################################
-
- if self.load_path is not None:
- self.load()
-
- def _init_placeholders(self):
- """ Init model placeholders """
- with tf.variable_scope('inputs'):
- # Utterances and their lengths
- self.utterance_ph = tf.placeholder(tf.int32, shape=(None, self.num_context_turns, self.max_sentence_len))
- self.all_utterance_len_ph = tf.placeholder(tf.int32, shape=(None, self.num_context_turns))
-
- # Responses and their lengths
- self.response_ph = tf.placeholder(tf.int32, shape=(None, self.max_sentence_len))
- self.response_len_ph = tf.placeholder(tf.int32, shape=(None,))
-
- # Labels
- self.y_true = tf.placeholder(tf.int32, shape=(None,))
-
- # Raw sentences for context and response
- self.context_sent_ph = tf.placeholder(tf.string,
- shape=(None, self.num_context_turns),
- name="context_sentences")
- self.response_sent_ph = tf.placeholder(tf.string, shape=(None,), name="response_sentences")
-
- def _init_sentence_encoder(self):
- """ Init sentence encoder, for example USE-T """
- # sentence encoder
- self.embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3",
- trainable=False)
-
- # embed sentences of context
- with tf.variable_scope('sentence_embeddings'):
- x = []
- for i in range(self.num_context_turns):
- x.append(self.embed(tf.reshape(self.context_sent_ph[:, i], shape=(tf.shape(self.context_sent_ph)[0],))))
- embed_context_turns = tf.stack(x, axis=1)
- embed_response = self.embed(self.response_sent_ph)
-
- # for context sentences: shape=(None, self.num_context_turns, 1, 512)
- self.sent_embedder_context = tf.expand_dims(embed_context_turns, axis=2)
- # for resp sentences: shape=(None, 1, 512)
- self.sent_embedder_response = tf.expand_dims(embed_response, axis=1)
-
- def _init_graph(self):
- self._init_placeholders()
- self._init_sentence_encoder()
-
- with tf.variable_scope('sentence_emb_dim_reduction'):
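-            # Project the 512-d USE sentence embeddings down to the 200-d token
-            # embedding space used by the rest of the network.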
- dense_emb = tf.layers.Dense(200,
- kernel_initializer=tf.keras.initializers.glorot_uniform(seed=42),
- kernel_regularizer=tf.keras.regularizers.l2(),
- bias_regularizer=tf.keras.regularizers.l2(),
- trainable=True)
-
- a = []
- for i in range(self.num_context_turns):
- a.append(dense_emb(self.sent_embedder_context[:, i]))
- sent_embedder_context = tf.stack(a, axis=1)
- sent_embedder_response = dense_emb(self.sent_embedder_response)
-
- with tf.variable_scope('embedding_matrix_init'):
- word_embeddings = tf.get_variable("word_embeddings_v",
- initializer=tf.constant(self.emb_matrix, dtype=tf.float32),
- trainable=self.trainable)
- with tf.variable_scope('embedding_lookup'):
- response_embeddings = tf.nn.embedding_lookup(word_embeddings, self.response_ph)
-
- Hr = response_embeddings
- if self.is_positional and self.stack_num > 0:
- with tf.variable_scope('positional'):
- Hr = op.positional_encoding_vector(Hr, max_timescale=10)
-
- with tf.variable_scope('expand_resp_embeddings'):
- Hr = tf.concat([sent_embedder_response, Hr], axis=1)
-
- Hr_stack = [Hr]
-
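-        # Stacked self-attention blocks build multi-granularity representations of
-        # the response; every intermediate level is kept for matching.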
- for index in range(self.stack_num):
- with tf.variable_scope('self_stack_' + str(index)):
- Hr = layers.block(
- Hr, Hr, Hr,
- Q_lengths=self.response_len_ph, K_lengths=self.response_len_ph, attention_type='dot')
- Hr_stack.append(Hr)
-
- # context part
- # a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
- list_turn_t = tf.unstack(self.utterance_ph, axis=1)
- list_turn_length = tf.unstack(self.all_utterance_len_ph, axis=1)
- list_turn_t_sent = tf.unstack(sent_embedder_context, axis=1)
-
- sim_turns = []
- # for every turn_t calculate matching vector
- for turn_t, t_turn_length, turn_t_sent in zip(list_turn_t, list_turn_length, list_turn_t_sent):
- Hu = tf.nn.embedding_lookup(word_embeddings, turn_t) # [batch, max_turn_len, emb_size]
-
- if self.is_positional and self.stack_num > 0:
- with tf.variable_scope('positional', reuse=True):
- Hu = op.positional_encoding_vector(Hu, max_timescale=10)
-
- with tf.variable_scope('expand_cont_embeddings'):
- Hu = tf.concat([turn_t_sent, Hu], axis=1)
-
- Hu_stack = [Hu]
-
- for index in range(self.stack_num):
- with tf.variable_scope('self_stack_' + str(index), reuse=True):
- Hu = layers.block(
- Hu, Hu, Hu,
- Q_lengths=t_turn_length, K_lengths=t_turn_length, attention_type='dot')
-
- Hu_stack.append(Hu)
-
- r_a_t_stack = []
- t_a_r_stack = []
- for index in range(self.stack_num + 1):
-
- with tf.variable_scope('t_attend_r_' + str(index)):
- try:
- t_a_r = layers.block(
- Hu_stack[index], Hr_stack[index], Hr_stack[index],
- Q_lengths=t_turn_length, K_lengths=self.response_len_ph, attention_type='dot')
- except ValueError:
- tf.get_variable_scope().reuse_variables()
- t_a_r = layers.block(
- Hu_stack[index], Hr_stack[index], Hr_stack[index],
- Q_lengths=t_turn_length, K_lengths=self.response_len_ph, attention_type='dot')
-
- with tf.variable_scope('r_attend_t_' + str(index)):
- try:
- r_a_t = layers.block(
- Hr_stack[index], Hu_stack[index], Hu_stack[index],
- Q_lengths=self.response_len_ph, K_lengths=t_turn_length, attention_type='dot')
- except ValueError:
- tf.get_variable_scope().reuse_variables()
- r_a_t = layers.block(
- Hr_stack[index], Hu_stack[index], Hu_stack[index],
- Q_lengths=self.response_len_ph, K_lengths=t_turn_length, attention_type='dot')
-
- t_a_r_stack.append(t_a_r)
- r_a_t_stack.append(r_a_t)
-
- t_a_r_stack.extend(Hu_stack)
- r_a_t_stack.extend(Hr_stack)
-
- t_a_r = tf.stack(t_a_r_stack, axis=-1)
- r_a_t = tf.stack(r_a_t_stack, axis=-1)
-
- # log.info(t_a_r, r_a_t) # debug
-
- # calculate similarity matrix
- with tf.variable_scope('similarity'):
- # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1]
- # divide sqrt(200) to prevent gradient explosion
- sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(float(self.word_embedding_size))
-
- sim_turns.append(sim)
-
- # cnn and aggregation
- sim = tf.stack(sim_turns, axis=1)
- log.info('sim shape: %s' % sim.shape)
- with tf.variable_scope('cnn_aggregation'):
- final_info = layers.CNN_3d(sim, 32, 32) # We can improve performance if use 32 filters for each layer
- # for douban
- # final_info = layers.CNN_3d(sim, 16, 16)
-
- # loss and train
- with tf.variable_scope('loss'):
- self.loss, self.logits = layers.loss(final_info, self.y_true, clip_value=10.)
- self.y_pred = tf.nn.softmax(self.logits, name="y_pred")
- tf.summary.scalar('loss', self.loss)
-
- self.global_step = tf.Variable(0, trainable=False)
- initial_learning_rate = self.learning_rate
- self.learning_rate = tf.train.exponential_decay(
- initial_learning_rate,
- global_step=self.global_step,
- decay_steps=self.decay_steps,
- decay_rate=0.9,
- staircase=True)
-
- Optimizer = tf.train.AdamOptimizer(self.learning_rate)
- self.grads_and_vars = Optimizer.compute_gradients(self.loss)
-
- for grad, var in self.grads_and_vars:
- if grad is None:
- log.info(var)
-
- self.capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in self.grads_and_vars]
- self.train_op = Optimizer.apply_gradients(
- self.capped_gvs,
- global_step=self.global_step)
-
- # Debug
- self.print_number_of_parameters()
-
- def _append_sample_to_batch_buffer(self, sample: List[np.ndarray], buf: List[Tuple]):
- """
- The function for adding samples to the batch buffer
-
- Args:
- sample (List[nd.array]): samples generator
- buf (List[Tuple[np.ndarray]]) : List of samples with model inputs each:
- [( context, context_len, response, response_len ), ( ... ), ... ].
-
- Returns:
- None
- """
- sample_len = len(sample)
-
- batch_buffer_context = [] # [batch_size, 10, 50]
- batch_buffer_context_len = [] # [batch_size, 10]
- batch_buffer_response = [] # [batch_size, 50]
- batch_buffer_response_len = [] # [batch_size]
-
- raw_batch_buffer_context = [] # [batch_size, 10]
- raw_batch_buffer_response = [] # [batch_size]
-
- context_sentences = sample[:self.num_context_turns]
- response_sentences = sample[self.num_context_turns:sample_len // 2]
-
- raw_context_sentences = sample[sample_len // 2:sample_len // 2 + self.num_context_turns]
- raw_response_sentences = sample[sample_len // 2 + self.num_context_turns:]
-
- # Format model inputs:
- # 4 model inputs
-
- # 1. Token indices for context
- batch_buffer_context += [context_sentences for sent in response_sentences] # replicate context N times
- # 2. Token indices for response
- batch_buffer_response += [response_sentence for response_sentence in response_sentences]
- # 3. Lengths of all context sentences
- lens = []
- for context in [context_sentences for sent in response_sentences]: # replicate context N times
- context_sentences_lens = []
- for sent in context:
- sent_len = len(sent[sent != 0])
- sent_len = sent_len + 1 if sent_len > 0 else 0 # 1 additional token is the USE token
- context_sentences_lens.append(sent_len)
- lens.append(context_sentences_lens)
- batch_buffer_context_len += lens
- # 4. Length of response
- lens = []
- for response in [response_sentence for response_sentence in response_sentences]:
- sent_len = len(response[response != 0])
- sent_len = sent_len + 1 if sent_len > 0 else 0 # 1 additional token is the USE token
- lens.append(sent_len)
- batch_buffer_response_len += lens
- # 5. Raw context sentences
- raw_batch_buffer_context += [raw_context_sentences for sent in raw_response_sentences]
- # 6. Raw response sentences
- raw_batch_buffer_response += [raw_sent for raw_sent in raw_response_sentences]
-
- for i in range(len(batch_buffer_context)):
- buf.append(tuple((
- batch_buffer_context[i],
- batch_buffer_context_len[i],
- batch_buffer_response[i],
- batch_buffer_response_len[i],
- raw_batch_buffer_context[i],
- raw_batch_buffer_response[i]
- )))
- return len(response_sentences)
-
- def _make_batch(self, batch: List[Tuple[np.ndarray]]) -> Dict:
- """
- The function for formatting model inputs
-
- Args:
- batch (List[Tuple[np.ndarray]]): List of samples with model inputs each:
- [( context, context_len, response, response_len ), ( ... ), ... ].
- graph (str): which graph the inputs is preparing for
-
- Returns:
- Dict: feed_dict to feed a model
- """
- input_context = []
- input_context_len = []
- input_response = []
- input_response_len = []
- input_raw_context = []
- input_raw_response = []
-
- # format model inputs for MAIN graph as numpy arrays
- for sample in batch:
- input_context.append(sample[0])
- input_context_len.append(sample[1])
- input_response.append(sample[2])
- input_response_len.append(sample[3])
- input_raw_context.append(sample[4]) # raw context is the 4th element of each Tuple in the batch
- input_raw_response.append(sample[5]) # raw response is the 5th element of each Tuple in the batch
-
- return {
- self.utterance_ph: np.array(input_context),
- self.all_utterance_len_ph: np.array(input_context_len),
- self.response_ph: np.array(input_response),
- self.response_len_ph: np.array(input_response_len),
- self.context_sent_ph: np.array(input_raw_context),
- self.response_sent_ph: np.array(input_raw_response)
- }
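
For context on the deleted `_append_sample_to_batch_buffer` above: each incoming sample packs tokenized context turns, one or more candidate responses, and their raw-string counterparts into one flat list, and the context is replicated once per candidate so that every (context, response) pair is scored in a single batch. A minimal, framework-free sketch of that slicing (the sample values here are illustrative, not taken from the model):

    import numpy as np

    num_context_turns = 2
    # [tokenized context turns | tokenized responses | raw context | raw responses]
    sample = [
        np.array([3, 5, 0]), np.array([7, 0, 0]),  # tokenized context turns
        np.array([2, 4, 6]), np.array([8, 9, 0]),  # two candidate responses
        "hi there", "how are you",                 # raw context turns
        "fine thanks", "good bye",                 # raw candidate responses
    ]
    half = len(sample) // 2
    context = sample[:num_context_turns]
    responses = sample[num_context_turns:half]
    raw_context = sample[half:half + num_context_turns]
    raw_responses = sample[half + num_context_turns:]
    # the context is replicated once per candidate response
    pairs = [(context, response) for response in responses]
    assert len(pairs) == len(responses) == 2
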
diff --git a/deeppavlov/models/ranking/mpm_siamese_network.py b/deeppavlov/models/ranking/mpm_siamese_network.py
deleted file mode 100644
index cccc26f508..0000000000
--- a/deeppavlov/models/ranking/mpm_siamese_network.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from logging import getLogger
-
-from tensorflow.keras import backend as K
-from tensorflow.keras.initializers import glorot_uniform, Orthogonal
-from tensorflow.keras.layers import Input, LSTM, Lambda, Dense, Dropout, Bidirectional
-from tensorflow.keras.models import Model
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.layers.keras_layers import AttentiveMatchingLayer, MaxattentiveMatchingLayer
-from deeppavlov.core.layers.keras_layers import FullMatchingLayer, MaxpoolingMatchingLayer
-from deeppavlov.models.ranking.bilstm_siamese_network import BiLSTMSiameseNetwork
-
-log = getLogger(__name__)
-
-
-@register('mpm_nn')
-class MPMSiameseNetwork(BiLSTMSiameseNetwork):
- """The class implementing a siamese neural network with bilateral multi-Perspective matching.
-
- The network architecture is based on https://arxiv.org/abs/1702.03814.
-
- Args:
- dense_dim: Dimensionality of the dense layer.
- perspective_num: Number of perspectives in multi-perspective matching layers.
- aggregation dim: Dimensionality of the hidden state in the second BiLSTM layer.
- inpdrop_val: Float between 0 and 1. A dropout value for the linear transformation of the inputs.
- recdrop_val: Float between 0 and 1. A dropout value for the linear transformation of the recurrent state.
- ldrop_val: A dropout value of the dropout layer before the second BiLSTM layer.
- dropout_val: A dropout value of the dropout layer after the second BiLSTM layer.
- """
-
- def __init__(self,
- dense_dim: int = 50,
- perspective_num: int = 20,
- aggregation_dim: int = 200,
- recdrop_val: float = 0.0,
- inpdrop_val: float = 0.0,
- ldrop_val: float = 0.0,
- dropout_val: float = 0.0,
- *args,
- **kwargs) -> None:
-
- self.dense_dim = dense_dim
- self.perspective_num = perspective_num
- self.aggregation_dim = aggregation_dim
- self.ldrop_val = ldrop_val
- self.recdrop_val = recdrop_val
- self.inpdrop_val = inpdrop_val
- self.dropout_val = dropout_val
- self.seed = kwargs.get("triplet_loss")
- self.triplet_mode = kwargs.get("triplet_loss")
-
- super(MPMSiameseNetwork, self).__init__(*args, **kwargs)
-
- def create_lstm_layer_1(self):
- ker_in = glorot_uniform(seed=self.seed)
- rec_in = Orthogonal(seed=self.seed)
- bioutp = Bidirectional(LSTM(self.hidden_dim,
- input_shape=(self.max_sequence_length, self.embedding_dim,),
- kernel_regularizer=None,
- recurrent_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- recurrent_dropout=self.recdrop_val,
- dropout=self.inpdrop_val,
- kernel_initializer=ker_in,
- recurrent_initializer=rec_in,
- return_sequences=True), merge_mode=None)
- return bioutp
-
- def create_lstm_layer_2(self):
- ker_in = glorot_uniform(seed=self.seed)
- rec_in = Orthogonal(seed=self.seed)
- bioutp = Bidirectional(LSTM(self.aggregation_dim,
- input_shape=(self.max_sequence_length, 8 * self.perspective_num,),
- kernel_regularizer=None,
- recurrent_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- recurrent_dropout=self.recdrop_val,
- dropout=self.inpdrop_val,
- kernel_initializer=ker_in,
- recurrent_initializer=rec_in,
- return_sequences=False),
- merge_mode='concat',
- name="sentence_embedding")
- return bioutp
-
- def create_model(self) -> Model:
- if self.use_matrix:
- context = Input(shape=(self.max_sequence_length,))
- response = Input(shape=(self.max_sequence_length,))
- emb_layer = self.embedding_layer()
- emb_c = emb_layer(context)
- emb_r = emb_layer(response)
- else:
- context = Input(shape=(self.max_sequence_length, self.embedding_dim,))
- response = Input(shape=(self.max_sequence_length, self.embedding_dim,))
- emb_c = context
- emb_r = response
- lstm_layer = self.create_lstm_layer_1()
- lstm_a = lstm_layer(emb_c)
- lstm_b = lstm_layer(emb_r)
-
- f_layer_f = FullMatchingLayer(self.perspective_num)
- f_layer_b = FullMatchingLayer(self.perspective_num)
- f_a_forw = f_layer_f([lstm_a[0], lstm_b[0]])[0]
- f_a_back = f_layer_b([Lambda(lambda x: K.reverse(x, 1))(lstm_a[1]),
- Lambda(lambda x: K.reverse(x, 1))(lstm_b[1])])[0]
- f_a_back = Lambda(lambda x: K.reverse(x, 1))(f_a_back)
- f_b_forw = f_layer_f([lstm_b[0], lstm_a[0]])[0]
- f_b_back = f_layer_b([Lambda(lambda x: K.reverse(x, 1))(lstm_b[1]),
- Lambda(lambda x: K.reverse(x, 1))(lstm_a[1])])[0]
- f_b_back = Lambda(lambda x: K.reverse(x, 1))(f_b_back)
-
- mp_layer_f = MaxpoolingMatchingLayer(self.perspective_num)
- mp_layer_b = MaxpoolingMatchingLayer(self.perspective_num)
- mp_a_forw = mp_layer_f([lstm_a[0], lstm_b[0]])[0]
- mp_a_back = mp_layer_b([lstm_a[1], lstm_b[1]])[0]
- mp_b_forw = mp_layer_f([lstm_b[0], lstm_a[0]])[0]
- mp_b_back = mp_layer_b([lstm_b[1], lstm_a[1]])[0]
-
- at_layer_f = AttentiveMatchingLayer(self.perspective_num)
- at_layer_b = AttentiveMatchingLayer(self.perspective_num)
- at_a_forw = at_layer_f([lstm_a[0], lstm_b[0]])[0]
- at_a_back = at_layer_b([lstm_a[1], lstm_b[1]])[0]
- at_b_forw = at_layer_f([lstm_b[0], lstm_a[0]])[0]
- at_b_back = at_layer_b([lstm_b[1], lstm_a[1]])[0]
-
- ma_layer_f = MaxattentiveMatchingLayer(self.perspective_num)
- ma_layer_b = MaxattentiveMatchingLayer(self.perspective_num)
- ma_a_forw = ma_layer_f([lstm_a[0], lstm_b[0]])[0]
- ma_a_back = ma_layer_b([lstm_a[1], lstm_b[1]])[0]
- ma_b_forw = ma_layer_f([lstm_b[0], lstm_a[0]])[0]
- ma_b_back = ma_layer_b([lstm_b[1], lstm_a[1]])[0]
-
- concat_a = Lambda(lambda x: K.concatenate(x, axis=-1))([f_a_forw, f_a_back,
- mp_a_forw, mp_a_back,
- at_a_forw, at_a_back,
- ma_a_forw, ma_a_back])
- concat_b = Lambda(lambda x: K.concatenate(x, axis=-1))([f_b_forw, f_b_back,
- mp_b_forw, mp_b_back,
- at_b_forw, at_b_back,
- ma_b_forw, ma_b_back])
-
- concat_a = Dropout(self.ldrop_val)(concat_a)
- concat_b = Dropout(self.ldrop_val)(concat_b)
-
- lstm_layer_agg = self.create_lstm_layer_2()
- agg_a = lstm_layer_agg(concat_a)
- agg_b = lstm_layer_agg(concat_b)
-
- agg_a = Dropout(self.dropout_val)(agg_a)
- agg_b = Dropout(self.dropout_val)(agg_b)
-
- reduced = Lambda(lambda x: K.concatenate(x, axis=-1))([agg_a, agg_b])
-
- if self.triplet_mode:
- dist = Lambda(self._pairwise_distances)([agg_a, agg_b])
- else:
- ker_in = glorot_uniform(seed=self.seed)
- dense = Dense(self.dense_dim, kernel_initializer=ker_in)(reduced)
- dist = Dense(1, activation='sigmoid', name="score_model")(dense)
- model = Model([context, response], dist)
- return model
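
The deleted `create_model` above wires four matching strategies (full, max-pooling, attentive, max-attentive) in both directions, following Wang et al. (2017). As a reference point, here is a hedged numpy sketch of the "full matching" step alone, where every time step of one sentence is compared with the other sentence's final hidden state under l learned perspective vectors (names and shapes are illustrative, not the Keras layer's API):

    import numpy as np

    def full_matching(h1: np.ndarray, h2_last: np.ndarray, W: np.ndarray) -> np.ndarray:
        """h1: (T, d) hidden states; h2_last: (d,) final state; W: (l, d) perspectives."""
        a = h1[:, None, :] * W[None, :, :]       # (T, l, d) perspective-weighted states
        b = h2_last[None, :] * W                 # (l, d) perspective-weighted final state
        num = (a * b[None, :, :]).sum(axis=-1)   # (T, l) dot products
        den = np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1)[None, :]
        return num / np.maximum(den, 1e-8)       # (T, l) cosine matching tensor

    m = full_matching(np.random.rand(50, 100), np.random.rand(100), np.random.rand(20, 100))
    assert m.shape == (50, 20)
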
diff --git a/deeppavlov/models/ranking/sequential_matching_network.py b/deeppavlov/models/ranking/sequential_matching_network.py
deleted file mode 100644
index a9222897af..0000000000
--- a/deeppavlov/models/ranking/sequential_matching_network.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2018 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from logging import getLogger
-from typing import Optional
-
-import numpy as np
-import tensorflow as tf
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.models.ranking.tf_base_matching_model import TensorflowBaseMatchingModel
-
-log = getLogger(__name__)
-
-
-@register('smn_nn')
-class SMNNetwork(TensorflowBaseMatchingModel):
- """
- Tensorflow implementation of Sequential Matching Network
-
- Wu, Yu, et al. "Sequential Matching Network: A New Architecture for Multi-turn Response Selection in
- Retrieval-based Chatbots." ACL. 2017.
- https://arxiv.org/abs/1612.01627
-
- Based on authors' Tensorflow code: https://github.com/MarkWuNLP/MultiTurnResponseSelection
-
- Args:
- num_context_turns (int): A number of ``context`` turns in data samples.
- max_sequence_length (int): A maximum length of text sequences in tokens.
- Longer sequences will be truncated and shorter ones will be padded.
- learning_rate (float): Initial learning rate.
- emb_matrix (np.ndarray): An embeddings matrix to initialize an embeddings layer of a model.
- trainable_embeddings (bool): Whether train embeddings matrix or not.
- embedding_dim (int): Dimensionality of token (word) embeddings.
- """
-
- def __init__(self,
- embedding_dim: int = 200,
- max_sequence_length: int = 50,
- learning_rate: float = 1e-3,
- emb_matrix: Optional[np.ndarray] = None,
- trainable_embeddings: bool = False,
- *args,
- **kwargs):
-
- self.max_sentence_len = max_sequence_length
- self.word_embedding_size = embedding_dim
- self.trainable = trainable_embeddings
- self.learning_rate = learning_rate
- self.emb_matrix = emb_matrix
-
- super(SMNNetwork, self).__init__(*args, **kwargs)
-
- self.sess_config = tf.ConfigProto(allow_soft_placement=True)
- self.sess_config.gpu_options.allow_growth = True
- self.sess = tf.Session(config=self.sess_config)
- self._init_graph()
- self.sess.run(tf.global_variables_initializer())
-
- if self.load_path is not None:
- self.load()
-
- def _init_placeholders(self):
- with tf.variable_scope('inputs'):
- # Utterances and their lengths
- self.utterance_ph = tf.placeholder(tf.int32, shape=(None, self.num_context_turns, self.max_sentence_len))
- self.all_utterance_len_ph = tf.placeholder(tf.int32, shape=(None, self.num_context_turns))
-
- # Responses and their lengths
- self.response_ph = tf.placeholder(tf.int32, shape=(None, self.max_sentence_len))
- self.response_len_ph = tf.placeholder(tf.int32, shape=(None,))
-
- # Labels
- self.y_true = tf.placeholder(tf.int32, shape=(None,))
-
- def _init_graph(self):
- self._init_placeholders()
-
- word_embeddings = tf.get_variable("word_embeddings_v",
- initializer=tf.constant(self.emb_matrix, dtype=tf.float32),
- trainable=self.trainable)
-
- all_utterance_embeddings = tf.nn.embedding_lookup(word_embeddings, self.utterance_ph)
- response_embeddings = tf.nn.embedding_lookup(word_embeddings, self.response_ph)
- sentence_GRU = tf.nn.rnn_cell.GRUCell(self.word_embedding_size, kernel_initializer=tf.orthogonal_initializer())
- all_utterance_embeddings = tf.unstack(all_utterance_embeddings, num=self.num_context_turns,
- axis=1) # list of self.num_context_turns tensors with shape (?, 200)
- all_utterance_len = tf.unstack(self.all_utterance_len_ph, num=self.num_context_turns, axis=1)
- A_matrix = tf.get_variable('A_matrix_v', shape=(self.word_embedding_size, self.word_embedding_size),
- initializer=tf.contrib.layers.xavier_initializer(), dtype=tf.float32)
- final_GRU = tf.nn.rnn_cell.GRUCell(self.word_embedding_size, kernel_initializer=tf.orthogonal_initializer())
- reuse = None
-
- response_GRU_embeddings, _ = tf.nn.dynamic_rnn(sentence_GRU,
- response_embeddings,
- sequence_length=self.response_len_ph,
- dtype=tf.float32,
- scope='sentence_GRU')
- response_embeddings = tf.transpose(response_embeddings, perm=[0, 2, 1])
- response_GRU_embeddings = tf.transpose(response_GRU_embeddings, perm=[0, 2, 1])
- matching_vectors = []
- for utterance_embeddings, utterance_len in zip(all_utterance_embeddings, all_utterance_len):
- matrix1 = tf.matmul(utterance_embeddings, response_embeddings)
- utterance_GRU_embeddings, _ = tf.nn.dynamic_rnn(sentence_GRU,
- utterance_embeddings,
- sequence_length=utterance_len,
- dtype=tf.float32,
- scope='sentence_GRU')
- matrix2 = tf.einsum('aij,jk->aik', utterance_GRU_embeddings, A_matrix) # TODO:check this
- matrix2 = tf.matmul(matrix2, response_GRU_embeddings)
- matrix = tf.stack([matrix1, matrix2], axis=3, name='matrix_stack')
- conv_layer = tf.layers.conv2d(matrix, filters=8, kernel_size=(3, 3), padding='VALID',
- kernel_initializer=tf.contrib.keras.initializers.he_normal(),
- activation=tf.nn.relu, reuse=reuse, name='conv') # TODO: check other params
- pooling_layer = tf.layers.max_pooling2d(conv_layer, (3, 3), strides=(3, 3),
- padding='VALID', name='max_pooling') # TODO: check other params
- matching_vector = tf.layers.dense(tf.contrib.layers.flatten(pooling_layer), 50,
- kernel_initializer=tf.contrib.layers.xavier_initializer(),
- activation=tf.tanh, reuse=reuse,
- name='matching_v') # TODO: check wthether this is correct
- if not reuse:
- reuse = True
- matching_vectors.append(matching_vector)
- _, last_hidden = tf.nn.dynamic_rnn(final_GRU,
- tf.stack(matching_vectors, axis=0, name='matching_stack'),
- # resulting shape: (10, ?, 50)
- dtype=tf.float32,
- time_major=True,
- scope='final_GRU') # TODO: check time_major
- logits = tf.layers.dense(last_hidden, 2, kernel_initializer=tf.contrib.layers.xavier_initializer(),
- name='final_v')
- self.y_pred = tf.nn.softmax(logits)
- self.logits = logits
- self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y_true, logits=logits))
- optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
- self.train_op = optimizer.minimize(self.loss)
-
- # Debug
- self.print_number_of_parameters()
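
The core of the deleted graph above is the pair of similarity matrices stacked per utterance: one over raw word embeddings and one over GRU hidden states passed through the bilinear matrix A, which together feed the 2D convolution. A small numpy sketch of just that step, with illustrative shapes:

    import numpy as np

    d, m, n = 200, 50, 50                                # emb size, utterance len, response len
    U, R = np.random.rand(m, d), np.random.rand(n, d)    # word embeddings
    Uh, Rh = np.random.rand(m, d), np.random.rand(n, d)  # GRU hidden states
    A = np.random.rand(d, d)
    M1 = U @ R.T                                         # (m, n) word-word similarity
    M2 = Uh @ A @ Rh.T                                   # (m, n) hidden-state similarity
    matrix = np.stack([M1, M2], axis=-1)                 # (m, n, 2), the conv2d input
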
diff --git a/deeppavlov/models/ranking/siamese_predictor.py b/deeppavlov/models/ranking/siamese_predictor.py
deleted file mode 100644
index a42dccc22b..0000000000
--- a/deeppavlov/models/ranking/siamese_predictor.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from logging import getLogger
-from typing import List, Iterable, Callable, Union
-
-import numpy as np
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.simple_vocab import SimpleVocabulary
-from deeppavlov.core.models.component import Component
-from deeppavlov.models.ranking.keras_siamese_model import SiameseModel
-
-log = getLogger(__name__)
-
-
-@register('siamese_predictor')
-class SiamesePredictor(Component):
- """The class for ranking or paraphrase identification using the trained siamese network in the ``interact`` mode.
-
- Args:
- batch_size: A size of a batch.
- num_context_turns: A number of ``context`` turns in data samples.
- ranking: Whether to perform ranking.
- If it is set to ``False`` paraphrase identification will be performed.
- attention: Whether any attention mechanism is used in the siamese network.
- If ``False`` then calculated in advance vectors of ``responses``
- will be used to obtain similarity score for the input ``context``;
- Otherwise the whole siamese architecture will be used
- to obtain similarity score for the input ``context`` and each particular ``response``.
- The parameter will be used if the ``ranking`` is set to ``True``.
- responses: A instance of :class:`~deeppavlov.core.data.simple_vocab.SimpleVocabulary`
- with all possible ``responses`` to perform ranking.
- Will be used if the ``ranking`` is set to ``True``.
- preproc_func: A ``__call__`` function of the
- :class:`~deeppavlov.models.preprocessors.siamese_preprocessor.SiamesePreprocessor`.
- interact_pred_num: The number of the most relevant ``responses`` which will be returned.
- Will be used if the ``ranking`` is set to ``True``.
- **kwargs: Other parameters.
- """
-
- def __init__(self,
- model: SiameseModel,
- batch_size: int,
- num_context_turns: int = 1,
- ranking: bool = True,
- attention: bool = False,
- responses: SimpleVocabulary = None,
- preproc_func: Callable = None,
- interact_pred_num: int = 3,
- *args, **kwargs) -> None:
-
- super().__init__()
-
- self.batch_size = batch_size
- self.num_context_turns = num_context_turns
- self.ranking = ranking
- self.attention = attention
- self.preproc_responses = []
- self.response_embeddings = None
- self.preproc_func = preproc_func
- self.interact_pred_num = interact_pred_num
- self.model = model
- if self.ranking:
- self.responses = {el[1]: el[0] for el in responses.items()}
- self._build_preproc_responses()
- if not self.attention:
- self._build_response_embeddings()
-
- def __call__(self, batch: Iterable[List[np.ndarray]]) -> List[Union[List[str], str]]:
- context = next(batch)
- try:
- next(batch)
- log.error("It is not intended to use the `%s` with the batch size greater then 1." % self.__class__)
- except StopIteration:
- pass
-
- if self.ranking:
- if len(context) == self.num_context_turns:
- scores = []
- if self.attention:
- for i in range(len(self.preproc_responses) // self.batch_size + 1):
- responses = self.preproc_responses[i * self.batch_size: (i + 1) * self.batch_size]
- b = [context + el for el in responses]
- b = self.model._make_batch(b)
- sc = self.model._predict_on_batch(b)
- scores += list(sc)
- else:
- b = self.model._make_batch([context])
- context_emb = self.model._predict_context_on_batch(b)
- context_emb = np.squeeze(context_emb, axis=0)
- scores = context_emb @ self.response_embeddings.T
- ids = np.flip(np.argsort(scores), -1)
- return [[self.responses[el] for el in ids[:self.interact_pred_num]]]
- else:
- return ["Please, provide contexts separated by '&' in the number equal to that used while training."]
-
- else:
- if len(context) == 2:
- b = self.model._make_batch([context])
- sc = self.model._predict_on_batch(b)[0]
- if sc > 0.5:
- return ["This is a paraphrase."]
- else:
- return ["This is not a paraphrase."]
- else:
- return ["Please, provide two sentences separated by '&'."]
-
- def reset(self) -> None:
- pass
-
- def process_event(self) -> None:
- pass
-
- def _build_response_embeddings(self) -> None:
- resp_vecs = []
- for i in range(len(self.preproc_responses) // self.batch_size + 1):
- resp_preproc = self.preproc_responses[i * self.batch_size: (i + 1) * self.batch_size]
- resp_preproc = self.model._make_batch(resp_preproc)
- resp_preproc = resp_preproc
- resp_vecs.append(self.model._predict_response_on_batch(resp_preproc))
- self.response_embeddings = np.vstack(resp_vecs)
-
- def _build_preproc_responses(self) -> None:
- responses = list(self.responses.values())
- for i in range(len(responses) // self.batch_size + 1):
- el = self.preproc_func(responses[i * self.batch_size: (i + 1) * self.batch_size])
- self.preproc_responses += list(el)
-
- def rebuild_responses(self, candidates) -> None:
- self.attention = True
- self.interact_pred_num = 1
- self.preproc_responses = list()
- self.responses = {idx: sentence for idx, sentence in enumerate(candidates)}
- self._build_preproc_responses()
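
When `attention` is off, the deleted predictor above reduces ranking to a single matrix product against precomputed response embeddings. A minimal sketch of that fast path (the embedding sizes and response texts are made up for illustration):

    import numpy as np

    response_embeddings = np.random.rand(1000, 300)       # precomputed offline
    responses = {i: f"response {i}" for i in range(1000)}
    context_emb = np.random.rand(300)                     # encoded incoming context
    scores = context_emb @ response_embeddings.T          # one score per response
    top_ids = np.flip(np.argsort(scores))[:3]             # best three candidates
    print([responses[int(i)] for i in top_ids])
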
diff --git a/deeppavlov/models/spelling_correction/levenshtein/levenshtein_searcher.py b/deeppavlov/models/spelling_correction/levenshtein/levenshtein_searcher.py
index 3c05d6848c..dfebaf228b 100644
--- a/deeppavlov/models/spelling_correction/levenshtein/levenshtein_searcher.py
+++ b/deeppavlov/models/spelling_correction/levenshtein/levenshtein_searcher.py
@@ -23,8 +23,8 @@
class LevenshteinSearcher:
"""
- Класс для поиска близких слов
- в соответствии с расстоянием Левенштейна
+ A class for searching for words close to a given one
+ according to the Levenshtein distance
"""
@@ -42,7 +42,7 @@ def __init__(self, alphabet, dictionary, operation_costs=None,
else:
raise ValueError("Euristics should be non-negative integer or None")
if isinstance(dictionary, Trie):
- # словарь передан уже в виде бора
+ # the dictionary has already been passed as a Trie
self.dictionary = dictionary
else:
self.dictionary = make_trie(alphabet, dictionary, make_cashed=True,
@@ -70,15 +70,15 @@ def search(self, word, d, allow_spaces=True, return_cost=True):
def _trie_search(self, word, d, transducer=None,
allow_spaces=True, return_cost=True):
"""
- Находит все слова в префиксном боре, расстояние до которых
- в соответствии с заданным преобразователем не превышает d
+ Finds all words in the Trie whose distance from the given word,
+ according to the specified transducer, does not exceed d
"""
if transducer is None:
- # разобраться с пробелами
+ # handle spaces
transducer = self.transducer.inverse()
allow_spaces &= self.allow_spaces
trie = self.dictionary
- # инициализация переменных
+ # variable initialization
used_agenda_keys = set()
agenda = SortedListWithKey(key=(lambda x: x[1]))
h = self.h_func(word, trie.root)
@@ -87,7 +87,7 @@ def _trie_search(self, word, d, transducer=None,
agenda.add((key, value))
answer = dict()
k = 0
- # очередь с приоритетом с промежуточными результатами
+ # priority queue with intermediate results
while len(agenda) > 0:
key, value = agenda.pop(0)
if key in used_agenda_keys:
@@ -95,8 +95,8 @@ def _trie_search(self, word, d, transducer=None,
used_agenda_keys.add(key)
low, pos, index = key
cost, g, h = value
- # g --- текущая стоимость, h --- нижняя оценка будущей стоимости
- # cost = g + h --- нижняя оценка суммарной стоимости
+ # g --- current cost, h --- lower bound on the future cost
+ # cost = g + h --- lower bound on the total cost
k += 1
max_upperside_length = min(len(word) - pos, transducer.max_up_length)
for upperside_length in range(max_upperside_length + 1):
@@ -106,7 +106,7 @@ def _trie_search(self, word, d, transducer=None,
continue
for curr_low, curr_cost in transducer.operation_costs[curr_up].items():
new_g = g + curr_cost
- if new_g > d: # если g > d, то h можно не вычислять
+ if new_g > d: # if g > d, then h need not be computed
continue
if curr_low == " ":
if allow_spaces and trie.is_final(index):
@@ -137,13 +137,13 @@ def _trie_search(self, word, d, transducer=None,
def _precompute_euristics(self):
"""
- Предвычисляет будущие символы и стоимости операций с ними
- для h-эвристики
+ Precomputes future symbols and the costs of operations on them
+ for the h-heuristic
"""
if self.euristics is None:
return
- # вычисление минимальной стоимости операции,
- # приводящей к появлению ('+') или исчезновению ('-') данного символа
+ # calculation of the minimum cost of the operation,
+ # leading to the appearance ('+') or disappearance ('-') of this symbol
removal_costs = {a: np.inf for a in self.alphabet}
insertion_costs = {a: np.inf for a in self.alphabet}
if self.allow_spaces:
@@ -161,13 +161,13 @@ def _precompute_euristics(self):
insertion_cost = cost / len(low)
for a in low:
insertion_costs[a] = min(insertion_costs[a], insertion_cost)
- # предвычисление возможных будущих символов в узлах дерева
- # precompute_future_symbols(self.dictionary, self.euristics, self.allow_spaces)
- # предвычисление стоимостей потери символа в узлах дерева
+ # precomputation of possible future symbols in tree nodes
+ # precompute_future_symbols(self.dictionary, self.euristics, self.allow_spaces)
+ # precomputing the cost of symbol loss in tree nodes
self._absense_costs_by_node = _precompute_absense_costs(
self.dictionary, removal_costs, insertion_costs,
self.euristics, self.allow_spaces)
- # массив для сохранения эвристик
+ # array for storing heuristics
self._temporary_euristics = [dict() for i in range(len(self.dictionary))]
def _define_h_function(self):
@@ -178,35 +178,35 @@ def _define_h_function(self):
def _euristic_h_function(self, suffix, index):
"""
- Вычисление h-эвристики из работы Hulden,2009 для текущей вершины словаря
+ Calculation of the h-heuristic from Hulden, 2009 for the current vertex of the dictionary
- Аргументы:
+ Arguments:
----------
suffix : string
- непрочитанный суффикс входного слова
+ the unread suffix of the input word
index : int
- индекс текущего узла в словаре
+ index of the current node in the dictionary
- Возвращает:
+ Returns:
-----------
- cost : float
- оценка снизу для стоимости замены,
- приводящей к входному слову с суффиксом suffix,
- если прочитанный префикс слова без опечатки
- привёл в вершину с номером index
+ cost : float
+ a lower bound on the cost of a replacement
+ leading to an input word with the suffix suffix,
+ given that the prefix read so far of the word without a typo
+ led to the node with index index
"""
if self.euristics > 0:
suffix = suffix[:self.euristics]
- # кэширование результатов
+ # caching of results
index_temporary_euristics = self._temporary_euristics[index]
cost = index_temporary_euristics.get(suffix, None)
if cost is not None:
return cost
- # извлечение нужных данных из массивов
+ # extracting the necessary data from arrays
absense_costs = self._absense_costs_by_node[index]
data = self.dictionary.data[index]
costs = np.zeros(dtype=np.float64, shape=(self.euristics,))
- # costs[j] --- оценка штрафа при предпросмотре вперёд на j символов
+ # costs[j] --- penalty estimate when looking ahead by j characters
for i, a in enumerate(suffix):
costs[i:] += absense_costs[a][i:]
cost = max(costs)
@@ -229,28 +229,28 @@ def _minimal_replacement_cost(self, first, second):
def _precompute_absense_costs(dictionary, removal_costs, insertion_costs, n,
allow_spaces=False):
"""
- Вычисляет минимальную стоимость появления нового символа в узлах словаря
- в соответствии со штрафами из costs
+ Calculates the minimum cost of the appearance of a new character in the dictionary nodes
+ according to the penalties from costs
- Аргументы:
+ Arguments:
---------------
- dictionary : Trie
- словарь, хранящийся в виде ациклического автомата
+ dictionary : Trie
+ a dictionary stored as an acyclic automaton
removal_costs : dict
- штрафы за удаление символов
+ penalties for deleting characters
insertion_costs : dict
- штрафы за вставку символов
+ penalties for inserting characters
n : int
- глубина ``заглядывания вперёд'' в словаре
+ the depth of ``looking ahead'' in the dictionary
- Возвращает
+ Returns
---------------
- answer : list of dicts, len(answer)=len(dictionary)
- answer[i][a][j] равно минимальному штрафу за появление символа a
- в j-ой позиции в вершине с номером i
+ answer : list of dicts, len(answer)=len(dictionary)
+ answer[i][a][j] is equal to the minimum penalty for the appearance of the symbol a
+ in the j-th position at the vertex with the number i
"""
answer = [dict() for node in dictionary.data]
if n == 0:
@@ -259,7 +259,7 @@ def _precompute_absense_costs(dictionary, removal_costs, insertion_costs, n,
if allow_spaces:
curr_alphabet += [' ']
for l, (costs_in_node, node) in enumerate(zip(answer, dictionary.data)):
- # определение минимальной стоимости удаления символов
+ # determining the minimum cost of deleting characters
curr_node_removal_costs = np.empty(dtype=np.float64, shape=(n,))
if len(node[0]) > 0:
curr_node_removal_costs[0] = min(removal_costs[symbol] for symbol in node[0])
@@ -271,7 +271,7 @@ def _precompute_absense_costs(dictionary, removal_costs, insertion_costs, n,
curr_node_removal_costs[j] = min(curr_node_removal_costs[j - 1], curr_cost)
else:
curr_node_removal_costs[:] = np.inf
- # определение минимальной стоимости вставки
+ # determining the minimum insertion cost
for a in curr_alphabet:
curr_symbol_costs = np.empty(dtype=np.float64, shape=(n,))
curr_symbol_costs.fill(insertion_costs[a])
@@ -286,21 +286,21 @@ def _precompute_absense_costs(dictionary, removal_costs, insertion_costs, n,
class SegmentTransducer:
"""
- Класс, реализующий взвешенный конечный преобразователь,
- осуществляющий замены из заданного списка операций
+ A class implementing a weighted finite-state transducer
+ that performs substitutions from a given list of operations
- Аргументы:
+ Arguments:
----------
alphabet : list
- алфавит
+ alphabet
- operation_costs : dict or None(optional, default=None)
- словарь вида {(up,low) : cost}
+ operation_costs : dict or None (optional, default=None)
+ dictionary of the form {(up,low) : cost}
allow_spaces : bool(optional, default=False)
- разрешены ли элементы трансдукции, содержащие пробел
- (используется только если явно не заданы operation costs
- и они равны значению по умолчанию)
+ whether transduction elements containing a space are allowed
+ (used only if operation_costs is not specified explicitly
+ and is left at its default value)
"""
@@ -316,26 +316,26 @@ def __init__(self, alphabet, operation_costs=None, allow_spaces=False):
self._make_maximal_key_lengths()
# self.maximal_value_lengths = {}
# for up, probs in self.operation_costs.items():
- # СЛИШКОМ МНОГО ВЫЗОВОВ, НАДО КАК-ТО ЗАПОМНИТЬ
- # МАКСИМАЛЬНЫЕ ДЛИНЫ КЛЮЧЕЙ ПРИ ОБРАЩЕНИИ
+ # TOO MANY CALLS; THE MAXIMUM KEY LENGTHS
+ # SHOULD SOMEHOW BE CACHED FOR LOOKUPS
# max_low_length = max(len(low) for low in probs) if (len(probs) > 0) else -1
# self.maximal_value_lengths[up] = self.maximal_key_length
def get_operation_cost(self, up, low):
"""
- Возвращает стоимость элементарной трансдукции up->low
- или np.inf, если такой элементарной трансдукции нет
+ Returns the cost of the elementary transduction up->low,
+ or np.inf if there is no such elementary transduction
- Аргументы:
+ Arguments:
----------
up, low : string
- элементы элементарной трансдукции
+ elements of elementary transduction
- Возвращает:
+ Returns:
-----------
- cost : float
- стоимость элементарной трансдукции up->low
- (np.inf, если такая трансдукция отсутствует)
+ cost : float
+ the cost of the elementary transduction up->low
+ (np.inf if there is no such transduction)
"""
up_costs = self.operation_costs.get(up, None)
if up_costs is None:
@@ -345,9 +345,9 @@ def get_operation_cost(self, up, low):
def inverse(self):
"""
- Строит пробразователь, задающий обратное конечное преобразование
+ Builds a transducer performing the inverse finite-state transformation
"""
- # УПРОСТИТЬ ОБРАЩЕНИЕ!!!
+ # SIMPLIFY THE INVERSION!!!
inversed_transducer = SegmentTransducer(self.alphabet, operation_costs=dict())
inversed_transducer.operation_costs = self._reversed_operation_costs
inversed_transducer._reversed_operation_costs = self.operation_costs
@@ -359,29 +359,29 @@ def inverse(self):
def distance(self, first, second, return_transduction=False):
"""
- Вычисляет трансдукцию минимальной стоимости,
- отображающую first в second
+ Calculates the minimum-cost transduction
+ mapping first to second
- Аргументы:
+ Arguments:
-----------
first : string
second : string
- Верхний и нижний элементы трансдукции
+ Upper and lower transduction elements
return_transduction : bool (optional, default=False)
- следует ли возвращать трансдукцию минимального веса
- (см. возвращаемое значение)
+ whether to return the minimum-weight transduction
+ (see the return value)
- Возвращает:
+ Returns:
-----------
- (final_cost, transductions) : tuple(float, list)
- если return_transduction=True, то возвращает
- минимальную стоимость трансдукции, переводящей first в second
- и список трансдукций с данной стоимостью
-
- final_cost : float
- если return_transduction=False, то возвращает
- минимальную стоимость трансдукции, переводящей first в second
+ (final_cost, transductions) : tuple(float, list)
+ if return_transduction=True, returns
+ the minimum cost of a transduction converting first to second
+ and the list of transductions with that cost
+
+ final_cost : float
+ if return_transduction=False, returns
+ the minimum cost of a transduction converting first to second
"""
if return_transduction:
add_pred = (lambda x, y: (y == np.inf or x < y))
@@ -404,13 +404,13 @@ def distance(self, first, second, return_transduction=False):
def transduce(self, first, second, threshold):
"""
- Возвращает все трансдукции, переводящие first в second,
- чья стоимость не превышает threshold
+ Returns all transductions that translate first to second
+ and whose cost does not exceed threshold
- Возвращает:
+ Returns:
----------
result : list
- список вида [(трансдукция, стоимость)]
+ a list of the form [(transduction, cost)]
"""
add_pred = (lambda x, y: x <= threshold)
clear_pred = (lambda x, y: False)
@@ -424,20 +424,20 @@ def transduce(self, first, second, threshold):
def lower_transductions(self, word, max_cost, return_cost=True):
"""
- Возвращает все трансдукции с верхним элементом word,
- чья стоимость не превышает max_cost
+ Returns all transductions whose upper element is word
+ and whose cost does not exceed max_cost
- ` Возвращает:
+ Returns:
----------
result : list
- список вида [(трансдукция, стоимость)], если return_cost=True
- список трансдукций, если return_cost=False
- список отсортирован в порядке возрастания стоимости трансдукции
+ a list of the form [(transduction, cost)] if return_cost=True
+ or a list of transductions if return_cost=False;
+ the list is sorted in ascending order of transduction cost
"""
prefixes = [[] for i in range(len(word) + 1)]
prefixes[0].append(((), 0.0))
for pos in range(len(prefixes)):
- # вставки
+ # insertions
prefixes[pos] = self._perform_insertions(prefixes[pos], max_cost)
max_upperside_length = min(len(word) - pos, self.max_up_length)
for upperside_length in range(1, max_upperside_length + 1):
@@ -479,40 +479,40 @@ def upper_transductions(self, word, max_cost, return_cost=True):
def _fill_levenshtein_table(self, first, second, update_func, add_pred, clear_pred,
threshold=None):
"""
- Функция, динамически заполняющая таблицу costs стоимости трансдукций,
- costs[i][j] --- минимальная стоимость трансдукции,
- переводящей first[:i] в second[:j]
+ A function that fills the costs table by dynamic programming, where
+ costs[i][j] --- the minimum cost of a transduction
+ translating first[:i] to second[:j]
- Аргументы:
+ Arguments:
----------
first, second : string
- Верхний и нижний элементы трансдукции
+ Upper and lower transduction elements
update_func : callable, float*float -> bool
- update_func(x, y) возвращает новое значение в ячейке таблицы costs,
- если старое значение --- y, а потенциально новое значение --- x
- везде update_func = min
+ update_func(x, y) returns the new value of a cell in the costs table
+ when the old value is y and the candidate new value is x;
+ update_func = min everywhere
add_pred : callable : float*float -> bool
- add_pred(x, y) возвращает, производится ли добавление
- нового элемента p стоимости x в ячейку backtraces[i][j]
- в зависимости от значения costs[i][j]=y и текущей стоимости x
+ add_pred(x, y) returns whether a new element p of cost x
+ is added to the cell backtraces[i][j],
+ depending on the value costs[i][j]=y and the current cost x
clear_pred : callable : float*float -> bool
- clear_pred(x, y) возвращает, производится ли очистка
- ячейки backtraces[i][j] в зависимости от значения costs[i][j]=y
- и текущей стоимости x элемента p, добавляемого в эту ячейку
+ clear_pred(x, y) returns whether the cell backtraces[i][j]
+ is cleared, depending on the value costs[i][j]=y
+ and the current cost x of the element p being added to this cell
- Возвращает:
+ Returns:
-----------
costs : array, dtype=float, shape=(len(first)+1, len(second)+1)
- массив, в ячейке с индексами i, j которого хранится
- минимальная стоимость трансдукции, переводящей first[:i] в second[:j]
+ an array whose cell with indices i, j stores
+ the minimum cost of a transduction translating first[:i] to second[:j]
backtraces : array, dtype=list, shape=(len(first)+1, len(second)+1)
- массив, в ячейке с индексами i, j которого хранятся
- обратные ссылки на предыдущую ячейку в оптимальной трансдукции,
- приводящей в ячейку backtraces[i][j]
+ an array whose cell with indices i, j stores
+ backlinks to the previous cell in an optimal transduction
+ leading to the cell backtraces[i][j]
"""
m, n = len(first), len(second)
- # если threshold=None, то в качестве порога берётся удвоенная стоимость
- # трансдукции, отображающей символы на одинаковых позициях друг в друга
+ # if threshold=None, the threshold is set to twice the cost of
+ # the transduction that maps symbols at identical positions to each other
if threshold is None:
threshold = 0.0
for a, b in zip(first, second):
@@ -524,7 +524,7 @@ def _fill_levenshtein_table(self, first, second, update_func, add_pred, clear_pr
for b in second[m:]:
threshold += self.get_operation_cost('', b)
threshold *= 2
- # инициализация возвращаемых массивов
+ # initialization of returned arrays
costs = np.zeros(shape=(m + 1, n + 1), dtype=np.float64)
costs[:] = np.inf
backtraces = [None] * (m + 1)
@@ -542,7 +542,7 @@ def _fill_levenshtein_table(self, first, second, update_func, add_pred, clear_pr
if costs[i][j] > threshold:
continue
if len(backtraces[i][j]) == 0 and i + j > 0:
- continue # не нашлось обратных ссылок
+ continue # no backlinks found
for j_right in range((j if i_right > i else j + 1),
min(j + max_low_length, n) + 1):
low = second[j: j_right]
@@ -560,8 +560,8 @@ def _fill_levenshtein_table(self, first, second, update_func, add_pred, clear_pr
def _make_reversed_operation_costs(self):
"""
- Заполняет массив _reversed_operation_costs
- на основе имеющегося массива operation_costs
+ Populates the _reversed_operation_costs array
+ based on the existing operation_costs array
"""
_reversed_operation_costs = dict()
for up, costs in self.operation_costs.items():
@@ -573,10 +573,10 @@ def _make_reversed_operation_costs(self):
def _make_maximal_key_lengths(self):
"""
- Вычисляет максимальную длину элемента low
- в элементарной трансдукции (up, low) для каждого up
- и максимальную длину элемента up
- в элементарной трансдукции (up, low) для каждого low
+ Calculates the maximum length of the element low
+ in the elementary transduction (up, low) for each up
+ and the maximum length of the element up
+ in the elementary transduction (up, low) for each low
"""
self.max_up_length = \
(max(len(up) for up in self.operation_costs)
@@ -594,27 +594,27 @@ def _make_maximal_key_lengths(self):
def _backtraces_to_transductions(self, first, second, backtraces, threshold, return_cost=False):
"""
- Восстанавливает трансдукции по таблице обратных ссылок
+ Restores transductions from the backlink table
- Аргументы:
+ Arguments:
----------
first, second : string
- верхние и нижние элементы трансдукции
+ upper and lower transduction elements
backtraces : array-like, dtype=list, shape=(len(first)+1, len(second)+1)
- таблица обратных ссылок
+ backlink table
threshold : float
- порог для отсева трансдукций,
- возвращаются только трансдукции стоимостью <= threshold
+ threshold for filtering out transductions;
+ only transductions with cost <= threshold are returned
return_cost : bool (optional, default=False)
- если True, то вместе с трансдукциями возвращается их стоимость
+ if True, then their cost is returned along with the transductions
- Возвращает:
+ Returns:
-----------
result : list
- список вида [(трансдукция, стоимость)], если return_cost=True
- и вида [трансдукция], если return_cost=False,
- содержащий все трансдукции, переводящие first в second,
- чья стоимость не превышает threshold
+ a list of the form [(transduction, cost)] if return_cost=True
+ and of the form [transduction] if return_cost=False,
+ containing all the transductions that translate first to second,
+ whose cost does not exceed threshold
"""
m, n = len(first), len(second)
agenda = [None] * (m + 1)
@@ -640,20 +640,20 @@ def _backtraces_to_transductions(self, first, second, backtraces, threshold, ret
def _perform_insertions(self, initial, max_cost):
"""
- возвращает все трансдукции стоимости <= max_cost,
- которые можно получить из элементов initial
+ returns all transductions with cost <= max_cost
+ that can be obtained from the elements of initial
- Аргументы:
+ Arguments:
----------
initial : list of tuples
- список исходных трансдукций вида [(трансдукция, стоимость)]
- max_cost : float
- максимальная стоимость трансдукции
+ list of initial transductions of the form [(transduction, cost)]
+ max_cost : float
+ the maximum cost of a transduction
- Возвращает:
+ Returns:
-----------
final : list of tuples
- финальный список трансдукций вида [(трансдукция, стоимость)]
+ the final list of transductions of the form [(transduction, cost)]
"""
queue = list(initial)
final = initial
@@ -681,10 +681,10 @@ def _make_default_operation_costs(self, allow_spaces=False):
if allow_spaces:
current_costs[" "] = 1.0
self.operation_costs[a] = current_costs
- # транспозиции
+ # transpositions
for a, b in itertools.permutations(self.alphabet, 2):
self.operation_costs[a + b] = {b + a: 1.0}
- # пробелы
+ # spaces
if allow_spaces:
self.operation_costs[" "] = {c: 1.0 for c in self.alphabet}
self.operation_costs[" "][""] = 1.0
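
The docstrings translated above describe a dynamic programme over a costs table and an A*-style agenda ordered by cost = g + h. As a compact reference, a sketch of that table for plain unit-cost Levenshtein distance (the class generalizes this to arbitrary weighted multi-character operations):

    import numpy as np

    def levenshtein_costs(first: str, second: str) -> np.ndarray:
        """costs[i][j] is the minimum cost of translating first[:i] to second[:j]."""
        m, n = len(first), len(second)
        costs = np.zeros((m + 1, n + 1))
        costs[:, 0] = np.arange(m + 1)  # deletions only
        costs[0, :] = np.arange(n + 1)  # insertions only
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                sub = costs[i - 1, j - 1] + (first[i - 1] != second[j - 1])
                costs[i, j] = min(sub, costs[i - 1, j] + 1, costs[i, j - 1] + 1)
        return costs

    assert levenshtein_costs("mother", "mohter")[-1, -1] == 2
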
diff --git a/deeppavlov/models/tokenizers/jieba_tokenizer.py b/deeppavlov/models/tokenizers/jieba_tokenizer.py
deleted file mode 100644
index b56b19ec00..0000000000
--- a/deeppavlov/models/tokenizers/jieba_tokenizer.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Union
-
-import jieba
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.models.component import Component
-
-
-@register("jieba_tokenizer")
-class JiebaTokenizer(Component):
- """
- Tokenizes chinese text into tokens
-
- Doesn't have any parameters.
- """
-
- def __init__(self, **kwargs) -> None:
- jieba.initialize()
- pass
-
- @staticmethod
- def tokenize_str(text: str) -> str:
- """
- Tokenize a single string
-
- Args:
- text: a string to tokenize
-
- Returns:
- tokenized string
- """
- return ' '.join(jieba.cut(text))
-
- def __call__(self, batch: Union[List[str], List[List[str]]]) -> Union[List[str], List[List[str]]]:
- """
- Tokenize either list of strings or list of list of strings
-
- Args:
- batch a list of either strings or list of strings
-
- Returns:
- tokenized strings in the given format
- """
-
- if isinstance(batch[0], str):
- batch_tokenized = [JiebaTokenizer.tokenize_str(s) for s in batch]
- elif isinstance(batch[0], list):
- for lst in batch:
- batch_tokenized = [self(lst) for lst in batch]
- else:
- raise NotImplementedError('Not implemented for types other than'
- ' str or list')
-
- return batch_tokenized
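
The deleted tokenizer above wraps jieba's word segmentation; its whole job is joining the `jieba.cut` output with spaces so downstream components can split on whitespace. A two-line reminder of that behavior (the segmentation shown is illustrative):

    import jieba

    text = "我来到北京清华大学"
    print(' '.join(jieba.cut(text)))  # e.g. "我 来到 北京 清华大学"
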
diff --git a/deeppavlov/models/torch_bert/torch_bert_ranker.py b/deeppavlov/models/torch_bert/torch_bert_ranker.py
index 8990e8ef0e..aa4bae5019 100644
--- a/deeppavlov/models/torch_bert/torch_bert_ranker.py
+++ b/deeppavlov/models/torch_bert/torch_bert_ranker.py
@@ -47,7 +47,7 @@ class TorchBertRankerModel(TorchModel):
e.g. {'lr': 0.1, 'weight_decay': 0.001, 'momentum': 0.9}
"""
- def __init__(self, pretrained_bert: str,
+ def __init__(self, pretrained_bert: str = None,
bert_config_file: Optional[str] = None,
n_classes: int = 2,
return_probas: bool = True,
@@ -97,7 +97,7 @@ def train_on_batch(self, features_li: List[List[InputFeatures]], y: Union[List[i
self.optimizer.zero_grad()
loss, logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks,
- labels=b_labels)
+ labels=b_labels, return_dict=False)
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
@@ -162,13 +162,18 @@ def load(self, fname=None):
if self.pretrained_bert:
log.info(f"From pretrained {self.pretrained_bert}.")
+ if Path(expand_path(self.pretrained_bert)).exists():
+ self.pretrained_bert = str(expand_path(self.pretrained_bert))
config = AutoConfig.from_pretrained(self.pretrained_bert,
# num_labels=self.n_classes,
output_attentions=False,
output_hidden_states=False)
+
self.model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_bert, config=config)
+ # TODO: make better exception handling here and at
+ # deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel.load
try:
hidden_size = self.model.classifier.out_proj.in_features
@@ -178,7 +183,7 @@ def load(self, fname=None):
self.model.classifier.out_proj.out_features = self.n_classes
self.model.num_labels = self.n_classes
- except torch.nn.modules.module.ModuleAttributeError:
+ except AttributeError:
hidden_size = self.model.classifier.in_features
if self.n_classes != self.model.num_labels:
@@ -188,13 +193,10 @@ def load(self, fname=None):
self.model.num_labels = self.n_classes
- elif self.bert_config_file and Path(self.bert_config_file).is_file():
- self.bert_config = AutoConfig.from_json_file(str(expand_path(self.bert_config_file)))
- if self.attention_probs_keep_prob is not None:
- self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
- if self.hidden_keep_prob is not None:
- self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
+ elif self.bert_config_file and expand_path(self.bert_config_file).is_file():
+ self.bert_config = AutoConfig.from_pretrained(str(expand_path(self.bert_config_file)))
self.model = AutoModelForSequenceClassification.from_config(config=self.bert_config)
+
else:
raise ConfigError("No pre-trained BERT model is given.")
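
The `return_dict=False` change above matters because transformers v4 models return a `ModelOutput` object by default, while this code unpacks a `(loss, logits)` tuple. A hedged illustration of the two calling conventions, reusing the tensor names from the hunk:

    # v4 default: outputs are accessed as attributes
    outputs = self.model(b_input_ids, attention_mask=b_input_masks, labels=b_labels)
    loss, logits = outputs.loss, outputs.logits
    # with return_dict=False the call returns a plain tuple, as the code above expects
    loss, logits = self.model(b_input_ids, attention_mask=b_input_masks,
                              labels=b_labels, return_dict=False)
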
diff --git a/deeppavlov/models/torch_bert/torch_transformers_classifier.py b/deeppavlov/models/torch_bert/torch_transformers_classifier.py
index 3bf8077518..06029242d2 100644
--- a/deeppavlov/models/torch_bert/torch_transformers_classifier.py
+++ b/deeppavlov/models/torch_bert/torch_transformers_classifier.py
@@ -210,7 +210,8 @@ def load(self, fname=None):
else:
self.model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_bert, config=config)
- # TODO need a better solution here
+ # TODO need a better solution here and at
+ # deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel.load
try:
hidden_size = self.model.classifier.out_proj.in_features
@@ -221,7 +222,7 @@ def load(self, fname=None):
self.model.classifier.out_proj.out_features = self.n_classes
self.model.num_labels = self.n_classes
- except torch.nn.modules.module.ModuleAttributeError:
+ except AttributeError:
hidden_size = self.model.classifier.in_features
if self.n_classes != self.model.num_labels:
diff --git a/deeppavlov/requirements/hdt.txt b/deeppavlov/requirements/hdt.txt
index ffd5cad4b0..02614c2ed0 100644
--- a/deeppavlov/requirements/hdt.txt
+++ b/deeppavlov/requirements/hdt.txt
@@ -1 +1,3 @@
+# TODO: hdt lists pybind11 in its requirements but imports it in setup.py before installation. Should be fixed.
+# fasttext==0.9.1 installs pybind11==2.9.1. If pybind11==2.2.4 is then installed twice, the second installation fails with "Cannot uninstall pybind11 2.9.1, RECORD file not found".
hdt==2.3
diff --git a/deeppavlov/requirements/jieba.txt b/deeppavlov/requirements/jieba.txt
deleted file mode 100644
index dc9a2d185a..0000000000
--- a/deeppavlov/requirements/jieba.txt
+++ /dev/null
@@ -1 +0,0 @@
-jieba==0.42.1
diff --git a/deeppavlov/requirements/kenlm.txt b/deeppavlov/requirements/kenlm.txt
index 2210ba6aa5..9d57c24888 100644
--- a/deeppavlov/requirements/kenlm.txt
+++ b/deeppavlov/requirements/kenlm.txt
@@ -1 +1 @@
-git+https://github.com/kpu/kenlm.git@96d303cfb1a0c21b8f060dbad640d7ab301c019a#egg=kenlm
\ No newline at end of file
+pypi-kenlm==0.1.20210121
diff --git a/deeppavlov/requirements/nemo-asr.txt b/deeppavlov/requirements/nemo-asr.txt
deleted file mode 100644
index 1a072b36b7..0000000000
--- a/deeppavlov/requirements/nemo-asr.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-frozendict==1.2
-kaldi-io==0.9.4
-inflect==4.1.0
-unidecode==1.1.1
-librosa==0.7.2
-torch-stft==0.1.4
-numba==0.48
\ No newline at end of file
diff --git a/deeppavlov/requirements/nemo-tts.txt b/deeppavlov/requirements/nemo-tts.txt
deleted file mode 100644
index a0f3139b34..0000000000
--- a/deeppavlov/requirements/nemo-tts.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-matplotlib==3.2.1
-sentencepiece==0.1.85
-youtokentome==1.0.6
\ No newline at end of file
diff --git a/deeppavlov/requirements/nemo.txt b/deeppavlov/requirements/nemo.txt
deleted file mode 100644
index e6f8ff402a..0000000000
--- a/deeppavlov/requirements/nemo.txt
+++ /dev/null
@@ -1 +0,0 @@
-nemo-toolkit==0.10.0
\ No newline at end of file
diff --git a/deeppavlov/requirements/pytorch14.txt b/deeppavlov/requirements/pytorch14.txt
deleted file mode 100644
index f940e921a8..0000000000
--- a/deeppavlov/requirements/pytorch14.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-torch==1.4.0
-torchvision==0.5.0
\ No newline at end of file
diff --git a/deeppavlov/requirements/pytorch16.txt b/deeppavlov/requirements/pytorch16.txt
index 0d41debc01..8d28b32b61 100644
--- a/deeppavlov/requirements/pytorch16.txt
+++ b/deeppavlov/requirements/pytorch16.txt
@@ -1,2 +1,2 @@
-torch==1.6.0
-torchvision==0.7.0
\ No newline at end of file
+torch>=1.6.0,<=1.8.0
+torchvision>=0.7.0,<=0.9.0
\ No newline at end of file
diff --git a/deeppavlov/requirements/spacy.txt b/deeppavlov/requirements/spacy.txt
index 9693ba97a9..6523a4c4be 100644
--- a/deeppavlov/requirements/spacy.txt
+++ b/deeppavlov/requirements/spacy.txt
@@ -1 +1 @@
-spacy==2.2.3
\ No newline at end of file
+spacy>=2.2.3,<=2.3.3
diff --git a/deeppavlov/requirements/transformers28.txt b/deeppavlov/requirements/transformers28.txt
deleted file mode 100644
index ec122c087f..0000000000
--- a/deeppavlov/requirements/transformers28.txt
+++ /dev/null
@@ -1 +0,0 @@
-transformers==2.8.0
\ No newline at end of file
diff --git a/deeppavlov/requirements/whapi.txt b/deeppavlov/requirements/whapi.txt
index 8637c13b43..fb55598de1 100644
--- a/deeppavlov/requirements/whapi.txt
+++ b/deeppavlov/requirements/whapi.txt
@@ -1 +1 @@
-whapi==0.6.2
\ No newline at end of file
+whapi<=0.6.2
\ No newline at end of file
diff --git a/deeppavlov/requirements/xeger.txt b/deeppavlov/requirements/xeger.txt
index d415ec4b7e..0a55e3a7e7 100644
--- a/deeppavlov/requirements/xeger.txt
+++ b/deeppavlov/requirements/xeger.txt
@@ -1 +1 @@
-xeger==0.3.5
+xeger<=0.3.5
diff --git a/docs/_static/social/f_logo_RGB-Blue_58.png b/docs/_static/social/f_logo_RGB-Blue_58.png
deleted file mode 100644
index 743ec2d28b..0000000000
Binary files a/docs/_static/social/f_logo_RGB-Blue_58.png and /dev/null differ
diff --git a/docs/_templates/footer.html b/docs/_templates/footer.html
index 30f37e7371..cd7ef617dc 100644
--- a/docs/_templates/footer.html
+++ b/docs/_templates/footer.html
@@ -33,7 +33,6 @@
Problem? Ask a Question or try our Demo
-
diff --git a/docs/apiref/models/bert.rst b/docs/apiref/models/bert.rst
index 5ebcc6552d..c2e82f6b4d 100644
--- a/docs/apiref/models/bert.rst
+++ b/docs/apiref/models/bert.rst
@@ -12,18 +12,6 @@ deeppavlov.models.bert
.. automethod:: __call__
-.. autoclass:: deeppavlov.models.preprocessors.bert_preprocessor.BertRankerPreprocessor
-
- .. automethod:: __call__
-
-.. autoclass:: deeppavlov.models.preprocessors.bert_preprocessor.BertSepRankerPreprocessor
-
- .. automethod:: __call__
-
-.. autoclass:: deeppavlov.models.preprocessors.bert_preprocessor.BertSepRankerPredictorPreprocessor
-
- .. automethod:: __call__
-
.. autoclass:: deeppavlov.models.bert.bert_classifier.BertClassifierModel
.. automethod:: __call__
@@ -47,17 +35,3 @@ deeppavlov.models.bert
.. autoclass:: deeppavlov.models.bert.bert_squad.BertSQuADInferModel
.. automethod:: __call__
-
-.. autoclass:: deeppavlov.models.bert.bert_ranker.BertRankerModel
-
- .. automethod:: __call__
- .. automethod:: train_on_batch
-
-.. autoclass:: deeppavlov.models.bert.bert_ranker.BertSepRankerModel
-
- .. automethod:: __call__
- .. automethod:: train_on_batch
-
-.. autoclass:: deeppavlov.models.bert.bert_ranker.BertSepRankerPredictor
-
- .. automethod:: __call__
\ No newline at end of file
diff --git a/docs/apiref/models/embedders.rst b/docs/apiref/models/embedders.rst
index b004dfa006..bc80dddeb6 100644
--- a/docs/apiref/models/embedders.rst
+++ b/docs/apiref/models/embedders.rst
@@ -8,11 +8,6 @@ deeppavlov.models.embedders
.. automethod:: __call__
.. automethod:: __iter__
-.. autoclass:: deeppavlov.models.embedders.elmo_embedder.ELMoEmbedder
-
- .. automethod:: __call__
- .. automethod:: __iter__
-
.. autoclass:: deeppavlov.models.embedders.glove_embedder.GloVeEmbedder
.. automethod:: __call__
@@ -24,4 +19,4 @@ deeppavlov.models.embedders
.. autoclass:: deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder
- .. automethod:: __call__
\ No newline at end of file
+ .. automethod:: __call__
diff --git a/docs/apiref/models/morpho_tagger.rst b/docs/apiref/models/morpho_tagger.rst
index 8e73a7a9ce..dcfac248fa 100644
--- a/docs/apiref/models/morpho_tagger.rst
+++ b/docs/apiref/models/morpho_tagger.rst
@@ -1,11 +1,6 @@
deeppavlov.models.morpho_tagger
===============================
-.. autoclass:: deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger
- :members:
-
- .. automethod:: __call__
-
.. autofunction:: deeppavlov.models.morpho_tagger.common.predict_with_model
.. autoclass:: deeppavlov.models.morpho_tagger.lemmatizer.UDPymorphyLemmatizer
diff --git a/docs/apiref/models/nemo.rst b/docs/apiref/models/nemo.rst
deleted file mode 100644
index 27c2054336..0000000000
--- a/docs/apiref/models/nemo.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-deeppavlov.models.nemo
-======================
-
-.. autoclass:: deeppavlov.models.nemo.asr.NeMoASR
-
- .. automethod:: __init__
- .. automethod:: __call__
-
-.. autoclass:: deeppavlov.models.nemo.tts.NeMoTTS
-
- .. automethod:: __init__
- .. automethod:: __call__
-
-.. autofunction:: deeppavlov.models.nemo.common.ascii_to_bytes_io
-
-.. autofunction:: deeppavlov.models.nemo.common.bytes_io_to_ascii
-
-.. autoclass:: deeppavlov.models.nemo.asr.AudioInferDataLayer
-
- .. automethod:: __init__
-
-.. autoclass:: deeppavlov.models.nemo.tts.TextDataLayer
-
- .. automethod:: __init__
-
-.. autoclass:: deeppavlov.models.nemo.vocoder.WaveGlow
-
- .. automethod:: __init__
-
-.. autoclass:: deeppavlov.models.nemo.vocoder.GriffinLim
-
- .. automethod:: __init__
diff --git a/docs/apiref/models/preprocessors.rst b/docs/apiref/models/preprocessors.rst
index 5561f511db..14a1285513 100644
--- a/docs/apiref/models/preprocessors.rst
+++ b/docs/apiref/models/preprocessors.rst
@@ -25,8 +25,6 @@ deeppavlov.models.preprocessors
.. autoclass:: deeppavlov.models.preprocessors.sanitizer.Sanitizer
-.. autoclass:: deeppavlov.models.preprocessors.siamese_preprocessor.SiamesePreprocessor
-
.. autofunction:: deeppavlov.models.preprocessors.str_lower.str_lower
.. autoclass:: deeppavlov.models.preprocessors.str_token_reverser.StrTokenReverser
diff --git a/docs/apiref/models/ranking.rst b/docs/apiref/models/ranking.rst
index e9331289af..504452ed4c 100644
--- a/docs/apiref/models/ranking.rst
+++ b/docs/apiref/models/ranking.rst
@@ -5,14 +5,8 @@ Ranking classes.
.. automodule:: deeppavlov.models.ranking.siamese_model
-.. autoclass:: deeppavlov.models.ranking.bilstm_siamese_network.BiLSTMSiameseNetwork
-
-.. autoclass:: deeppavlov.models.ranking.bilstm_gru_siamese_network.BiLSTMGRUSiameseNetwork
-
.. autoclass:: deeppavlov.models.ranking.keras_siamese_model.KerasSiameseModel
-.. autoclass:: deeppavlov.models.ranking.mpm_siamese_network.MPMSiameseNetwork
-
.. autoclass:: deeppavlov.models.ranking.siamese_model.SiameseModel
.. automethod:: load
@@ -20,7 +14,3 @@ Ranking classes.
.. automethod:: train_on_batch
.. automethod:: __call__
.. automethod:: reset
-
-.. autoclass:: deeppavlov.models.ranking.siamese_predictor.SiamesePredictor
-
-
diff --git a/docs/conf.py b/docs/conf.py
index b3a4f11237..bf2c5039b1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -190,10 +190,10 @@
# -- Extension configuration -------------------------------------------------
-autodoc_mock_imports = ['bert_dp', 'bs4', 'faiss', 'fastText', 'fasttext', 'gensim', 'hdt', 'kenlm', 'librosa',
- 'lxml', 'nemo', 'nemo_asr', 'nemo_tts', 'nltk', 'opt_einsum', 'rapidfuzz', 'rasa',
- 'russian_tagsets', 'sacremoses', 'sortedcontainers', 'spacy', 'tensorflow', 'tensorflow_hub',
- 'torch', 'transformers', 'udapi', 'ufal_udpipe', 'whapi', 'xeger']
+autodoc_mock_imports = ['bert_dp', 'bs4', 'faiss', 'fastText', 'fasttext', 'gensim', 'hdt', 'kenlm', 'librosa', 'lxml',
+ 'nltk', 'opt_einsum', 'rapidfuzz', 'rasa', 'russian_tagsets', 'sacremoses', 'sortedcontainers',
+ 'spacy', 'tensorflow', 'tensorflow_hub', 'torch', 'transformers', 'udapi', 'ufal_udpipe',
+ 'whapi', 'xeger']
extlinks = {
'config': (f'https://github.com/deepmipt/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None)
diff --git a/docs/features/models/bert.rst b/docs/features/models/bert.rst
index 9e68437742..34e6c3e2f3 100644
--- a/docs/features/models/bert.rst
+++ b/docs/features/models/bert.rst
@@ -22,19 +22,19 @@ There are several pre-trained BERT models released by Google Research, more deta
We have trained BERT-base model for other languages and domains:
- RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__,
- `[deeppavlov_pytorch] `__
+ `[deeppavlov_pytorch] `__
- SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__,
- `[deeppavlov_pytorch] `__
+ `[deeppavlov_pytorch] `__
- Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] `__,
- `[deeppavlov_pytorch] `__
+ `[deeppavlov_pytorch] `__
- Conversational RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__,
- `[deeppavlov_pytorch] `__
+ `[deeppavlov_pytorch] `__
- Conversational DistilRuBERT, Russian, cased, 6-layer, 768-hidden, 12-heads, 135.4M parameters: `[deeppavlov_pytorch] `__
- Conversational DistilRuBERT-tiny, Russian, cased, 2-layer, 768-hidden, 12-heads, 107M parameters: `[deeppavlov_pytorch] `__
- Sentence Multilingual BERT, 101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__,
- `[deeppavlov_pytorch] `__
+ `[deeppavlov_pytorch] `__
- Sentence RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__,
- `[deeppavlov_pytorch] `__
+ `[deeppavlov_pytorch] `__
The ``deeppavlov_pytorch`` models are designed to be run with the `HuggingFace's Transformers `__ library.
@@ -52,7 +52,7 @@ English cased version of BERT-base as initialization for English Conversational
Conversational RuBERT was trained on OpenSubtitles [5]_, Dirty, Pikabu, and Social Media segment of Taiga corpus [8]_.
We assembled new vocabulary for Conversational RuBERT model on this data and initialized model with RuBERT.
-Conversational DistilRuBERT (6 transformer layers) and DistilRuBERT-tiny (2 transformer layers) were trained on the same data as Conversational RuBERT and highly inspired by DistilBERT [13]_. Namely, Distil* models (students) used pretrained Conversational RuBERT as teacher and linear combination of the following losses:
+Conversational DistilRuBERT (6 transformer layers) and DistilRuBERT-tiny (2 transformer layers) were trained on the same data as Conversational RuBERT and were strongly inspired by DistilBERT [3]_. Namely, the Distil* models (students) used the pretrained Conversational RuBERT as a teacher and a linear combination of the following losses:
1. Masked language modeling loss (between student output logits for tokens and its true labels)
2. Kullback-Leibler divergence (between student and teacher output logits)
@@ -92,8 +92,8 @@ you can use or modify a :config:`BERT embedder configuration `
-and :config:`NER Ontonotes ` configuration files.
+Examples of using these embeddings in model training pipelines can be found in the
+:config:`NER Ontonotes ` configuration file.
BERT for Classification
@@ -161,25 +161,6 @@ and end position in a given context. :class:`~deeppavlov.models.bert.bert_squad.
transformations to predict probability that current subtoken is start/end position of an answer. For details check
:doc:`Context Question Answering documentation page `.
-BERT for Ranking
-----------------
-There are two main approaches in text ranking. The first one is interaction-based which is relatively accurate but
-works slow and the second one is representation-based which is less accurate but faster [3]_.
-The interaction-based ranking based on BERT is represented in the DeepPavlov with two main components
-:class:`~deeppavlov.models.preprocessors.bert_preprocessor.BertRankerPreprocessor` on TensorFlow
-(:class:`~deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchBertRankerPreprocessor` on PyTorch)
-and :class:`~deeppavlov.models.bert.bert_ranker.BertRankerModel` on TensorFlow
-(:class:`~deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel` on PyTorch)
-and the representation-based ranking with components
-:class:`~deeppavlov.models.preprocessors.bert_preprocessor.BertSepRankerPreprocessor`
-and :class:`~deeppavlov.models.bert.bert_ranker.BertSepRankerModel` on TensorFlow.
-Additional components
-:class:`~deeppavlov.models.preprocessors.bert_preprocessor.BertSepRankerPredictorPreprocessor`
-and :class:`~deeppavlov.models.bert.bert_ranker.BertSepRankerPredictor` (on TensorFlow) are for usage in the ``interact`` mode
-where the task for ranking is to retrieve the best possible response from some provided response base with the help of
-the trained model. Working examples with the trained models are given :doc:`here `.
-Statistics are available :doc:`here `.
-
Using custom BERT in DeepPavlov
-------------------------------
@@ -195,7 +176,7 @@ the :doc:`config ` file must be changed to match new BERT
.. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213.
.. [2] Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. (2019). `Tuning Multilingual Transformers for Language-Specific Named Entity Recognition `__ . ACL anthology W19-3712.
-.. [3] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions. arXiv preprint arXiv:1809.01682.
+.. [3] Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108.
.. [4] Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017.
.. [5] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016)
.. [6] Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016.
@@ -205,4 +186,3 @@ the :doc:`config ` file must be changed to match new BERT
.. [10] Williams A., Bowman S. (2018) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint arXiv:1809.05053
.. [11] S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. (2015) A large annotated corpus for learning natural language inference. arXiv preprint arXiv:1508.05326
.. [12] N. Reimers, I. Gurevych (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084
-.. [13] Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108.
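As the section above notes, the ``deeppavlov_pytorch`` checkpoints are meant to be
loaded with HuggingFace's Transformers directly. A minimal sketch of that route (the
checkpoint name below is an assumed example, substitute any of the published ones):

.. code:: python

    from transformers import AutoModel, AutoTokenizer

    name = 'DeepPavlov/rubert-base-cased'  # assumed example checkpoint
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModel.from_pretrained(name)

    batch = tokenizer(['Привет, мир!'], return_tensors='pt')
    outputs = model(**batch)
    print(outputs.last_hidden_state.shape)  # (1, sequence_length, 768) for base models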
diff --git a/docs/features/models/classifiers.rst b/docs/features/models/classifiers.rst
index 64905b4803..b31d34d63b 100644
--- a/docs/features/models/classifiers.rst
+++ b/docs/features/models/classifiers.rst
@@ -6,7 +6,7 @@ which are implemented as a number of different **neural networks** or **sklearn
Models can be used for binary, multi-class or multi-label classification.
List of available classifiers (more info see below):
-* **BERT classifier** (see :doc:`here `) builds BERT [8]_ architecture for classification problem on **TensorFlow** or on **PyTorch**.
+* **BERT classifier** (see :doc:`here `) builds BERT [5]_ architecture for classification problem on **TensorFlow** or on **PyTorch**.
* **Keras classifier** (see :doc:`here `) builds neural network on Keras with tensorflow backend.
@@ -27,18 +27,18 @@ Command line
python -m deeppavlov install
where ```` is a path to one of the :config:`provided config files `
-or its name without an extension, for example :config:`"intents_snips" `.
+or its name without an extension, for example :config:`"insults_kaggle_bert" `.
To download pre-trained models, vocabs, embeddings on the dataset of interest one should run the following command
providing corresponding name of the config file (see above)
-or provide flag ``-d`` for commands like ``interact``, ``telegram``, ``train``, ``evaluate``.:
+or provide flag ``-d`` for commands like ``interact``, ``telegram``, ``train``, ``evaluate``:
.. code:: bash
python -m deeppavlov download
where ```` is a path to one of the :config:`provided config files `
-or its name without an extension, for example :config:`"intents_snips" `.
+or its name without an extension, for example :config:`"insults_kaggle_bert" `.
When using KerasClassificationModel for **Windows** platforms one has to set `KERAS_BACKEND` to `tensorflow`:
@@ -53,7 +53,7 @@ When using KerasClassificationModel for **Windows** platforms one have to set `K
python -m deeppavlov interact [-d]
where ```` is a path to one of the :config:`provided config files `
-or its name without an extension, for example :config:`"intents_snips" `.
+or its name without an extension, for example :config:`"insults_kaggle_bert" `.
With the optional ``-d`` parameter all the data required to run selected pipeline will be **downloaded**.
**TRAIN** After preparing the config file (including change of dataset, pipeline elements or parameters)
@@ -73,7 +73,7 @@ Then training can be run in the following way:
python -m deeppavlov train
where ```` is a path to one of the :config:`provided config files `
-or its name without an extension, for example :config:`"intents_snips" `.
+or its name without an extension, for example :config:`"insults_kaggle_bert" `.
With the optional ``-d`` parameter all the data required to run selected pipeline will be **downloaded**.
Python code
@@ -94,34 +94,30 @@ Then one can build and interact a model from configuration file:
.. code:: python
- from deeppavlov import build_model, configs
+ from deeppavlov import build_model
- CONFIG_PATH = configs.classifiers.intents_snips # could also be configuration dictionary or string path or `pathlib.Path` instance
+ model = build_model('insults_kaggle_bert', download=True) # downloads the model and data if necessary
- model = build_model(CONFIG_PATH, download=True) # in case of necessity to download some data
+ model = build_model('insults_kaggle_bert', download=False) # otherwise
- model = build_model(CONFIG_PATH, download=False) # otherwise
+ print(model(["You are dumb", "He lay flat on the brown, pine-needled floor of the forest"]))
- print(model(["What is the weather in Boston today?"]))
-
- >>> [['GetWeather']]
+ >>> ['Insult', 'Not Insult']
**TRAIN** Also training can be run in the following way:
.. code:: python
- from deeppavlov import train_model, configs
-
- CONFIG_PATH = configs.classifiers.intents_snips # could also be configuration dictionary or string path or `pathlib.Path` instance
+ from deeppavlov import train_model
- model = train_model(CONFIG_PATH, download=True) # in case of necessity to download some data
+ model = train_model('insults_kaggle_bert', download=True) # downloads the data if necessary
- model = train_model(CONFIG_PATH, download=False) # otherwise
+ model = train_model('insults_kaggle_bert', download=False) # otherwise
BERT models
-----------
-BERT (Bidirectional Encoder Representations from Transformers) [8]_ is a Transformer pre-trained on masked language model
+BERT (Bidirectional Encoder Representations from Transformers) [5]_ is a Transformer pre-trained on masked language model
and next sentence prediction tasks. This approach showed state-of-the-art results on a wide range of NLP tasks in
English.
@@ -188,7 +184,7 @@ Therefore, for sklearn component classifier one should set ``ensure_list_output`
Pre-trained models
------------------
-We also provide with **pre-trained models** for classification on DSTC 2 dataset, SNIPS dataset, "AG News" dataset,
+We also provide **pre-trained models** for classification on the DSTC 2 dataset, the "AG News" dataset,
"Detecting Insults in Social Commentary", Twitter sentiment in Russian dataset.
`DSTC 2 dataset `__ does not initially contain information about **intents**,
@@ -226,24 +222,6 @@ In the original dataset this user reply has characteristics
This message contains two intents ``(thankyou, bye)``. Train, valid and
test division is the same as on web-site.
-`SNIPS dataset `__
-contains **intent classification** task for 7 intents (approximately 2.4
-samples per intent):
-
-- GetWeather
-- BookRestaurant
-- PlayMusic
-- AddToPlaylist
-- RateBook
-- SearchScreeningEvent
-- SearchCreativeWork
-
-Initially, classification model on SNIPS dataset [7]_ was trained only as an
-example of usage that is why we provide pre-trained model for SNIPS with
-embeddings trained on DSTC-2 dataset that is not the best choice for
-this task. Train set is divided to train and validation sets to
-illustrate ``basic_classification_iterator`` work.
-
`Detecting Insults in Social Commentary dataset `__
contains binary classification task for **detecting insults** for
participants of conversation. Train, valid and test division is the same
@@ -291,69 +269,22 @@ of sentences. Each sentence were initially labelled with floating point value fr
the floating point labels are converted to integer labels according to the intervals `[0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0]`
corresponding to `very negative`, `negative`, `neutral`, `positive`, `very positive` classes.
-`Yelp Reviews `__ contains 5-classes **sentiment classification** of product reviews.
-The labels are `1`, `2`, `3`, `4`, `5` corresponding to `very negative`, `negative`, `neutral`, `positive`, `very positive` classes.
-The reviews are long enough (cut up to 200 subtokens).
-
+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
| Task | Dataset | Lang | Model | Metric | Valid | Test | Downloads |
+==================+====================+======+=================================================================================================+=============+========+========+===========+
-| 28 intents | `DSTC 2`_ | En | :config:`DSTC 2 emb ` | Accuracy | 0.7613 | 0.7733 | 800 Mb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`Wiki emb ` | | 0.9629 | 0.9617 | 8.5 Gb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`BERT ` | | 0.9673 | 0.9636 | 800 Mb |
-+------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| 7 intents | `SNIPS-2017`_ [7]_ | | :config:`DSTC 2 emb ` | F1-macro | 0.8591 | -- | 800 Mb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`Wiki emb ` | | 0.9820 | -- | 8.5 Gb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`Tfidf + SelectKBest + PCA + Wiki emb ` | | 0.9673 | -- | 8.6 Gb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`Wiki emb weighted by Tfidf ` | | 0.9786 | -- | 8.5 Gb |
-+------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Insult detection | `Insults`_ | | :config:`Reddit emb ` | ROC-AUC | 0.9263 | 0.8556 | 6.2 Gb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`English BERT ` | | 0.9255 | 0.8612 | 1200 Mb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`English Conversational BERT ` | | 0.9389 | 0.8941 | 1200 Mb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`English BERT on PyTorch ` | | 0.9329 | 0.877 | 1.1 Gb |
-+------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| 5 topics | `AG News`_ | | :config:`Wiki emb ` | Accuracy | 0.8922 | 0.9059 | 8.5 Gb |
-+------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Intent |`Yahoo-L31`_ | | :config:`Yahoo-L31 on conversational BERT ` | ROC-AUC | 0.9436 | -- | 1200 Mb |
+| Insult detection | `Insults`_ | En | :config:`English BERT ` | ROC-AUC | 0.9327 | 0.8602 | 1.1 Gb |
+------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
| Sentiment |`SST`_ | | :config:`5-classes SST on conversational BERT ` | Accuracy | 0.6456 | 0.6715 | 400 Mb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`5-classes SST on multilingual BERT ` | | 0.5738 | 0.6024 | 660 Mb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`3-classes SST SWCNN on PyTorch ` | | 0.7379 | 0.6312 | 4.3 Mb |
-+ +--------------------+ +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| |`Yelp`_ | | :config:`5-classes Yelp on conversational BERT ` | | 0.6925 | 0.6842 | 400 Mb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`5-classes Yelp on multilingual BERT ` | | 0.5896 | 0.5874 | 660 Mb |
+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Sentiment |`Twitter mokoron`_ | Ru | :config:`RuWiki+Lenta emb w/o preprocessing ` | | 0.9965 | 0.9961 | 6.2 Gb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`RuWiki+Lenta emb with preprocessing ` | | 0.7823 | 0.7759 | 6.2 Gb |
+| Sentiment |`Twitter mokoron`_ | Ru | :config:`RuWiki+Lenta emb w/o preprocessing ` | F1-macro | 0.9965 | 0.9961 | 6.2 Gb |
+ +--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| |`RuSentiment`_ | | :config:`RuWiki+Lenta emb ` | F1-weighted | 0.6541 | 0.7016 | 6.2 Gb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`Twitter emb super-convergence ` [6]_ | | 0.7301 | 0.7576 | 3.4 Gb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`ELMo ` | | 0.7519 | 0.7875 | 700 Mb |
-+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
-| | | | :config:`Multi-language BERT ` | | 0.6809 | 0.7193 | 1900 Mb |
+| |`RuSentiment`_ | | :config:`Multi-language BERT ` | F1-weighted | 0.6809 | 0.7193 | 1900 Mb |
+ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+
| | | | :config:`Conversational RuBERT ` | | 0.7548 | 0.7742 | 657 Mb |
-+------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Intent |Ru like`Yahoo-L31`_ | | :config:`Conversational vs Informational on ELMo ` | ROC-AUC | 0.9412 | -- | 700 Mb |
+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
.. _`DSTC 2`: http://camdial.org/~mh521/dstc/
-.. _`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines
.. _`Insults`: https://www.kaggle.com/c/detecting-insults-in-social-commentary
.. _`AG News`: https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html
.. _`Twitter mokoron`: http://study.mokoron.com/
@@ -362,7 +293,6 @@ The reviews are long enough (cut up to 200 subtokens).
.. _`Yahoo-L31`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l
.. _`Yahoo-L6`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l
.. _`SST`: https://nlp.stanford.edu/sentiment/index.html
-.. _`Yelp`: https://www.yelp.com/dataset
GLUE Benchmark
--------------
@@ -422,43 +352,13 @@ Then training process can be run in the same way:
python -m deeppavlov train
-Comparison
-----------
-
-The comparison of the presented model is given on **SNIPS** dataset [7]_. The
-evaluation of model scores was conducted in the same way as in [3]_ to
-compare with the results from the report of the authors of the dataset.
-The results were achieved with tuning of parameters and embeddings
-trained on Reddit dataset.
-
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| Model | AddToPlaylist | BookRestaurant | GetWheather | PlayMusic | RateBook | SearchCreativeWork | SearchScreeningEvent |
-+========================+=================+==================+===============+==============+==============+======================+========================+
-| api.ai | 0.9931 | 0.9949 | 0.9935 | 0.9811 | 0.9992 | 0.9659 | 0.9801 |
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| ibm.watson | 0.9931 | 0.9950 | 0.9950 | 0.9822 | 0.9996 | 0.9643 | 0.9750 |
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| microsoft.luis | 0.9943 | 0.9935 | 0.9925 | 0.9815 | 0.9988 | 0.9620 | 0.9749 |
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| wit.ai | 0.9877 | 0.9913 | 0.9921 | 0.9766 | 0.9977 | 0.9458 | 0.9673 |
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| snips.ai | 0.9873 | 0.9921 | 0.9939 | 0.9729 | 0.9985 | 0.9455 | 0.9613 |
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| recast.ai | 0.9894 | 0.9943 | 0.9910 | 0.9660 | 0.9981 | 0.9424 | 0.9539 |
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| amazon.lex | 0.9930 | 0.9862 | 0.9825 | 0.9709 | 0.9981 | 0.9427 | 0.9581 |
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-| Shallow-and-wide CNN | **0.9956** | **0.9973** | **0.9968** | **0.9871** | **0.9998** | **0.9752** | **0.9854** |
-+------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+
-
How to improve the performance
------------------------------
-- One can use FastText [4]_ to train embeddings that are better suited
+- One can use FastText [3]_ to train embeddings that are better suited
for considered datasets.
- One can use some custom preprocessing to clean texts.
-- One can use ELMo [5]_ or BERT [8]_.
+- One can use ELMo [4]_ or BERT [5]_.
- All the parameters should be tuned on the validation set.
References
@@ -468,14 +368,8 @@ References
.. [2] Ю. В. Рубцова. Построение корпуса текстов для настройки тонового классификатора // Программные продукты и системы, 2015, №1(109), –С.72-78
-.. [3] https://www.slideshare.net/KonstantinSavenkov/nlu-intent-detection-benchmark-by-intento-august-2017
-
-.. [4] P. Bojanowski\ *, E. Grave*, A. Joulin, T. Mikolov, Enriching Word Vectors with Subword Information.
-
-.. [5] Peters, Matthew E., et al. "Deep contextualized word representations." arXiv preprint arXiv:1802.05365 (2018).
-
-.. [6] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018.
+.. [3] P. Bojanowski\ *, E. Grave*, A. Joulin, T. Mikolov, Enriching Word Vectors with Subword Information.
-.. [7] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018.
+.. [4] Peters, Matthew E., et al. "Deep contextualized word representations." arXiv preprint arXiv:1802.05365 (2018).
-.. [8] Devlin J. et al. Bert: Pre-training of deep bidirectional transformers for language understanding //arXiv preprint arXiv:1810.04805. – 2018.
+.. [5] Devlin J. et al. Bert: Pre-training of deep bidirectional transformers for language understanding //arXiv preprint arXiv:1810.04805. – 2018.
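The comments removed above mentioned that ``build_model`` and ``train_model`` also
accept a configuration dictionary, a string path, or a ``pathlib.Path`` instance. A
hedged sketch of the dictionary route, assuming the ``train.batch_size`` key layout
used by the classifier configs:

.. code:: python

    from deeppavlov import build_model, configs
    from deeppavlov.core.common.file import read_json

    config = read_json(configs.classifiers.insults_kaggle_bert)
    config['train']['batch_size'] = 32  # tweak any field before building
    model = build_model(config, download=True)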
diff --git a/docs/features/models/morphotagger.rst b/docs/features/models/morphotagger.rst
index e8e7769cd5..dd21787edf 100644
--- a/docs/features/models/morphotagger.rst
+++ b/docs/features/models/morphotagger.rst
@@ -2,60 +2,19 @@ Neural Morphological Tagging
============================
It is an implementation of a neural morphological tagger.
-As for now (November, 2019) we have two types of models:
-the BERT-based ones (available only for Russian) and
-the character-based bidirectional LSTM. The BERT model
-includes only a dense layer on the top of BERT embedder.
+The model includes only a dense layer on top of the BERT embedder.
See the `BERT paper `__
for a more complete description, as well as the
`BERT section `__ of the documentation.
-We plan to release more BERT-based models in near future.
-Most of our models follow
-`Heigold et al., 2017. An extensive empirical evaluation of
-character-based morphological tagging for 14
-languages `__.
-They also achieve the state-of-the-art performance among open source
-systems.
-
-The BERT-based model is trained on `Universal
-Dependencies corpora `__
-(version 2.3), while all the other models were trained
-on Universal Dependencies 2.0 corpora.
+The model is trained on `Universal Dependencies corpora `__
+(version 2.3).
+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| Language | Code | UDPipe accuracy | UDPipe Future accuracy [#f1]_ | Our top accuracy | Model size (MB)|
+| Language | Code | UDPipe accuracy | UDPipe Future accuracy | Our top accuracy | Model size (MB)|
+================+==============+=================+===============================+==================+================+
-| Arabic | ar | 88.31 | | 90.85 | 23.7 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| Czech | cs | 91.86 | | 94.35 | 41.8 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| English | en | 92.53 | | 93.00 | 16.9 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| French | fr | 95.25 | | 95.45 | 19.0 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| German | de | 76.65 | | 83.83 | 18.6 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| Hindi | hi | 87.74 | | 90.01 | 21.9 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| Hungarian | hu | 69.52 | | 75.34 | 15.4 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| Italian | it | 96.33 | | 96.47 | 32.0 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| Russian | ru_syntagrus | 93.57 | | 96.23 | 48.7 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
| Russian (UD2.3)| ru_syntagrus | 93.5 | 96.90 | 97.83 | 661 |
+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| Spanish | es_ancora | 96.88 | | 97.00 | 20.8 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-| Turkish | tr | 86.98 | | 88.03 | 16.1 |
-+----------------+--------------+-----------------+-------------------------------+------------------+----------------+
-
-.. rubric:: Footnotes
-
-.. [#f1] No models available, only the source code. The scores are taken from
- `Straka. UDPipe 2.0 Prototype at CoNLL 2018 UD Shared Task. `__.
-
===========================
Usage examples.
@@ -65,7 +24,7 @@ Before using the model make sure that all required packages are installed using
.. code:: bash
- python -m deeppavlov install morpho_ru_syntagrus_pymorphy
+ python -m deeppavlov install morpho_ru_syntagrus_bert
For Windows platform one has to set `KERAS_BACKEND` to `tensorflow` (it could be done only once):
@@ -84,82 +43,10 @@ For Windows platform if one did not set `KERAS_BACKEND` to `tensorflow` from com
os.environ["KERAS_BACKEND"] = "tensorflow"
-
-.. code:: python
-
- from deeppavlov import build_model, configs
- model = build_model(configs.morpho_tagger.UD2_0.morpho_ru_syntagrus_pymorphy, download=True)
- sentences = ["Я шёл домой по незнакомой улице.", "Девушка пела в церковном хоре о всех уставших в чужом краю."]
- for parse in model(sentences):
- print(parse)
-
-If you want to use the obtained tags further in Python, just split the output using tabs and newlines.
-
-You may also pass the tokenized sentences instead of raw ones:
-
-.. code:: python
-
- sentences = [["Я", "шёл", "домой", "по", "незнакомой", "улице", "."]]
- for parse in model(sentences):
- print(parse)
-
-If your data is large, you can call
-:meth:`~deeppavlov.core.common.chainer.Chainer.batched_call` method of the model, which will additionally
-separate you list of sentences into small batches.
-
-.. code:: python
-
- from deeppavlov import build_model, configs
- model = build_model(configs.morpho_tagger.UD2_0.morpho_ru_syntagrus_pymorphy, download=True)
- sentences = ["Я шёл домой по незнакомой улице.", "Девушка пела в церковном хоре о всех уставших в чужом краю."]
- for parse in model.batched_call(sentences, batch_size=16):
- print(parse)
-
-::
-
- 1 Я PRON,Case=Nom|Number=Sing|Person=1 _
- 2 шёл VERB,Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act _
- 3 домой ADV,Degree=Pos _
- 4 по ADP _
- 5 незнакомой ADJ,Case=Dat|Degree=Pos|Gender=Fem|Number=Sing _
- 6 улице NOUN,Animacy=Inan|Case=Dat|Gender=Fem|Number=Sing _
- 7 . PUNCT _
-
- 1 Девушка NOUN,Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing _
- 2 пела VERB,Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act _
- 3 в ADP _
- 4 церковном ADJ,Case=Loc|Degree=Pos|Gender=Masc|Number=Sing _
- 5 хоре NOUN,Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing _
- 6 о ADP _
- 7 всех PRON,Animacy=Anim|Case=Loc|Number=Plur _
- 8 уставших VERB,Aspect=Perf|Case=Loc|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act _
- 9 в ADP _
- 10 чужом ADJ,Case=Loc|Degree=Pos|Gender=Masc|Number=Sing _
- 11 краю NOUN,Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing _
- 12 . PUNCT _
-
-If you want the output in UD format, try setting ``"data_format": ud`` in the ``tag_output_prettifier`` section
-of :config:`configuration file `
-you import.
-
-Advanced models (BERT and lemmatized models).
----------------------------------------------
-
-#. For Russian you can use the BERT-based model. It has much higher performance (97.8% instead of 96.2),
- however, you need a more powerful GPU (ideally, 16 GB) to train it. However, the speed
- of inference and training on such GPU is comparable with character-based model.
-
-#. Exclusively for Russian language you can obtain lemmatized UD output by using either the
- :config:`BERT model `
- :config:`augmented version `
- of Pymorphy model. Both models select the Pymorphy lemma whose tag correspond to the tag
- predicted by the tagger.
-
.. code:: python
from deeppavlov import build_model, configs
- model = build_model(configs.morpho_tagger.BERT.morpho_ru_syntagrus_bert, download=True)
- # model = build_model(configs.morpho_tagger.UD2_0.morpho_ru_syntagrus_pymorphy_lemmatize, download=True)
+ model = build_model('morpho_ru_syntagrus_bert', download=True)
sentences = ["Я шёл домой по незнакомой улице.", "Девушка пела в церковном хоре о всех уставших в чужом краю."]
for parse in model(sentences):
print(parse)
@@ -187,129 +74,14 @@ Advanced models (BERT and lemmatized models).
11 краю край NOUN _ Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing _ _ _ _
12 . . PUNCT _ _ _ _ _ _
-Command line:
-----------------
-
-If you want to use our models from scratch, do the following
-(all the examples are for ru\_syntagrus\_pymorphy model,
-change the filenames accordingly to invoke models for other languages):
-
-#. Download data
-
- .. code:: bash
-
- python -m deeppavlov download morpho_ru_syntagrus_pymorphy
-
- To perform all downloads in runtime you can also run all subsequent
- commands with ``-d`` key,
-
-#. To apply a pre-trained ru\_syntagrus\_pymorphy model to ru\_syntagrus test
- data provided it was downloaded using the previous command, run
-
- .. code:: bash
-
- python -m deeppavlov.models.morpho_tagger morpho_ru_syntagrus_pymorphy \
- > -f ~/.deeppavlov/downloads/UD2.0_source/ru_syntagrus/ru_syntagrus-ud-test.conllu
-
- ``-f`` argument points to the path to the test data. If you do not pass it the model expects data from ``stdin``.
- This command writes the output to stdout, you can redirect it using standard ``>`` notation.
-
- - By default the ``deeppavlov.models.morpho_tagger`` script expects the data to be in CoNLL-U format,
- however, you can specify input format by using the `-i` key. For example, your input can be in one word per line
- format, in this case you set this key to ``"vertical"``. Note also that you can pass the data from
-
- .. code:: bash
-
- echo -e "Мама\nмыла\nраму\n.\n\nВаркалось\n,\nхливкие\nшорьки\nпырялись\nпо\nнаве\n." \
- > | python -m deeppavlov.models.morpho_tagger morpho_ru_syntagrus_pymorphy -i "vertical"
-
- ::
-
- 1 Мама NOUN Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing
- 2 мыла VERB Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act
- 3 раму NOUN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing
- 4 . PUNCT _
-
- 1 Варкалось NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
- 2 , PUNCT _
- 3 хливкие ADJ Case=Nom|Degree=Pos|Number=Plur
- 4 шорьки NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Plur
- 5 пырялись VERB Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid
- 6 по ADP _
- 7 наве NOUN Animacy=Inan|Case=Dat|Gender=Masc|Number=Sing
- 8 . PUNCT _
-
-
- - Untokenized sentences (one sentence per line) can be tagged as well, in this case input format should be ``"text"``
-
- .. code:: bash
-
- echo -e "Мама мыла раму.\nВаркалось, хливкие шорьки пырялись по наве." \
- > | python -m deeppavlov.models.morpho_tagger morpho_ru_syntagrus_pymorphy -i "text"
-
- ::
- 1 Мама NOUN Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing
- 2 мыла VERB Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act
- 3 раму NOUN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing
- 4 . PUNCT _
-
- 1 Варкалось NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
- 2 , PUNCT _
- 3 хливкие ADJ Case=Nom|Degree=Pos|Number=Plur
- 4 шорьки NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Plur
- 5 пырялись VERB Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid
- 6 по ADP _
- 7 наве NOUN Animacy=Inan|Case=Dat|Gender=Masc|Number=Sing
- 8 . PUNCT _
-
- - You can also obtain the output in CoNLL-U format by passing the ``-o ud`` argument:
-
- .. code:: bash
-
- echo -e "Мама мыла раму.\nВаркалось, хливкие шорьки пырялись по наве." \
- > | python -m deeppavlov.models.morpho_tagger morpho_ru_syntagrus_pymorphy -i "text" -o "ud"
-
- ::
-
- 1 Мама _ NOUN _ Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing _ _ _ _
- 2 мыла _ VERB _ Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act _ _ _ _
- 3 раму _ NOUN _ Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing _ _ _ _
- 4 . _ PUNCT _ _ _ _ _ _
-
- 1 Варкалось _ NOUN _ Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing _ _ _ _
- 2 , _ PUNCT _ _ _ _ _ _
- 3 хливкие _ ADJ _ Case=Nom|Degree=Pos|Number=Plur _ _ _ _
- 4 шорьки _ NOUN _ Animacy=Inan|Case=Nom|Gender=Masc|Number=Plur _ _ _ _
- 5 пырялись _ VERB _ Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid _ _ _ _
- 6 по _ ADP _ _ _ _ _ _
- 7 наве _ NOUN _ Animacy=Inan|Case=Dat|Gender=Masc|Number=Sing _ _ _ _
- 8 . _ PUNCT _ _ _ _ _ _
-
-
-#. To evaluate ru\_syntagrus model on ru\_syntagrus test subset, run
-
- .. code:: bash
-
- python -m deeppavlov evaluate morpho_ru_syntagrus_pymorphy
-
-#. To retrain model on ru\_syntagrus dataset, run one of the following
- (the first is for Pymorphy-enriched model)
-
- .. code:: bash
-
- python -m deeppavlov train morpho_ru_syntagrus_pymorphy
- python -m deeppavlov train morpho_ru_syntagrus
-
- Be careful, one epoch takes 2-60 minutes depending on your GPU.
-
-#. To tag Russian sentences from stdin, run
-
- .. code:: bash
+You may also pass the tokenized sentences instead of raw ones:
- python -m deeppavlov interact morpho_ru_syntagrus_pymorphy
+.. code:: python
-Read the detailed readme below.
+ sentences = [["Я", "шёл", "домой", "по", "незнакомой", "улице", "."]]
+ for parse in model(sentences):
+ print(parse)
Task description
----------------
@@ -349,336 +121,3 @@ project `__.
6 were AUX Mood=Ind|Tense=Past|VerbForm=Fin
7 married VERB Tense=Past|VerbForm=Part|Voice=Pass
8 . PUNCT _
-
-The full UD format (see below) includes more columns including lemma and
-syntactic information.
-
-Training data
-~~~~~~~~~~~~~
-
-Our tagger accepts the data in `CONLL-U
-format `__:
-
-::
-
- 1 Four four NUM CD NumType=Card 2 nummod _ _
- 2 months month NOUN NNS Number=Plur 3 obl:npmod _ _
- 3 later later ADV RB _ 7 advmod _ SpaceAfter=No
- 4 , , PUNCT , _ 7 punct _ _
- 5 we we PRON PRP Case=Nom|Number=Plur|Person=1|PronType=Prs 7 nsubj:pass _ _
- 6 were be AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 7 aux:pass _ _
- 7 married marry VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ SpaceAfter=No
- 8 . . PUNCT . _ 7 punct _ _
-
-It does not take into account the contents except the columns number
-2, 4, 6
-(the word itself, POS label and morphological tag), however, in the
-default setting the reader
-expects the word to be in column 2, the POS label in column 4 and the
-detailed tag description
-in column 6.
-
-Test data
-~~~~~~~~~
-
-When annotating unlabeled text, our model expects the data in
-10-column UD format as well. However, it does not pay attention to any column except the first one,
-which should be a number, and the second, which must contain a word.
-You can also pass only the words with exactly one word on each line
-by adding ``"from_words": True`` to ``dataset_reader`` section.
-Sentences are separated with blank lines.
-
-You can also pass the unlemmatized text as input. In this case it is preliminarly lemmatized using the
-NLTK ``word_tokenize`` function.
-
-Algorithm description
----------------------
-
-We adopt a neural model for morphological tagging from
-`Heigold et al., 2017. An extensive empirical evaluation of
-character-based morphological tagging for 14
-languages `__.
-We refer the reader to the paper for complete description of the
-algorithm. The tagger consists
-of two parts: a character-level network which creates embeddings for
-separate words and word-level
-recurrent network which transforms these embeddings to morphological
-tags.
-
-The character-level part implements the model from
-`Kim et al., 2015. Character-aware language
-models `__.
-First it embeds the characters into dense vectors, then passes these
-vectors through multiple
-parallel convolutional layers and concatenates the output of these
-convolutions. The convolution
-output is propagated through a highway layer to obtain the final word
-representation.
-
-You can optionally use a morphological dictionary during tagging. In
-this case our model collects
-a 0/1 vector with ones corresponding to the dictionary tags of a
-current word. This vector is
-passed through a one-layer perceptron to obtain an embedding of
-dictionary information.
-This embedding is concatenated with the output of character-level
-network.
-
-As a word-level network we utilize a Bidirectional LSTM, its outputs
-are projected through a dense
-layer with a softmax activation. In principle, several BiLSTM layers
-may be stacked as well
-as several convolutional or highway layers on character level;
-however, we did not observed
-any sufficient gain in performance and use shallow architecture
-therefore.
-
-Model configuration.
---------------------
-
-Training configuration
-~~~~~~~~~~~~~~~~~~~~~~
-
-We distribute pre-trained models for 11 languages trained on Universal Dependencies data.
-Configuration files for reproducible training are also available in
-:config:`deeppavlov/configs/morpho_tagger/UD2.0 `, for
-example
-:config:`deeppavlov/configs/morpho_tagger/UD2.0/morpho_en.json `.
-The configuration file consists of several parts:
-
-Dataset Reader
-^^^^^^^^^^^^^^
-
-The dataset reader describes the instance of
-:class:`~deeppavlov.dataset_readers.morphotagging_dataset_reader.MorphotaggerDatasetReader` class.
-
-::
-
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": "{DOWNLOADS_PATH}/UD2.0_source",
- "language": "en", "data_types": ["train", "dev", "test"]
- }
-
-``class_name`` field refers to the class MorphotaggerDatasetReader,
-``data_path`` contains the path to data directory, the ``language``
-field is used to derive the name of training and development file.
-Alternatively, you can specify these files separately by full (or absolute) paths
-like
-
-::
-
- "dataset_reader": {
- "class_name": "morphotagger_dataset_reader",
- "data_path": ["{DOWNLOADS_PATH}/UD2.0_source/en-ud-train.conllu",
- "{DOWNLOADS_PATH}/UD2.0_source/en-ud-dev.conllu",
- "{DOWNLOADS_PATH}/UD2.0_source/en-ud-test.conllu"]
- "data_types": ["train", "dev", "test"]
- }
-
-By default you need only the train file, the dev file is used to
-validate
-your model during training and the test file is for model evaluation
-after training. Since you need some validation data anyway, without
-the dev part
-you need to resplit your data as described in `Dataset
-Iterator <#dataset-iterator>`__ section.
-
-Your data should be in CONLL-U format. It refers to ``predict`` mode also, but in this case only word
-column is taken into account. If your data is in single word per line format and you do not want to
-reformat it, add ``"from_words": True`` to ``dataset_reader`` section. You can also specify
-which columns contain words, tags and detailed tags, for documentation see
-:func:`Documentation `.
-
-Dataset iterator
-^^^^^^^^^^^^^^^^
-
-:class:`Dataset iterator ` class
-performs simple batching and shuffling.
-
-::
-
- "dataset_iterator": {
- "class_name": "morphotagger_dataset"
- }
-
-By default it has no parameters, but if your training and validation
-data
-are in the same file, you may specify validation split here:
-
-::
-
- "dataset_iterator": {
- "class_name": "morphotagger_dataset",
- "validation_split": 0.2
- }
-
-Chainer
-^^^^^^^
-
-The ``chainer`` part of the configuration file contains the
-specification of the neural network model and supplementary things such as vocabularies.
-Chainer refers to an instance of :class:`~deeppavlov.core.common.chainer.Chainer`, see
-:doc:`configuration ` for a complete description.
-
-The major part of ``chainer`` is ``pipe``. The ``pipe`` contains
-vocabularies and the network itself as well
-as some pre- and post- processors. The first part lowercases the input
-and normalizes it (see :class:`~deeppavlov.models.preprocessors.capitalization.CapitalizationPreprocessor`).
-
-::
-
- "pipe": [
- {
- "id": "lowercase_preprocessor",
- "class_name": "lowercase_preprocessor",
- "in": ["x"],
- "out": ["x_processed"]
- },
-
-The second part is the tag vocabulary which transforms tag labels the
-model should predict to tag indexes.
-
-::
-
- {
- "id": "tag_vocab",
- "class_name": "simple_vocab",
- "fit_on": ["y"],
- "special_tokens": ["PAD", "BEGIN", "END"],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tag_en.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tag_en.dict"
- },
-
-The third part is the character vocabulary used to represent words as sequences of indexes. Only the
-symbols which occur at least ``min_freq`` times in the training set are kept.
-
-::
-
- {
- "id": "char_vocab",
- "class_name": "simple_vocab",
- "min_freq": 3,
- "fit_on": ["x_processed"],
- "special_tokens": ["PAD", "BEGIN", "END"],
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/char_en.dict",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/char_en.dict"
- },
-
-
-If you want to utilize external morphological knowledge, you can do it in two ways.
-The first is to use :class:`~deeppavlov.models.vectorizers.word_vectorizer.DictionaryVectorizer`.
-:class:`~deeppavlov.models.vectorizers.word_vectorizer.DictionaryVectorizer` is instantiated from a dictionary file.
-Each line of a dictionary file contains two columns:
-a word and a space-separated list of its possible tags. Tags can be in any possible format. The config part for
-:class:`~deeppavlov.models.vectorizers.word_vectorizer.DictionaryVectorizer` looks as
-
-::
-
- {
- "id": "dictionary_vectorizer",
- "class_name": "dictionary_vectorizer",
- "load_path": PATH_TO_YOUR_DICTIONARY_FILE,
- "save_path": PATH_TO_YOUR_DICTIONARY_FILE,
- "in": ["x"],
- "out": ["x_possible_tags"]
- }
-
-
-The second variant for external morphological dictionary, available only for Russian,
-is `Pymorphy2 `_. In this case the vectorizer list all Pymorphy2 tags
-for a given word and transforms them to UD2.0 format using
-`russian-tagsets `_ library. Possible UD2.0 tags
-are listed in a separate distributed with the library. This part of the config look as
-(see :config:`config `))
-
-::
-
- {
- "id": "pymorphy_vectorizer",
- "class_name": "pymorphy_vectorizer",
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
- "max_pymorphy_variants": 5,
- "in": ["x"],
- "out": ["x_possible_tags"]
- }
-
-The next part performs the tagging itself. Together with general parameters it describes
-the input parameters of :class:`~deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger`) class.
-
-::
-
- {
- "in": ["x_processed"],
- "in_y": ["y"],
- "out": ["y_predicted"],
- "class_name": "morpho_tagger",
- "main": true,
- "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ud_en.hdf5",
- "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ud_en.hdf5",
- "tags": "#tag_vocab",
- "symbols": "#char_vocab",
- "verbose": 1,
- "char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7],
- "word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1,
- "char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1,
- "char_filter_multiple": 50, "intermediate_dropout": 0.0, "word_dropout": 0.2,
- "lstm_dropout": 0.3, "regularizer": 0.01, "lm_dropout": 0.3
- }
-
-
-When an additional vectorizer is used, the first line is changed to
-``"in": ["x_processed", "x_possible_tags"]`` and an additional parameter
-``"word_vectorizers": [["#pymorphy_vectorizer.dim", 128]]`` is appended.
-
-Config includes general parameters of :class:`~deeppavlov.core.models.component.Component` class,
-described in the :doc:`configuration ` and specific
-:class:`~deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger`
-parameters. The latter include
-
-- ``tags`` - tag vocabulary. ``#tag_vocab`` refers to an already defined model with ``"id" = "tag_vocab"``.
-- ``symbols`` - character vocabulary. ``#char_vocab`` refers to an already defined model with ``"id" = "char_vocab"``.
-
-and other specific parameters of the network, available in :class:`~deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger` documentation.
-
-The ``"train"`` section of ``"chainer"`` contains training parameters, such as number of epochs,
-batch_size and logging frequency, see general readme for more details.
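-
-A minimal sketch of such a section (the values here are purely illustrative, not the
-ones shipped with the model):
-
-::
-
-    "train": {
-      "epochs": 50,
-      "batch_size": 32,
-      "metrics": ["per_token_accuracy"],
-      "log_every_n_epochs": 1
-    }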
-
-**chainer** also includes the ``"prettifier"`` subsection, which describes the parameters
-of :class:`~deeppavlov.core.models.morpho_tagger.common.TagOutputPrettifier`,
-the component that transforms the predictions of the tagger into a readable form.
-
-::
-
- {
- "in": ["x", "y_predicted"],
- "out": ["y_prettified"],
- "class_name": "tag_output_prettifier",
- "end": "\\n"
- }
-
-
-It takes two inputs, the source sequence of words and the predicted sequence of tags,
-and produces output in the following format
-
-::
-
- 1 Это PRON Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing
- 2 чутко ADV Degree=Pos
- 3 фиксируют VERB
- Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act
- 4 энциклопедические ADJ Case=Nom|Degree=Pos|Number=Plur
- 5 издания NOUN Animacy=Inan|Case=Nom|Gender=Neut|Number=Plur
- 6 . PUNCT _
-
- 1 Four NUM NumType=Card
- 2 months NOUN Number=Plur
- 3 later ADV *
- 4 , PUNCT *
- 5 we PRON Case=Nom|Number=Plur|Person=1|PronType=Prs
- 6 were AUX Mood=Ind|Tense=Past|VerbForm=Fin
- 7 married VERB Tense=Past|VerbForm=Part|Voice=Pass
- 8 . PUNCT _
-
-To generate output in the 10-column CoNLL-U format, add ``"format_mode": "ud"`` to the described section.
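-
-As a usage sketch, the trained tagger can be run from Python like any other DeepPavlov
-model (the config name below is an assumption; substitute the morpho tagger config you
-actually use):
-
-.. code:: python
-
-    from deeppavlov import build_model
-
-    # builds the pipeline described above and downloads the pretrained files
-    model = build_model('morpho_ru_syntagrus_pymorphy', download=True)
-
-    print(model(['Мама мыла раму'])[0])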
diff --git a/docs/features/models/multitask_bert.rst b/docs/features/models/multitask_bert.rst
index 3f0f33021c..71e49c8622 100644
--- a/docs/features/models/multitask_bert.rst
+++ b/docs/features/models/multitask_bert.rst
@@ -25,7 +25,6 @@ insults, analyzes sentiment, and recognises named entities. Multi-task BERT conf
:config:`mt_bert_train_tutorial.json ` and for inference
:config:`mt_bert_inference_tutorial.json ` are based on configs
:config:`insults_kaggle_bert.json `,
-:config:`sentiment_sst_multi_bert.json `,
:config:`ner_conll2003_bert.json `.
We start with the ``metadata`` field of the configuration file. Multi-task BERT model is saved in
diff --git a/docs/features/models/nemo.rst b/docs/features/models/nemo.rst
deleted file mode 100644
index bfa3bd4421..0000000000
--- a/docs/features/models/nemo.rst
+++ /dev/null
@@ -1,164 +0,0 @@
-Speech recognition and synthesis (ASR and TTS)
-==============================================
-
-DeepPavlov contains models for automatic speech recognition (ASR) and text-to-speech synthesis (TTS) based on
-pre-built modules from `NeMo `__ (v0.10.0) - the NVIDIA toolkit for defining and
-building Conversational AI applications. Named arguments for module initialization are taken from the NeMo config file
-(not to be confused with the DeepPavlov config file that defines the model pipeline).
-
-Speech recognition
-------------------
-
-The ASR pipeline is based on Jasper, a CTC-based end-to-end model. The model transcribes speech samples without
-any additional alignment information. :class:`~deeppavlov.models.nemo.asr.NeMoASR` contains the following modules:
-
-- `AudioToMelSpectrogramPreprocessor `_ - uses arguments from ``AudioToMelSpectrogramPreprocessor`` section of the NeMo config file.
-- `JasperEncoder `__ - uses arguments from ``JasperEncoder`` section of the NeMo config file. Needs pretrained checkpoint.
-- `JasperDecoderForCTC `__ - uses arguments from ``JasperDecoder`` section of the NeMo config file. Needs pretrained checkpoint.
-- `GreedyCTCDecoder `__ - doesn't use any arguments.
-- :class:`~deeppavlov.models.nemo.asr.AudioInferDataLayer` - uses arguments from ``AudioToTextDataLayer`` section of the NeMo config file.
-
-The NeMo config file for ASR should contain the ``labels`` argument besides the named arguments for the modules above.
-``labels`` is the list of characters that the ASR model can output; the same list is used in model training.
-
-Speech synthesis
-----------------
-
-The TTS pipeline that creates human-audible speech from text is based on the Tacotron 2 and WaveGlow models.
-:class:`~deeppavlov.models.nemo.tts.NeMoTTS` contains the following modules:
-
-- `TextEmbedding `__ - uses arguments from ``TextEmbedding`` section of the NeMo config file. Needs pretrained checkpoint.
-- `Tacotron2Encoder `__ - uses arguments from ``Tacotron2Encoder`` section of the NeMo config file. Needs pretrained checkpoint.
-- `Tacotron2DecoderInfer `__ - uses arguments from ``Tacotron2Decoder`` section of the NeMo config file. Needs pretrained checkpoint.
-- `Tacotron2Postnet `__ - uses arguments from ``Tacotron2Postnet`` section of the NeMo config file. Needs pretrained checkpoint.
-- :class:`~deeppavlov.models.nemo.vocoder.WaveGlow` - uses arguments from ``WaveGlowNM`` section of the NeMo config file. Needs pretrained checkpoint.
-- :class:`~deeppavlov.models.nemo.vocoder.GriffinLim` - uses arguments from ``GriffinLim`` section of the NeMo config file.
-- :class:`~deeppavlov.models.nemo.tts.TextDataLayer` - uses arguments from ``TranscriptDataLayer`` section of the NeMo config file.
-
-The NeMo config file for TTS should contain the ``labels`` and ``sample_rate`` arguments besides the named arguments
-for the modules above. ``labels`` is the list of characters used in TTS model training.
-
-Audio encoding and decoding
----------------------------
-
-:func:`~deeppavlov.models.nemo.common.ascii_to_bytes_io` and :func:`~deeppavlov.models.nemo.common.bytes_io_to_ascii`
-were added to the library to provide a uniform interface for working with both text and audio data. These components
-encode binary data to an ASCII string and decode it back.
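-
-A minimal round-trip sketch (assuming, as the names suggest, that ``ascii_to_bytes_io``
-takes an ASCII string and returns a ``BytesIO`` object, while ``bytes_io_to_ascii``
-does the reverse):
-
-.. code:: python
-
-    from io import BytesIO
-
-    from deeppavlov.models.nemo.common import ascii_to_bytes_io, bytes_io_to_ascii
-
-    audio = BytesIO(b'RIFF....WAVE')          # some binary audio payload
-    ascii_str = bytes_io_to_ascii(audio)      # binary data -> ASCII-safe string
-    restored = ascii_to_bytes_io(ascii_str)   # ASCII string -> BytesIO again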
-
-Quick Start
------------
-
-Preparation
-~~~~~~~~~~~
-
-Install requirements and download model files.
-
-.. code:: bash
-
- python -m deeppavlov install asr_tts
- python -m deeppavlov download asr_tts
-
-The examples below use the `sounddevice `_ library. Install
-it with ``pip install sounddevice==0.3.15``. You may need to install the ``libportaudio2`` package with
-``sudo apt-get install libportaudio2`` to make ``sounddevice`` work.
-
-.. note::
- ASR reads and TTS generates single channel WAV files. Files transferred to ASR are resampled to the frequency
- specified in the NeMo config file (16 kHz for models from DeepPavlov configs).
-
-Speech recognition
-~~~~~~~~~~~~~~~~~~
-
-The DeepPavlov :config:`asr ` config contains a minimal pipeline for English speech recognition using the
-`QuartzNet15x5En `_ pretrained model.
-To record speech on your computer and print the transcription, run the following script:
-
-.. code:: python
-
- from io import BytesIO
-
- import sounddevice as sd
- from scipy.io.wavfile import write
-
- from deeppavlov import build_model, configs
-
- sr = 16000
- duration = 3
-
- print('Recording...')
- myrecording = sd.rec(duration*sr, samplerate=sr, channels=1)
- sd.wait()
- print('done')
-
- out = BytesIO()
- write(out, sr, myrecording)
-
- model = build_model(configs.nemo.asr)
- text_batch = model([out])
-
- print(text_batch[0])
-
-Speech synthesis
-~~~~~~~~~~~~~~~~
-
-The DeepPavlov :config:`tts ` config contains a minimal pipeline for speech synthesis using the
-`Tacotron2 `_ and
-`WaveGlow `_ pretrained models.
-To generate an audio file and save it to disk, run the following script:
-
-.. code:: python
-
- from deeppavlov import build_model, configs
-
- model = build_model(configs.nemo.tts)
- filepath_batch = model(['Hello world'], ['~/hello_world.wav'])
-
- print(f'Generated speech has successfully saved at {filepath_batch[0]}')
-
-Speech to speech
-~~~~~~~~~~~~~~~~
-
-The previous examples assume that the files with speech to recognize and the generated audio files are on the same
-system where DeepPavlov is running. The DeepPavlov :config:`asr_tts ` config allows sending files with
-speech to recognize and receiving files with generated speech from another system. This config recognizes the received
-speech and synthesizes it back as audio.
-
-Run ``asr_tts`` in REST API mode:
-
-.. code:: bash
-
- python -m deeppavlov riseapi asr_tts
-
-The following Python script assumes that you already have a file with speech to recognize. You can use the code from
-the speech recognition example to record speech on your system. Replace ``127.0.0.1`` with the address of the system
-where DeepPavlov is running.
-
-.. code:: python
-
- from base64 import encodebytes, decodebytes
-
- from requests import post
-
- with open('/path/to/wav/file/with/speech', 'rb') as fin:
- input_speech = fin.read()
-
- input_ascii = encodebytes(input_speech).decode('ascii')
-
- resp = post('http://127.0.0.1:5000/model', json={"speech_in_encoded": [input_ascii]})
- text, generated_speech_ascii = resp.json()[0]
- generated_speech = decodebytes(generated_speech_ascii.encode())
-
- with open('/path/where/to/save/generated/wav/file', 'wb') as fout:
- fout.write(generated_speech)
-
-    print(f'Speech transcription is: {text}')
-
-.. warning::
-    NeMo library v0.10.0 doesn't allow inference with batches larger than one without a compatible NVIDIA GPU.
-
-Model training
---------------
-
-To get your own pre-trained checkpoints for NeMo modules, see the `Speech recognition `_
-and `Speech Synthesis `_ tutorials. The list of pre-trained models can be found
-`here `_.
\ No newline at end of file
diff --git a/docs/features/models/ner.rst b/docs/features/models/ner.rst
index 3663bb84ef..b096a00a3e 100644
--- a/docs/features/models/ner.rst
+++ b/docs/features/models/ner.rst
@@ -34,7 +34,7 @@ Here is the list of all available configs:
+------------------------------------------------------------------------+--------------------+----------+-----------------+------------+------------+
| Model | Dataset | Language | Embeddings Size | Model Size | F1 score |
+========================================================================+====================+==========+=================+============+============+
- | :config:`ner_rus_bert_torch ` | Collection3 [1]_ | Ru | 700 MB | 2.0 GB | **97.7** |
+ | :config:`ner_rus_bert ` | Collection3 [1]_ | Ru | 700 MB | 2.0 GB | **97.7** |
+------------------------------------------------------------------------+ + +-----------------+------------+------------+
| :config:`ner_collection3_m1 ` | | | 1.1 GB | 1 GB | 97.8 |
+------------------------------------------------------------------------+ + +-----------------+------------+------------+
@@ -44,8 +44,6 @@ Here is the list of all available configs:
+------------------------------------------------------------------------+ +----------+-----------------+------------+------------+
| :config:`ner_ontonotes_bert_torch ` | | En | 400 MB | 1.3 GB | 87.9 |
+------------------------------------------------------------------------+ + +-----------------+------------+------------+
- | :config:`ner_ontonotes_m1 ` | | | 347 MB | 379.4 MB | 87.7 |
- +------------------------------------------------------------------------+ + +-----------------+------------+------------+
| :config:`ner_ontonotes ` | | | 331 MB | 7.8 MB | 86.7 |
+------------------------------------------------------------------------+--------------------+ +-----------------+------------+------------+
| :config:`ner_conll2003_bert ` | CoNLL-2003 | | 400 MB | 850 MB | 91.7 |
@@ -55,10 +53,6 @@ Here is the list of all available configs:
| :config:`ner_conll2003 ` | | | 331 MB | 3.1 MB | 89.9 |
+------------------------------------------------------------------------+ + +-----------------+------------+------------+
| :config:`conll2003_m1 ` | | | 339 MB | 359.7 MB | **91.9** |
- +------------------------------------------------------------------------+--------------------+ +-----------------+------------+------------+
- | :config:`ner_dstc2 ` | DSTC2 | | --- | 626 KB | 97.1 |
- +------------------------------------------------------------------------+--------------------+----------+-----------------+------------+------------+
- | :config:`vlsp2016_full ` | VLSP-2016 | Vi | 520 MB | 37.2 MB | 93.4 |
+------------------------------------------------------------------------+--------------------+----------+-----------------+------------+------------+
Models can be used from Python using the following code:
@@ -265,81 +259,6 @@ quality. Typical partition of a dataset into train, validation, and test
are 80%, 10%, 10%, respectively.
-
-Few-shot Language-Model based
------------------------------
-
-It is possible to get a cold-start baseline from just a few samples of labeled data in a couple of seconds. The solution
-is based on a Language Model trained on an open-domain corpus. On top of the LM an SVM classification layer is placed.
-It is possible to start from as few as 10 sentences containing entities of interest.
-
-The data for training this model should be collected in the following way. Given a collection of `N` sentences without
-markup, sequentially mark up sentences until the total number of sentences with the entity of interest becomes equal to
-`K`. During training, both sentences with and without markup are used.
-
-
-Mean chunk-wise F1 scores for the Russian language on 10 sentences with entities:
-
-+---------+-------+
-|PER | 84.85 |
-+---------+-------+
-|LOC | 68.41 |
-+---------+-------+
-|ORG | 32.63 |
-+---------+-------+
-
-(the total number of training sentences is larger and is defined by the distribution of sentences with / without entities).
-
-The model can be trained using CLI:
-
-.. code:: bash
-
- python -m deeppavlov train ner_few_shot_ru
-
-You have to provide the `train.txt`, `valid.txt`, and `test.txt` files in the format described in the `Training data`_
-section. The files must be in the `ner_few_shot_data` folder as described in the `dataset_reader` part of the config
-:config:`ner/ner_few_shot_ru_train.json `.
-
-To train and use the model from Python code, the following snippet can be used:
-
-.. code:: python
-
- from deeppavlov import configs, train_model
-
- ner_model = train_model(configs.ner.ner_few_shot_ru, download=True)
-
- ner_model(['Example sentence'])
-
-Warning! This model can take a lot of time and memory if the number of sentences is greater than 1000!
-
-If a lot of data is available, the few-shot setting can be simulated with a special `dataset_iterator`. For this purpose
-use the config
-:config:`ner/ner_few_shot_ru_train.json `. The following code can be used for this
-simulation:
-
-.. code:: python
-
- from deeppavlov import configs, train_model
-
- ner_model = train_model(configs.ner.ner_few_shot_ru_simulate, download=True)
-
-In this config the `Collection dataset `__ is used. However, if
-there are files `train.txt`, `valid.txt`, and `test.txt` in the `ner_few_shot_data` folder they will be used instead.
-
-
-To use an existing few-shot model, the following Python interface can be used:
-
-.. code:: python
-
- from deeppavlov import configs, build_model
-
- ner_model = build_model(configs.ner.ner_few_shot_ru)
-
- ner_model([['Example', 'sentence']])
- ner_model(['Example sentence'])
-
-
-
NER-based Model for Sentence Boundary Detection Task
----------------------------------------------------
diff --git a/docs/features/models/neural_ranking.rst b/docs/features/models/neural_ranking.rst
index bef692cb83..f6da513281 100644
--- a/docs/features/models/neural_ranking.rst
+++ b/docs/features/models/neural_ranking.rst
@@ -12,104 +12,13 @@ Training and inference models on predifined datasets
BERT Ranking
~~~~~~~~~~~~
-Before using models make sure that all required packages are installed running the command for TensorFlow:
-
-.. code:: bash
-
- python -m deeppavlov install ranking_ubuntu_v2_bert_uncased
- python -m deeppavlov install ranking_ubuntu_v2_bert_sep
- python -m deeppavlov install ranking_ubuntu_v2_bert_sep_interact
-
-or on PyTorch:
+Before using the models make sure that all required packages are installed by running the command:
.. code:: bash
python -m deeppavlov install ranking_ubuntu_v2_torch_bert_uncased
-To train the interaction-based (accurate, slow) model on the `Ubuntu V2`_ from the command line:
-
-::
-
- python -m deeppavlov train ranking_ubuntu_v2_bert_uncased [-d]
-
-To train the representation-based (accurate, fast) model on the `Ubuntu V2`_ from the command line:
-
-::
-
- python -m deeppavlov train ranking_ubuntu_v2_bert_sep [-d]
-
-Further, the trained representation-based model can be run for inference over the provided response base
-(~500K responses in our case) from the command line:
-
-::
-
- python -m deeppavlov interact ranking_ubuntu_v2_bert_sep_interact [-d]
-
-Statistics on the models quality are available :doc:`here `.
-
-Building your own response base for BERT ranking
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-For the BERT-based models we have the following mechanism of building your own response base.
-If you run ``python -m deeppavlov download ranking_ubuntu_v2_bert_sep_interact`` in the console,
-the model with the existing base will be downloaded.
-If you look in the folder where the model with the base is located, you will find four files:
-``contexts.csv``, ``responses.csv``, ``cont_vecs.npy``, ``resp_vecs.npy``.
-These are possible responses with their corresponding contexts (``.csv`` files) and their vector representations (``.npy`` files)
-indexed using the model. Contexts for responses are used as additional features in some modes of the model operation
-(see the attribute ``interact_mode`` in the class :class:`~deeppavlov.models.preprocessors.bert_preprocessor.BertSepRankerPredictorPreprocessor`).
-If you would like to use your own response base you should remove all four files indicated above
-and place your own ``responses.csv`` file in the folder,
-and probably ``contexts.csv`` file depending on the value of the ``interact_mode`` you are planning to use.
-The format of these files is very simple: each line should represent a single response (or context).
-You can use the existing files as an example. The numbers of lines in ``responses.csv`` and ``contexts.csv`` must match exactly.
-Once you have provided these files, you can run the above command in the console.
-As the system will not find vector representations, it will build them first.
-You will see the message ``Building BERT features for the response base...``
-(and probably ``Building BERT features for the context base...``) and then
-``Building BERT vector representations for the response base...``
-(and probably ``Building BERT vector representations for the context base...``).
-After this is done, you will be able to interact with the system.
-The next time you use the model, the built vector representations will be loaded.
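-
-As a minimal sketch (the file name matches the expected ``responses.csv``, the responses
-themselves are made up), a custom base can be written out like this:
-
-.. code:: python
-
-    # one response per line, as required by the response base format
-    responses = ['Try rebooting first.',
-                 'Check the output of fdisk -l.',
-                 'Which Ubuntu version are you on?']
-
-    with open('responses.csv', 'w') as fout:
-        fout.write('\n'.join(responses))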
-
-Ranking
-~~~~~~~
-
-To use Sequential Matching Network (SMN) or Deep Attention Matching Network (DAM) or
-Deep Attention Matching Network with Universal Sentence Encoder (DAM-USE-T)
-on the `Ubuntu V2`_ for inference, please run one of the following commands:
-
-::
-
- python -m deeppavlov interact -d ranking_ubuntu_v2_mt_word2vec_smn
- python -m deeppavlov interact -d ranking_ubuntu_v2_mt_word2vec_dam_transformer
-
-Now a user can enter a dialog consisting of 10 context sentences and several (>=1) candidate response sentences separated by '&'
-and then get the probability that each response is a proper continuation of the dialog:
-
-::
-
- :: & & & & & & & & bonhoeffer whar drives do you want to mount what & i have an ext3 usb drive & look with fdisk -l & hello there & fdisk is all you need
- >> [0.9776373 0.05753616 0.9642599 ]
-
-To train the models on the `Ubuntu V2`_ dataset please run one of the following commands:
-
-::
-
- python -m deeppavlov train -d ranking_ubuntu_v2_mt_word2vec_smn
- python -m deeppavlov train -d ranking_ubuntu_v2_mt_word2vec_dam_transformer
-
-As an example of configuration file see
-:config:`ranking_ubuntu_v2_mt_word2vec_smn.json `.
-
-If the model with multi-turn context is used
-(such as :class:`~deeppavlov.models.ranking.bilstm_gru_siamese_network.BiLSTMGRUSiameseNetwork`
-with the parameter ``num_context_turns`` set to a value higher than 1 in the configuration JSON file)
-then the ``context`` to evaluate should consist of ``num_context_turns`` strings connected by the ampersand.
-Some of these strings can be empty, i.e. equal to ``''``.
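-
-The same can be done from Python; a sketch (the '&'-separated input format follows the
-interact example above):
-
-.. code:: python
-
-    from deeppavlov import build_model, configs
-
-    model = build_model(configs.ranking.ranking_ubuntu_v2_mt_word2vec_smn, download=True)
-
-    # nine empty context turns, one real turn, and a single candidate response
-    query = ' & ' * 9 + 'i have an ext3 usb drive & look with fdisk -l'
-    print(model([query]))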
-
-
Paraphrase identification
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -120,70 +29,16 @@ Before using the model make sure that all required packages are installed runnin
.. code:: bash
- python -m deeppavlov install paraphrase_ident_paraphraser
+ python -m deeppavlov install paraphraser_bert
-To train the model on the `paraphraser.ru`_ dataset with fasttext embeddings one can use the following code in python:
+To train the model on the `paraphraser.ru`_ dataset one can use the following code in Python:
.. code:: python
from deeppavlov import configs, train_model
- para_model = train_model(configs.ranking.paraphrase_ident_paraphraser, download=True)
-
-Training and inference on your own data
----------------------------------------
-
-Ranking
-~~~~~~~
-
-To train the model for ranking on your own data you should write your own :class:`~deeppavlov.core.data.dataset_reader.DatasetReader` component
-or you can use the default :class:`~deeppavlov.dataset_readers.siamese_reader.SiameseReader`. In the latter case, you should provide
-three separate files in the default data format described below:
-
-**train.csv**: each line in the file contains ``context``, ``response`` and ``label`` separated by the tab key. ``label`` can be
-binary, i.e. 1 or 0 corresponding to the correct or incorrect ``response`` for the given ``context``, or it can be a multi-class label.
-In the latter case, each unique ``context`` has a unique class ``label`` and the only correct ``response`` is indicated for each ``context``.
-Currently, all ranking and paraphrase identification models support `cross-entropy loss` training with binary labels.
-Some models, such as :class:`~deeppavlov.models.ranking.bilstm_siamese_network.BiLSTMSiameseNetwork`,
-:class:`~deeppavlov.models.ranking.bilstm_gru_siamese_network.BiLSTMGRUSiameseNetwork`
-and :class:`~deeppavlov.models.ranking.mpm_siamese_network.MPMSiameseNetwork` also support training with `triplet loss`
-(the parameter ``triplet_loss`` should be set to ``true`` for the model in the configuration JSON file in this case),
-which can potentially give a few percent of performance gain over `cross-entropy loss` training.
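-
-For illustration, hypothetical ``train.csv`` rows with binary labels might look as follows
-(``<TAB>`` stands for an actual tab character):
-
-::
-
-    how do i mount a usb drive<TAB>look with fdisk -l first<TAB>1
-    how do i mount a usb drive<TAB>i like turtles<TAB>0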
-
-If the model with multi-turn context is used
-(such as :class:`~deeppavlov.models.ranking.bilstm_gru_siamese_network.BiLSTMGRUSiameseNetwork`
-with the parameter ``num_context_turns`` set to a value higher than 1 in the configuration JSON file)
-then the ``context`` should be specified with ``num_context_turns`` strings separated by the tab key instead of a single string.
-Some of these strings can be empty, i.e. equal to ``''``.
-
-Classification metrics on the train part of the dataset (the parameter ``train_metrics`` in the JSON configuration file),
-such as ``f1``, ``acc`` and ``log_loss``, can be calculated only in the `cross-entropy loss` training mode.
-Both `cross-entropy loss` and `triplet loss` training can output the loss function value returned by
-:meth:`~deeppavlov.models.ranking.siamese_model.SiameseModel.train_on_batch` if the ``log_every_n_batches`` parameter is set to a non-negative value.
-
-
-**valid.csv**, **test.csv**: each line in these files contains ``context``, ``response_1``, ``response_2``, ..., ``response_n``
-separated by the tab key, where ``response_1`` is the correct response for the given ``context`` and the rest ``response_2``, ..., ``response_n``
-are incorrect response candidates. The number of responses `n` in these files should correspond to the
-parameter ``num_ranking_samples`` in the JSON configuration file (see the example row below).
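-
-A hypothetical ``valid.csv`` row with `n` = 3 might look as follows (``<TAB>`` stands for
-an actual tab character; the first response is the correct one):
-
-::
-
-    how do i mount a usb drive<TAB>look with fdisk -l first<TAB>i like turtles<TAB>no idea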
-
-Ranking metrics such as ``r@1``, ``r@2``, ..., ``r@n`` and ``rank_response`` (the parameter ``metrics``
-in the JSON configuration file) can be evaluated on the valid and test parts of the dataset.
-
-As an example of data usage in the default format, please see :config:`ranking_default.json `.
-To train the model with this configuration file in Python:
-
-.. code:: python
-
- from deeppavlov import configs, train_model
-
- rank_model = train_model(configs.ranking.ranking_default, download=True)
-
-To train from command line:
-
-::
+ para_model = train_model('paraphraser_bert', download=True)
- python -m deeppavlov train deeppavlov/configs/ranking/ranking_default.json [-d]
Paraphrase identification
~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/features/models/slot_filling.rst b/docs/features/models/slot_filling.rst
deleted file mode 100644
index 39b6e0c230..0000000000
--- a/docs/features/models/slot_filling.rst
+++ /dev/null
@@ -1,264 +0,0 @@
-Neural Named Entity Recognition and Slot Filling
-================================================
-
-This model solves the Slot-Filling task using Levenshtein search and different neural network architectures for NER.
-To read about NER without slot filling, please refer to the :doc:`NER documentation `.
-This model serves for solving the DSTC 2 Slot-Filling task. In most cases, the NER task can be formulated as:
-
-*Given a sequence of tokens (words, and maybe punctuation symbols)
-provide a tag from a predefined set of tags for each token in the
-sequence.*
-
-For NER task there are some common types of entities used as tags:
-
-- persons
-- locations
-- organizations
-- expressions of time
-- quantities
-- monetary values
-
-Furthermore, to distinguish adjacent entities with the same tag, many
-applications use the BIO tagging scheme. Here "B" denotes the beginning of an
-entity, "I" stands for "inside" and is used for all words comprising the
-entity except the first one, and "O" means the absence of an entity.
-Example with dropped punctuation:
-
-::
-
-    Restaurant O
- in O
- the O
- west B-LOC
- of O
- the O
- city O
- serving O
- modern B-FOOD
- european I-FOOD
- cuisine O
-
-In the example above, ``FOOD`` means food tag, ``LOC`` means location
-tag, and "B-" and "I-" are prefixes identifying beginnings and
-continuations of the entities.
-
-Slot Filling is a typical step after the NER. It can be formulated as:
-
-*Given an entity of a certain type and a set of all possible values of
-this entity type provide a normalized form of the entity.*
-
-In this model, the Slot Filling task is solved by Levenshtein
-Distance search across all known entities of a given type.
-
-For example, there is an entity of "food" type:
-
-*chainese*
-
-It is definitely misspelled. The set of all known food entities is
-{'chinese', 'russian', 'european'}. The nearest known entity from the
-given set is *chinese*. So the output of the Slot Filling system will be
-*chinese*.
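-
-A minimal sketch of this idea in plain Python (a hand-rolled edit distance, not the
-library's actual implementation):
-
-.. code:: python
-
-    def levenshtein(a: str, b: str) -> int:
-        """Classic dynamic-programming edit distance."""
-        prev = list(range(len(b) + 1))
-        for i, ca in enumerate(a, 1):
-            cur = [i]
-            for j, cb in enumerate(b, 1):
-                cur.append(min(prev[j] + 1,                 # deletion
-                               cur[j - 1] + 1,              # insertion
-                               prev[j - 1] + (ca != cb)))   # substitution
-            prev = cur
-        return prev[-1]
-
-    known_food = ['chinese', 'russian', 'european']
-    # the nearest known entity wins: 'chainese' -> 'chinese'
-    print(min(known_food, key=lambda v: levenshtein('chainese', v)))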
-
-Configuration of the model
---------------------------
-
-Configuration of the model can be performed in code or in a JSON configuration file.
-To train the model, you need to specify four groups of parameters:
-
-- ``dataset_reader``
-- ``dataset_iterator``
-- ``chainer``
-- ``train``
-
-In the subsequent text we show the parameter specification in the config
-file. However, the same notation can be used to specify parameters in
-code by replacing the JSON with a Python dictionary.
-
-Dataset Reader
-~~~~~~~~~~~~~~
-
-The dataset reader is a class which reads and parses the data. It
-returns a dictionary with three fields: "train", "test", and "valid".
-The basic dataset reader is "ner\_dataset\_reader". The dataset reader
-config part with "ner\_dataset\_reader" should look like:
-
-::
-
- "dataset_reader": {
- "class_name": "dstc2_datasetreader",
- "data_path": "dstc2"
- }
-
-where ``class_name`` refers to the basic ner dataset reader class and ``data_path``
-is the path to the folder with DSTC 2 dataset.
-
-Dataset Iterator
-~~~~~~~~~~~~~~~~
-
-For simple batching and shuffling you can use "dstc2\_ner\_iterator".
-The part of the configuration file for the dataset iterator looks like:
-``"dataset_iterator": { "class_name": "dstc2_ner_iterator" }``
-
-There are no additional parameters in this part.
-
-Chainer
-~~~~~~~
-
-The chainer part of the configuration file contains the specification of
-the neural network model and supplementary things such as vocabularies.
-The chainer part must have the following form:
-
-::
-
- "chainer": {
- "in": ["x"],
- "in_y": ["y"],
- "pipe": [
- ...
- ],
- "out": ["y_predicted"]
- }
-
-The inputs and outputs must be specified in the pipe. "in" means regular
-input that is used for inference and train mode. "in\_y" is used for
-training and usually contains ground truth answers. The "out" field stands
-for the model prediction. The model inside the pipe must have an output
-variable named "y\_predicted" so that "out" knows where to get
-predictions.
-
-The major part of "chainer" is "pipe". The "pipe" contains the
-pre-processing modules, vocabularies and model. However, we can use
-existing pipelines:
-
-::
-
- "pipe": [
- {
- "in": ["x"],
- "class_name": "lazy_tokenizer",
- "out": ["x"]
- },
- {
- "in": ["x"],
- "config_path": "../deeppavlov/configs/ner/ner_dstc2.json",
- "out": ["tags"]
- },
- ...
- ]
-
-This part will initialize an already existing pre-trained NER module. The
-only thing that needs to be specified is the path to the existing config. The
-preceding lazy tokenizer serves to extract tokens from the raw string of
-text.
-
-The following component in the pipeline is the ``slotfiller``:
-
-::
-
- "pipe": [
- {
- "in": ["x_lower", "tags"],
- "class_name": "dstc_slotfilling",
- "save_path": "slotfill_dstc2/dstc_slot_vals.json",
- "load_path": "slotfill_dstc2/dstc_slot_vals.json",
- "out": ["slots"]
- }
-
-The ``slotfiller`` takes the tags and tokens to perform normalization of
-extracted entities. The normalization is performed via fuzzy Levenshtein
-search in the dstc\_slot\_vals dictionary. The output of this component is
-a dictionary of slot values found in the input utterances.
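-
-For instance, for the utterance "I would like some chinese food" a hypothetical output
-might look like:
-
-::
-
-    {'food': 'chinese'}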
-
-The main part of the ``dstc_slotfilling`` component is the slot values
-dictionary. The dictionary has the following structure:
-
-::
-
- {
- "entity_type_0": {
- "entity_value_0": [
- "entity_value_0_variation_0",
- "entity_value_0_variation_1",
- "entity_value_0_variation_2"
- ],
- "entity_value_1": [
- "entity_value_1_variation_0"
- ],
- ...
- }
- "entity_type_1": {
- ...
-
-The slotfiller will perform fuzzy search through all variations of all
-entity values of the given entity type. The entity type is determined by the
-NER component.
-
-The last part of the config is metadata:
-
-::
-
- "metadata": {
- "variables": {
- "ROOT_PATH": "~/.deeppavlov",
- "NER_CONFIG_PATH": "{DEEPPAVLOV_PATH}/configs/ner/ner_dstc2.json",
- "DATA_PATH": "{ROOT_PATH}/downloads/dstc2",
- "SLOT_VALS_PATH": "{DATA_PATH}/dstc_slot_vals.json",
- "MODELS_PATH": "{ROOT_PATH}/models",
- "MODEL_PATH": "{MODELS_PATH}/slotfill_dstc2"
- },
- "download": [
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/dstc_slot_vals.tar.gz",
- "subdir": "{DATA_PATH}"
- },
- {
- "url": "http://files.deeppavlov.ai/deeppavlov_data/slotfill_dstc2.tar.gz",
- "subdir": "{MODELS_PATH}"
- }
- ]
- }
-
-It contains information for deployment of the model and URLs for
-downloading pre-trained models.
-
-You can see all parts together in ``deeppavlov/configs/ner/slotfill_dstc2.json``.
-
-Usage of the model
-------------------
-
-Please see an example of training a Slot Filling model and using it for
-prediction:
-
-.. code:: python
-
- from deeppavlov import build_model, configs
-
- PIPELINE_CONFIG_PATH = configs.ner.slotfill_dstc2
- slotfill_model = build_model(PIPELINE_CONFIG_PATH, download=True)
- slotfill_model(['I would like some chinese food', 'The west part of the city would be nice'])
-
-This example assumes that the working directory is the root of the
-project.
-
-Slotfilling without NER
------------------------
-
-An alternative approach to the Slot Filling problem could be fuzzy search
-for each instance of each slot value inside the text. This approach is
-realized in the ``slotfill_raw`` component. The component uses a
-needle-in-a-haystack search over the raw text.
-
-The main advantage of this approach is the elimination of a separate Named
-Entity Recognition module. However, the absence of a NER module makes this
-model less robust to noise (words with similar spelling), especially for
-long utterances.
-
-Usage example:
-
-.. code:: python
-
- from deeppavlov import build_model, configs
-
- PIPELINE_CONFIG_PATH = configs.ner.slotfill_dstc2_raw
- slotfill_model = build_model(PIPELINE_CONFIG_PATH, download=True)
- slotfill_model(['I would like some chinese food', 'The west part of the city would be nice'])
diff --git a/docs/features/models/spelling_correction.rst b/docs/features/models/spelling_correction.rst
index e5d16ba3db..43827a506b 100644
--- a/docs/features/models/spelling_correction.rst
+++ b/docs/features/models/spelling_correction.rst
@@ -49,7 +49,7 @@ lines to stdout:
from deeppavlov import build_model, configs
- CONFIG_PATH = configs.spelling_correction.brillmoore_kartaslov_ru
+ CONFIG_PATH = configs.spelling_correction.levenshtein_corrector_ru
model = build_model(CONFIG_PATH, download=True)
for line in sys.stdin:
@@ -185,14 +185,9 @@ on Automatic Spelling Correction for Russian:
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| :config:`Damerau Levenshtein 1 + lm` | 59.38 | 53.44 | 56.25 | 39.3 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
-| :config:`Brill Moore top 4 + lm` | 51.92 | 53.94 | 52.91 | 0.6 |
-+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| Hunspell + lm | 41.03 | 48.89 | 44.61 | 2.1 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| JamSpell | 44.57 | 35.69 | 39.64 | 136.2 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
-| :config:`Brill Moore top 1 ` | 41.29 | 37.26 | 39.17 | 2.4 |
-+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| Hunspell | 30.30 | 34.02 | 32.06 | 20.3 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
-
diff --git a/docs/features/models/squad.rst b/docs/features/models/squad.rst
index 128ea07627..e4ce9c7136 100644
--- a/docs/features/models/squad.rst
+++ b/docs/features/models/squad.rst
@@ -225,29 +225,8 @@ Link to SDSJ Task B dataset: http://files.deeppavlov.ai/datasets/sber_squad-v1.1
+========================================================================+================+=================+
| :config:`DeepPavlov RuBERT ` | 66.30+-0.24 | 84.60+-0.11 |
+------------------------------------------------------------------------+----------------+-----------------+
-| :config:`DeepPavlov multilingual BERT `| 64.35+-0.39 | 83.39+-0.08 |
+| :config:`DeepPavlov multilingual BERT `| 66.24 | 84.71 |
+------------------------------------------------------------------------+----------------+-----------------+
| :config:`DeepPavlov R-Net ` | 60.62 | 80.04 |
+------------------------------------------------------------------------+----------------+-----------------+
-
-DRCD
-~~~~~~~~~~~
-
-Pretrained models are available and can be downloaded:
-
-.. code:: bash
-
- python -m deeppavlov download deeppavlov/configs/squad/squad_zh_bert.json
- python -m deeppavlov download deeppavlov/configs/squad/squad_zh_zh_bert.json
-
-Link to DRCD dataset: http://files.deeppavlov.ai/datasets/DRCD.tar.gz
-Link to DRCD paper: https://arxiv.org/abs/1806.00920
-
-+------------------------------------------------------------------------+----------------+-----------------+
-| Model config | EM (dev) | F-1 (dev) |
-+========================================================================+================+=================+
-| :config:`DeepPavlov ChineseBERT ` | 84.19 | 89.23 |
-+------------------------------------------------------------------------+----------------+-----------------+
-| :config:`DeepPavlov multilingual BERT ` | 84.86 | 89.03 |
-+------------------------------------------------------------------------+----------------+-----------------+
diff --git a/docs/features/models/tfidf_ranking.rst b/docs/features/models/tfidf_ranking.rst
index d594b16b12..699c0d1b22 100644
--- a/docs/features/models/tfidf_ranking.rst
+++ b/docs/features/models/tfidf_ranking.rst
@@ -161,9 +161,7 @@ Scores for **TF-IDF Ranker** model:
| Model | Dataset | Recall@5 |
+------------------------------------------------------------------------------+----------------+-----------------+
| :config:`enwiki20180211 `                   | SQuAD (dev)    | 75.6            |
-+------------------------------------------------------------------------------+ +-----------------+
-| :config:`enwiki20161221 ` | SQuAD (dev) | 76.2 |
-+------------------------------------------------------------------------------+ +-----------------+
++------------------------------------------------------------------------------+                +-----------------+
| `DrQA`_ enwiki20161221 | | 77.8 |
+------------------------------------------------------------------------------+----------------+-----------------+
diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index 7a515a40cb..88e8947eb1 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -23,7 +23,7 @@ which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01
+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
| Dataset | Lang | Model | Test F1 |
+=========================================================+=======+============================================================================================+=============+
-| Persons-1000 dataset with additional LOC and ORG markup | Ru | :config:`ner_rus_bert.json ` | 98.1 |
+| Persons-1000 dataset with additional LOC and ORG markup | Ru | :config:`ner_rus_bert.json ` | 97.7 |
+ + +--------------------------------------------------------------------------------------------+-------------+
| (Collection 3) | | :config:`ner_rus.json ` | 95.1 |
+ + +--------------------------------------------------------------------------------------------+-------------+
@@ -42,23 +42,8 @@ which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01
| | | :config:`ner_conll2003_torch_bert.json ` | 88.6 |
+ + +--------------------------------------------------------------------------------------------+-------------+
| | | :config:`ner_conll2003.json ` | 89.9 |
-+---------------------------------------------------------+ +--------------------------------------------------------------------------------------------+-------------+
-| DSTC2 | | :config:`ner_dstc2.json ` | 97.1 |
+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
-Slot filling models :doc:`[docs] `
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Based on fuzzy Levenshtein search to extract normalized slot values from text. The models either rely on NER results
-or perform needle in haystack search.
-
-+---------------------------------------------------------------------------------------------------------------------------+------------------+
-| Dataset | Slots Accuracy |
-+===========================================================================================================================+==================+
-| :config:`DSTC 2 ` | 98.85 |
-+---------------------------------------------------------------------------------------------------------------------------+------------------+
-
-
Classification model :doc:`[docs] `
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -70,66 +55,21 @@ Several pre-trained models are available and presented in Table below.
+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
| Task | Dataset | Lang | Model | Metric | Valid | Test | Downloads |
+==================+=====================+======+====================================================================================================+=============+==================+=================+===========+
-| 28 intents | `DSTC 2`_ | En | :config:`DSTC 2 emb ` | Accuracy | 0.7613 | 0.7733 | 800 Mb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`Wiki emb ` | | 0.9629 | 0.9617 | 8.5 Gb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`BERT ` | | 0.9673 | 0.9636 | 800 Mb |
-+------------------+---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| 7 intents | `SNIPS-2017`_ [1]_ | | :config:`DSTC 2 emb ` | F1-macro | 0.8591 | -- | 800 Mb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`Wiki emb ` | | 0.9820 | -- | 8.5 Gb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`Tfidf + SelectKBest + PCA + Wiki emb ` | | 0.9673 | -- | 8.6 Gb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`Wiki emb weighted by Tfidf ` | | 0.9786 | -- | 8.5 Gb |
-+------------------+---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Insult detection | `Insults`_ | | :config:`Reddit emb ` | ROC-AUC | 0.9263 | 0.8556 | 6.2 Gb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`English BERT ` | | 0.9255 | 0.8612 | 1200 Mb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`English Conversational BERT ` | | 0.9389 | 0.8941 | 1200 Mb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`English BERT on PyTorch ` | | 0.9329 | 0.877 | 1.1 Gb |
-+------------------+---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| 5 topics | `AG News`_ | | :config:`Wiki emb ` | Accuracy | 0.8922 | 0.9059 | 8.5 Gb |
-+------------------+---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Intent | `Yahoo-L31`_ | | :config:`Yahoo-L31 on conversational BERT ` | ROC-AUC | 0.9436 | -- | 1200 Mb |
+| Insult detection | `Insults`_ | En | :config:`English BERT` | ROC-AUC | 0.9327 | 0.8602 | 1.1 Gb |
+------------------+---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
| Sentiment | `SST`_ | | :config:`5-classes SST on conversational BERT ` | Accuracy | 0.6456 | 0.6715 | 400 Mb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`5-classes SST on multilingual BERT ` | | 0.5738 | 0.6024 | 660 Mb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`3-classes SST SWCNN on PyTorch ` | | 0.7379 | 0.6312 | 4.3 Mb |
-+ +---------------------+ +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | `Yelp`_ | | :config:`5-classes Yelp on conversational BERT ` | | 0.6925 | 0.6842 | 400 Mb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`5-classes Yelp on multilingual BERT ` | | 0.5896 | 0.5874 | 660 Mb |
+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
| Sentiment | `Twitter mokoron`_ | Ru | :config:`RuWiki+Lenta emb w/o preprocessing ` | | 0.9965 | 0.9961 | 6.2 Gb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`RuWiki+Lenta emb with preprocessing ` | | 0.7823 | 0.7759 | 6.2 Gb |
+ +---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| | `RuSentiment`_ | | :config:`RuWiki+Lenta emb ` | F1-weighted | 0.6541 | 0.7016 | 6.2 Gb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`Twitter emb super-convergence ` [2]_ | | 0.7301 | 0.7576 | 3.4 Gb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`ELMo ` | | 0.7519 | 0.7875 | 700 Mb |
-+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
-| | | | :config:`Multi-language BERT ` | | 0.6809 | 0.7193 | 1900 Mb |
+| | `RuSentiment`_ | | :config:`Multi-language BERT ` | F1-weighted | 0.6809 | 0.7193 | 1900 Mb |
+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
| | | | :config:`Conversational RuBERT ` | | 0.7548 | 0.7742 | 657 Mb |
+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
| | | | :config:`Conversational DistilRuBERT-tiny ` | | 0.703 ± 0.0031 | 0.7348 ± 0.0028 | 690 Mb |
+ + + +----------------------------------------------------------------------------------------------------+ +------------------+-----------------+-----------+
| | | | :config:`Conversational DistilRuBERT-base ` | | 0.7376 ± 0.0045 | 0.7645 ± 0.035 | 1.0 Gb |
-+------------------+---------------------+ +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Intent | Ru like`Yahoo-L31`_ | | :config:`Conversational vs Informational on ELMo ` | ROC-AUC | 0.9412 | -- | 700 Mb |
+------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-.. [1] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018.
-.. [2] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018.
-
.. _`DSTC 2`: http://camdial.org/~mh521/dstc/
.. _`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines
.. _`Insults`: https://www.kaggle.com/c/detecting-insults-in-social-commentary
@@ -139,7 +79,6 @@ Several pre-trained models are available and presented in Table below.
.. _`Yahoo-L31`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l
.. _`Yahoo-L6`: https://webscope.sandbox.yahoo.com/catalog.php?datatype=l
.. _`SST`: https://nlp.stanford.edu/sentiment/index.html
-.. _`Yelp`: https://www.yelp.com/dataset
As no one had published intent recognition for DSTC-2 data, the
comparison of the presented model is given on **SNIPS** dataset. The
@@ -192,14 +131,10 @@ on Automatic Spelling Correction for Russian:
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| :config:`Damerau Levenshtein 1 + lm` | 53.26 | 53.74 | 53.50 | 29.3 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
-| :config:`Brill Moore top 4 + lm` | 51.92 | 53.94 | 52.91 | 0.6 |
-+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| Hunspell + lm | 41.03 | 48.89 | 44.61 | 2.1 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| JamSpell | 44.57 | 35.69 | 39.64 | 136.2 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
-| :config:`Brill Moore top 1 ` | 41.29 | 37.26 | 39.17 | 2.4 |
-+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
| Hunspell | 30.30 | 34.02 | 32.06 | 20.3 |
+-----------------------------------------------------------------------------------------+-----------+--------+-----------+---------------------+
@@ -208,48 +143,6 @@ on Automatic Spelling Correction for Russian:
Ranking model :doc:`[docs] `
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The main neural ranking model is based on `LSTM-based deep learning models for non-factoid answer selection
-`__. The model ranks responses or contexts from some database by their
-relevance for the given context.
-
-There are 3 alternative neural architectures available as well:
-
-Sequential Matching Network (SMN)
- Based on the work `Wu, Yu, et al. "Sequential Matching Network: A New Architecture for Multi-turn Response Selection in Retrieval-based Chatbots". ACL. 2017. `__
-
-Deep Attention Matching Network (DAM)
- Based on the work `Xiangyang Zhou, et al. "Multi-Turn Response Selection for Chatbots with Deep Attention Matching Network". Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2018 `__
-
-Deep Attention Matching Network + Universal Sentence Encoder v3 (DAM-USE-T)
- Our newly proposed architecture is based on the works: `Xiangyang Zhou, et al. "Multi-Turn Response Selection for Chatbots with Deep Attention Matching Network". Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2018 `__
- and `Daniel Cer, Yinfei Yang, Sheng-yi Kong, Nan Hua, Nicole Limtiaco, Rhomni St. John, Noah Constant, Mario Guajardo-Cespedes, Steve Yuan, Chris Tar, Brian Strope, Ray Kurzweil. 2018a. Universal Sentence Encoder for English. `__
-
-
-Available pre-trained models for ranking:
-
-.. table::
- :widths: auto
-
- +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-----------------------------------+
- | Dataset | Model config | Val | Test |
- | | +-----------+-------+-------+-------+-----------+
- | | | R10@1 | R10@1 | R10@2 | R10@5 | Downloads |
- +===================+======================================================================================================================+===========+=======+=======+=======+===========+
- | `Ubuntu V2`_ | :config:`ranking_ubuntu_v2_mt_word2vec_dam_transformer ` | 74.32 | 74.46 | 86.77 | 97.38 | 2457 MB |
- +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
- | `Ubuntu V2`_ | :config:`ranking_ubuntu_v2_mt_word2vec_smn ` | 68.56 | 67.91 | 81.49 | 95.63 | 1609 MB |
- +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
- | `Ubuntu V2`_ | :config:`ranking_ubuntu_v2_bert_uncased ` | 66.5 | 66.6 | -- | -- | 396 MB |
- +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
- | `Ubuntu V2`_ | :config:`ranking_ubuntu_v2_bert_uncased on PyTorch ` | 65.73 | 65.74 | -- | -- | 1.1 Gb |
- +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
- | `Ubuntu V2`_ | :config:`ranking_ubuntu_v2_bert_sep ` | 66.5 | 66.5 | -- | -- | 396 MB |
- +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
- | `Ubuntu V2`_ | :config:`ranking_ubuntu_v2_mt_interact ` | 59.2 | 58.7 | -- | -- | 8906 MB |
- +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
-
-.. _`Ubuntu V2`: https://github.com/rkadlec/ubuntu-ranking-dataset-creator
-
Available pre-trained models for paraphrase identification:
.. table::
@@ -258,8 +151,6 @@ Available pre-trained models for paraphrase identification:
+------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
| Dataset | Model config | Val (accuracy) | Test (accuracy) | Val (F1) | Test (F1) | Val (log_loss) | Test (log_loss) | Downloads |
+========================+======================================================================================================+================+=================+============+============+================+=================+===========+
- | `paraphraser.ru`_ | :config:`paraphrase_ident_paraphraser_ft ` | 83.8 | 75.4 | 87.9 | 80.9 | 0.468 | 0.616 | 5938M |
- +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
| `paraphraser.ru`_ | :config:`paraphrase_bert_multilingual ` | 87.4 | 79.3 | 90.2 | 83.4 | -- | -- | 1330M |
+------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
| `paraphraser.ru`_ | :config:`paraphrase_rubert ` | 90.2 | 84.9 | 92.3 | 87.9 | -- | -- | 1325M |
@@ -272,27 +163,6 @@ Available pre-trained models for paraphrase identification:
.. _`paraphraser.ru`: https://paraphraser.ru/
-Comparison with other models on the `Ubuntu Dialogue Corpus v2 `__ (test):
-
-+---------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+-----------+
-| Model | R@1 | R@2 | R@5 |
-+=============================================================================================================================================+===========+===========+===========+
-| SMN last [`Wu et al., 2017 `_] | -- | -- | -- |
-+---------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+-----------+
-| SMN last [DeepPavlov :config:`ranking_ubuntu_v2_mt_word2vec_smn `] | 0.6791 | 0.8149 | 0.9563 |
-+---------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+-----------+
-| DAM [`Zhou et al., 2018 `_] | -- | -- | -- |
-+---------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+-----------+
-| MRFN-FLS [`Tao et al., 2019 `_] | -- | -- | -- |
-+---------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+-----------+
-| IMN [`Gu et al., 2019 `_] | 0.771 | 0.886 | 0.979 |
-+---------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+-----------+
-| IMN Ensemble [`Gu et al., 2019 `_] | **0.791** | **0.899** | **0.982** |
-+---------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+-----------+
-| DAM-USE-T [DeepPavlov :config:`ranking_ubuntu_v2_mt_word2vec_dam_transformer `] | 0.7446 | 0.8677 | 0.9738 |
-+---------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+-----------+
-
-
References:
* Yu Wu, Wei Wu, Ming Zhou, and Zhoujun Li. 2017. Sequential match network: A new architecture for multi-turn response selection in retrieval-based chatbots. In ACL, pages 372–381. https://www.aclweb.org/anthology/P17-1046
@@ -340,9 +210,9 @@ R-Net model is based on `R-NET: Machine Reading Comprehension with Self-matching
+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
| `SQuAD-v1.1`_ | :config:`DeepPavlov R-Net ` | en | 71.49 | 80.34 | ~2.5Gb |
+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SDSJ Task B`_ | :config:`DeepPavlov RuBERT ` | ru | 66.30 ± 0.24 | 84.60 ± 0.11 | 1325Mb |
+| `SDSJ Task B`_ | :config:`DeepPavlov RuBERT ` | ru | 66.30 ± 0.24 | 84.60 ± 0.11 | 1325Mb |
+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SDSJ Task B`_ | :config:`DeepPavlov multilingual BERT ` | ru | 64.35 ± 0.39 | 83.39 ± 0.08 | 1323Mb |
+| `SDSJ Task B`_ | :config:`DeepPavlov multilingual BERT ` | ru | 66.24 | 84.71 | 1.6 Gb |
+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
| `SDSJ Task B`_ | :config:`DeepPavlov R-Net ` | ru | 60.62 | 80.04 | ~5Gb |
+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
@@ -350,10 +220,6 @@ R-Net model is based on `R-NET: Machine Reading Comprehension with Self-matching
+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base ` | ru | 61.23 ± 0.42 | 80.36 ± 0.28 | 1.18Gb |
+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `DRCD`_ | :config:`DeepPavlov multilingual BERT ` | ch | 84.86 | 89.03 | 630Mb |
-+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `DRCD`_ | :config:`DeepPavlov Chinese BERT ` | ch | 84.19 | 89.23 | 362Mb |
-+----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
For the case when the answer is not necessarily present in the given context, we have the :config:`squad_noans `
model. It outputs an empty string if the context contains no answer.
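
As a quick sketch of calling a reading-comprehension model from Python (assuming the RuBERT config from the
table above is named ``squad_ru_bert``; contexts and questions are passed as parallel lists):

.. code:: python

    from deeppavlov import build_model

    # 'squad_ru_bert' is an assumed config name; see the table above for available models
    model = build_model('squad_ru_bert', download=True)
    contexts = ['DeepPavlov — библиотека для создания диалоговых систем.']
    questions = ['Что такое DeepPavlov?']
    print(model(contexts, questions))  # answer span, its position and a confidence score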
@@ -362,14 +228,9 @@ model. This model outputs empty string in case if there is no answer in context.
Morphological tagging model :doc:`[docs] `
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-We have a BERT-based model for Russian and character-based models for 11 languages.
-The character model is based on `Heigold et al., 2017. An extensive empirical evaluation of
-character-based morphological tagging for 14 languages `__.
-It is a state-of-the-art model for Russian and near state of the art for several other languages.
+We have a BERT-based model for the Russian language.
The model takes tokenized sentences as input and outputs the corresponding
sequence of morphological labels in `UD format `__.
-The table below contains word and sentence accuracy on UD2.0 datasets.
-For more scores see :doc:`full table `.
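
A minimal tagging sketch (``morpho_ru_syntagrus_bert`` is an assumed config name for the BERT-based model
scored in the table below):

.. code:: python

    from deeppavlov import build_model

    # config name is an assumption; input is tokenized sentences,
    # output is one UD-format morphological tag per token
    model = build_model('morpho_ru_syntagrus_bert', download=True)
    print(model([['Мама', 'мыла', 'раму']]))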
.. table::
:widths: auto
@@ -383,32 +244,8 @@ For more scores see :doc:`full table `.
| +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
| | :config:`BERT-based model ` | 97.83 | 72.02 | 661 |
+----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | | `Pymorphy`_ + `russian_tagsets`_ (first tag) | 60.93 | 0.00 | |
- + +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | `UD2.0`_ (Russian) | `UD Pipe 1.2`_ (Straka et al., 2017) | 93.57 | 43.04 | |
- + +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | | :config:`Basic model ` | 95.17 | 50.58 | 48.7 |
- + +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | | :config:`Pymorphy-enhanced model ` | **96.23** | 58.00 | 48.7 |
- +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | `UD2.0`_ (Czech) | `UD Pipe 1.2`_ (Straka et al., 2017) | 91.86 | 42.28 | |
- | +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | | :config:`Basic model ` | **94.35** | 51.56 | 41.8 |
- +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | `UD2.0`_ (English) | `UD Pipe 1.2`_ (Straka et al., 2017) | 92.89 | 55.75 | |
- | +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | | :config:`Basic model ` | **93.00** | 55.18 | 16.9 |
- +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | `UD2.0`_ (German) | `UD Pipe 1.2`_ (Straka et al., 2017) | 76.65 | 10.24 | |
- | +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
- | | :config:`Basic model ` | **83.83** | 15.25 | 18.6 |
- +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-.. _`Pymorphy`: https://pymorphy2.readthedocs.io/en/latest/
-.. _`russian_tagsets`: https://github.com/kmike/russian-tagsets
-.. _`UD2.0`: https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1983
.. _`UD2.3`: http://hdl.handle.net/11234/1-2895
-.. _`UD Pipe 1.2`: http://ufal.mff.cuni.cz/udpipe
.. _`UD Pipe 2.3`: http://ufal.mff.cuni.cz/udpipe
.. _`UD Pipe Future`: https://github.com/CoNLL-UD-2018/UDPipe-Future
@@ -452,33 +289,12 @@ Skills
Goal-oriented bot :doc:`[docs] `
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Based on Hybrid Code Networks (HCNs) architecture from `Jason D. Williams, Kavosh Asadi,
-Geoffrey Zweig, Hybrid Code Networks: practical and efficient end-to-end dialog control
+Based on Hybrid Code Networks (HCNs) architecture from `Jason D. Williams, Kavosh Asadi,
+Geoffrey Zweig, Hybrid Code Networks: practical and efficient end-to-end dialog control
with supervised and reinforcement learning – 2017 `__.
It allows predicting responses in a goal-oriented dialog. The model is
customizable: embeddings, slot filler and intent classifier can be switched on and off on demand.
-Available pre-trained models and their comparison with existing benchmarks:
-
-+-----------------------------------+------+------------------------------------------------------------------------------------+---------------+-----------+---------------+
-| Dataset | Lang | Model | Metric | Test | Downloads |
-+===================================+======+====================================================================================+===============+===========+===============+
-| `DSTC 2`_ | En | :config:`basic bot ` | Turn Accuracy | 0.380 | 10 Mb |
-+ (:ref:`modified `) + +------------------------------------------------------------------------------------+ +-----------+---------------+
-| | | :config:`bot with slot filler ` | | 0.542 | 400 Mb |
-+ + +------------------------------------------------------------------------------------+ +-----------+---------------+
-| | | :config:`bot with slot filler, intents & attention ` | | **0.553** | 8.5 Gb |
-+-----------------------------------+ +------------------------------------------------------------------------------------+ +-----------+---------------+
-| `DSTC 2`_ | | Bordes and Weston (2016) | | 0.411 | -- |
-+ + +------------------------------------------------------------------------------------+ +-----------+---------------+
-| | | Eric and Manning (2017) | | 0.480 | -- |
-+ + +------------------------------------------------------------------------------------+ +-----------+---------------+
-| | | Perez and Liu (2016) | | 0.487 | -- |
-+ + +------------------------------------------------------------------------------------+ +-----------+---------------+
-| | | Williams et al. (2017) | | **0.556** | -- |
-+-----------------------------------+------+------------------------------------------------------------------------------------+---------------+-----------+---------------+
-
-
ODQA :doc:`[docs] `
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -491,8 +307,6 @@ based on its Wikipedia knowledge.
+================+====================================================================+=======================+========+===========+
| `SQuAD-v1.1`_ | :config:`ODQA ` | enwiki (2018-02-11) | 35.89 | 9.7Gb |
+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
-| `SQuAD-v1.1`_ | :config:`ODQA ` | enwiki (2016-12-21) | 37.83 | 9.3Gb |
-+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
| `SDSJ Task B`_ | :config:`ODQA ` | ruwiki (2018-04-01) | 28.56 | 7.7Gb |
+----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
| `SDSJ Task B`_ | :config:`ODQA with RuBERT ` | ruwiki (2018-04-01) | 37.83 | 4.3Gb |
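
A hedged usage sketch (``en_odqa_infer_wiki`` is an assumed config name for the English ODQA model; note that
the first run downloads the model together with the multi-gigabyte Wikipedia index listed above):

.. code:: python

    from deeppavlov import build_model

    # config name and download size are assumptions based on the table above
    odqa = build_model('en_odqa_infer_wiki', download=True)
    print(odqa(['Where did guinea pigs originate?']))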
@@ -522,47 +336,29 @@ Word vectors for the Russian language trained on joint `Russian Wikipedia
-
-- Run goal-oriented bot with console interface:
-
- .. code-block:: bash
-
- python -m deeppavlov interact gobot_dstc2 -d
-
-- Run goal-oriented bot with REST API:
-
- .. code-block:: bash
-
- python -m deeppavlov riseapi gobot_dstc2 -d
-
-- Run slot-filling model with Telegram interface:
+- Run insults detection model with Telegram interface:
.. code-block:: bash
- python -m deeppavlov telegram slotfill_dstc2 -d -t
+ python -m deeppavlov telegram insults_kaggle_bert -d -t
-- Run slot-filling model with console interface:
+- Run insults detection model with console interface:
.. code-block:: bash
- python -m deeppavlov interact slotfill_dstc2 -d
+ python -m deeppavlov interact insults_kaggle_bert -d
-- Run slot-filling model with REST API:
+- Run insults detection model with REST API:
.. code-block:: bash
- python -m deeppavlov riseapi slotfill_dstc2 -d
+ python -m deeppavlov riseapi insults_kaggle_bert -d
-- Predict intents on every line in a file:
+- Predict whether it is an insult on every line in a file:
.. code-block:: bash
- python -m deeppavlov predict intents_snips -d --batch-size 15 < /data/in.txt > /data/out.txt
+ python -m deeppavlov predict insults_kaggle_bert -d --batch-size 15 < /data/in.txt > /data/out.txt
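
- Use the same model from Python (a minimal sketch; ``download=True`` mirrors the ``-d`` flag above):

  .. code:: python

      from deeppavlov import build_model

      model = build_model('insults_kaggle_bert', download=True)
      print(model(['you are stupid', 'have a nice day']))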
View `video demo `__ of deployment of a
@@ -571,4 +367,3 @@ goal-oriented bot and a slot-filling model with Telegram UI.
.. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250
.. _`SDSJ Task B`: https://arxiv.org/abs/1912.09723
-.. _`DRCD`: https://arxiv.org/abs/1806.00920
diff --git a/docs/features/pretrained_vectors.rst b/docs/features/pretrained_vectors.rst
index ee8d6d01e0..f0d410c13d 100644
--- a/docs/features/pretrained_vectors.rst
+++ b/docs/features/pretrained_vectors.rst
@@ -28,58 +28,72 @@ The ``TensorFlow`` models can be run with the original `BERT repo `__ library.
The download links are:
-+----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Description | Model parameters | Download links |
-+============================+=======================================+====================================================================================================================+
-| RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
-| | size = 632MB | `[pytorch] `__ |
-+----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Slavic BERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
-| | size = 632MB | `[pytorch] `__ |
-+----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Conversational BERT | vocab size = 30K, parameters = 110M, | `[tensorflow] `__, |
-| | size = 385MB | `[pytorch] `__ |
-+----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Conversational RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
-| | size = 630MB | `[pytorch] `__ |
-+----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Sentence Multilingual BERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
-| | size = 630MB | `[pytorch] `__ |
-+----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Sentence RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
-| | size = 630MB | `[pytorch] `__ |
-+----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+
++----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| Description | Model parameters | Download links |
++============================+=======================================+======================================================================================================================+
+| RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
+| | size = 632MB | `[pytorch] `__ |
++----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| Slavic BERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
+| | size = 632MB | `[pytorch] `__ |
++----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| Conversational BERT | vocab size = 30K, parameters = 110M, | `[tensorflow] `__, |
+| | size = 385MB | `[pytorch] `__ |
++----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| Conversational RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
+| | size = 630MB | `[pytorch] `__ |
++----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| Sentence Multilingual BERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
+| | size = 630MB | `[pytorch] `__ |
++----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| Sentence RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, |
+| | size = 630MB | `[pytorch] `__ |
++----------------------------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------+
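
The PyTorch checkpoints can also be loaded with the ``transformers`` library; a sketch, assuming the
checkpoints are mirrored on the Hugging Face hub under the ``DeepPavlov`` organization:

.. code:: python

    from transformers import AutoModel, AutoTokenizer

    # model id is an assumption: RuBERT mirrored on the Hugging Face hub
    tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
    model = AutoModel.from_pretrained('DeepPavlov/rubert-base-cased')

    inputs = tokenizer('Привет, мир!', return_tensors='pt')
    outputs = model(**inputs)  # outputs.last_hidden_state: (1, seq_len, 768)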
ELMo
----
-| We are publishing :class:`Russian language ELMo embeddings model ` for tensorflow-hub and :class:`LM model ` for training and fine-tuning ELMo as LM model.
-| ELMo (Embeddings from Language Models) representations are pre-trained contextual representations from
- large-scale bidirectional language models. See a paper `Deep contextualized word representations
- `__ for more information about the algorithm and a detailed analysis.
+The ELMo model can be used from Python code as follows:
-License
-~~~~~~~
+.. code:: python
+
+ import tensorflow as tf
+ import tensorflow_hub as hub
+
+ # load the pre-trained Russian-news ELMo module from the DeepPavlov file server
+ elmo = hub.Module("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz", trainable=True)
+ sess = tf.Session()
+ sess.run(tf.global_variables_initializer())
+ sess.run(tf.tables_initializer())  # the module's string-lookup tables need initialization too
+ # the "default" signature accepts plain (untokenized) strings
+ embeddings = elmo(["это предложение", "word"], signature="default", as_dict=True)["elmo"]
+ sess.run(embeddings)
+
+
+The TensorFlow Hub module also supports tokenized sentences in the following format:
+
+.. code:: python
+
+ # shorter sentences are padded with empty strings; tokens_length holds the true lengths
+ tokens_input = [["мама", "мыла", "раму"], ["рама", "", ""]]
+ tokens_length = [3, 1]
+ embeddings = elmo(inputs={"tokens": tokens_input, "sequence_len": tokens_length}, signature="tokens", as_dict=True)["elmo"]
+ sess.run(embeddings)
-The pre-trained models are distributed under the `License Apache
-2.0 `__.
Downloads
~~~~~~~~~
-The models can be downloaded and run by configuration file or tensorflow hub module from:
+The models can be downloaded and run as TensorFlow Hub modules from:
+
+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Description | Dataset parameters | Perplexity | Configuration file and tensorflow hub module |
+| Description | Dataset parameters | Perplexity | Tensorflow hub module |
+====================================================================+=============================================+==================+=======================================================================================================================================================================================================================================+
-| ELMo on `Russian Wikipedia `__ | lines = 1M, tokens = 386M, size = 5GB | 43.692 | `config_file `__, `module_spec `__ |
+| ELMo on `Russian Wikipedia `__ | lines = 1M, tokens = 386M, size = 5GB | 43.692 | `module_spec `__ |
+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| ELMo on `Russian WMT News `__ | lines = 63M, tokens = 946M, size = 12GB | 49.876 | `config_file `__, `module_spec `__ |
+| ELMo on `Russian WMT News `__ | lines = 63M, tokens = 946M, size = 12GB | 49.876 | `module_spec `__ |
+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| ELMo on `Russian Twitter `__ | lines = 104M, tokens = 810M, size = 8.5GB | 94.145 | `config_file `__, `module_spec `__ |
+| ELMo on `Russian Twitter `__ | lines = 104M, tokens = 810M, size = 8.5GB | 94.145 | `module_spec `__ |
+--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
fastText
--------
diff --git a/docs/features/skills/go_bot.rst b/docs/features/skills/go_bot.rst
index e585ab8e55..7a3e81ccaa 100644
--- a/docs/features/skills/go_bot.rst
+++ b/docs/features/skills/go_bot.rst
@@ -4,13 +4,18 @@ Go-Bot Framework
Overview
********
-Go-Bot is an ML-driven framework designed to enable development of the goal-oriented skills for DeepPavlov Dream AI Assistant Platform.
+Go-Bot is an ML-driven framework designed to enable development of goal-oriented skills for
+the DeepPavlov Dream AI Assistant Platform.
-These goal-oriented skills can be written in Python (enabling using their corresponding Go-Bot-trained models natively) or in any other programming language (requiring running their corresponding Go-Bot-trained models as microservices).
+These goal-oriented skills can be written in Python (using their corresponding Go-Bot-trained models natively)
+or in any other programming language (running their corresponding Go-Bot-trained models as microservices).
-To build a Go-Bot-based goal-oriented skill, you need to provide Go-Bot framework with a dataset (in RASA v1 or DSTC2 formats), train model, download it, and then use it by either calling them natively from Python or by rising them as microservices and then calling them via its standard DeepPavlov REST API.
+To build a Go-Bot-based goal-oriented skill, you need to provide the Go-Bot framework with a dataset (in RASA v1 format),
+train a model, download it, and then use it either by calling it natively from Python or by running it as a microservice
+and calling it via the standard DeepPavlov REST API.
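
A hedged sketch of that workflow in Python (``gobot_md_yaml_minimal`` is an assumed config name; point your
own Go-Bot config at your RASA v1 files):

.. code:: python

    from deeppavlov import build_model, train_model

    # config name is an assumption; its dataset_reader should point at
    # your domain.yml, nlu.md and stories.md files
    train_model('gobot_md_yaml_minimal')
    bot = build_model('gobot_md_yaml_minimal')
    print(bot(['hi, I want to book a table']))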
-Currently, we support two different approaches to define domain model and behavior of a given goal-oriented skill - using either a subset of the v1 of the RASA DSLs (domain.yml, nlu.md, stories.md) or a DSTC2 format. As of the latest release, the following subset of functionality is supported:
+Currently, we support a subset of the RASA v1 DSLs (domain.yml, nlu.md, stories.md) to define the domain model and
+behavior of a given goal-oriented skill. As of the latest release, the following subset of functionality is supported:
* Intents
* Slots (simple slots requiring custom classifiers for custom data types)
@@ -18,28 +23,34 @@ Currently, we support two different approaches to define domain model and behavi
* Templated Responses (w/o variables)
* **Form-Filling** (basic, added in **v0.14 release**)
-In the future, we will expand support for RASA DSLs where appropriate to enable backward compatibility, add integration with the upcoming Intent Catcher component available as part of the DeepPavlov component library, and so on.
+In the future, we will expand support for RASA DSLs where appropriate to enable backward compatibility, add integration
+with the upcoming Intent Catcher component available as part of the DeepPavlov component library, and so on.
-To experiment with the Go-Bot you can follow tutorials for using RASA DSLs, or pick one of the two available pre-trained models designed around the DSTSC2 dataset (English).
+To experiment with the Go-Bot, you can follow the tutorials for using RASA DSLs.
RASA DSLs Format Support
************************
Overview
========
-While DSTC-2 schema format is quite rich, preparing this kind of dataset with all required annotations might be challenging. To simplify the process of building goal-oriented bots using DeepPavlov technology, in `v0.12.0 `_ we have introduced a (limited) support for defining them using RASA DSLs.
+To simplify the process of building goal-oriented bots using DeepPavlov technology,
+in `v0.12.0 `_ we introduced
+(limited) support for defining them using RASA DSLs.
.. note::
DSLs, known as Domain-Specific Languages, provide a rich mechanism to define the behavior, or "the what", while the underlying system uses the parser to transform these definitions into commands that implement this behavior, or "the how" using the system's components.
-RASA.ai is an another well-known Open Source Conversational AI Framework. Their approach to defining the domain model and behavior of the goal-oriented bots is quite simple for building simple goal-oriented bots. In this section you will learn how to use key parts of RASA DSLs (configuration files) to build your own goal-oriented skill based on the DeepPavlov's Go-Bot framework.
+RASA.ai is another well-known Open Source Conversational AI Framework. Its approach to defining the domain model
+and behavior makes building simple goal-oriented bots straightforward.
+In this section you will learn how to use key parts of RASA DSLs (configuration files)
+to build your own goal-oriented skill based on DeepPavlov's Go-Bot framework.
-
-
-While there are several configuration files used by the RASA platform, each with their own corresponding DSL (mostly re-purposed Markdown and YAML), for now only three essential files: ``stories.md``,
+While there are several configuration files used by the RASA platform, each with its own
+corresponding DSL (mostly re-purposed Markdown and YAML), for now only three essential files: ``stories.md``,
``nlu.md``, ``domain.yml`` are supported by the DeepPavlov Go-Bot Framework.
-These files allows you to define user stories that match intents and bot actions, intents with slots and entities, as well as the training data for the NLU components.
+These files allow you to define user stories that match intents and bot actions, intents with slots and entities,
+as well as the training data for the NLU components.
.. note::
As mentioned in our `blog post `__, **this is the very beginning of our work** focused on supporting RASA DSLs as a way to configure DeepPavlov-based goal-oriented chatbots.
@@ -220,9 +231,10 @@ How Do I: Build Go-Bot Skill with RASA DSLs (v1)
Tutorials
^^^^^^^^^
-We encourage you to explore the tutorials below to get better understanding of how to build basic and more advanced goal-oriented skills with these RASA DSLs:
+We encourage you to explore the tutorials below to get a better understanding of how to build basic and more advanced
+goal-oriented skills with these RASA DSLs:
-* `Original Tutorial Notebook Featuring Simple and DSTC2-based Skills `_
+* `Original Tutorial Notebook Featuring Simple and DSTC2-based Skills `_
* `Tutorial Notebook Featuring Harvesters Maintenance Go-Bot Skill from Deepy 3000 Demo `_
@@ -230,7 +242,8 @@ We encourage you to explore the tutorials below to get better understanding of h
How Do I: Integrate Go-Bot-based Goal-Oriented Skill into DeepPavlov Deepy
============================================================================
-To integrate your Go-Bot-based goal-oriented skill into your Multiskill AI Assistant built using DeepPavlov Conversational AI Stack, follow the following instructions:
+To integrate your Go-Bot-based goal-oriented skill into your Multiskill AI Assistant built
+using the DeepPavlov Conversational AI Stack, follow these instructions:
1. Clone `Deepy repository `_
2. Replace ``docker-compose.yml`` in the root of the repository and ``pipeline_conf.json`` in the ``/agent/`` subdirectory with the corresponding files from the `deepy_gobot_base `_ **Deepy Distribution**
@@ -252,150 +265,7 @@ Tutorials
Follow this tutorial to experiment with the Form-Filling functionality in Go-Bot-based goal-oriented skills built using RASA DSLs (v1):
-* `Tutorial Notebook Featuring Basic Form-Filling `_
-
-
-DSTC2 Format Support
-**********************
-
-Overview
-==========
-
-The DeepPavlov Go-Bot Framework is based on [1]_ which introduces
-Hybrid Code Networks (HCNs) that combine an RNN with domain-specific
-knowledge and system action templates. Originally, the DSTC2 format was used for the dataset to train a Go-Bot-based goal-oriented skills upon.
-
-|alt text| **Diagram 1.** Dotted lines correspond to unrequired
-(optional) modules, black squares to trained models, trapezes are
-modules that depend on a dataset and must be provided by software
-developer.
-
-Here is a simple example of interaction with a trained goal-oriented skill
-(available for download):
-
-.. note::
-
- Models that rely on fasttext word embeddings will require 8.5 GB of disk space, those that use only one-hot encoding of words will require less than 300Mb of disk space.
-
-.. code:: bash
-
- >> Hello, welcome to the Cambridge restaurant system. You can ask for restaurants by area, price range or food type. How may I help you?
- x::hi, i want cheap food
- 2018-05-16 12:28:48.459 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap'}, got 22 results.
- >> The lucky star serves chinese food in the cheap price range.
- x::is there anything with french food?
- 2018-05-16 12:29:02.795 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap', 'food': 'french'}, got 0 results.
- >> Sorry there is no french restaurant in the cheap price range.
- x::ok, maybe there is something in expensive pricerange?
- 2018-05-16 12:29:23.337 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'french'}, got 2 results.
- >> Cote serves french food in the expensive price range.
- x::in what part of town is it?
- 2018-05-16 12:29:31.193 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'french'}, got 2 results.
- >> Restaurant two two is a nice place in the north of town serving tasty french food.
- x::ok, give me their phone number
- >> The phone number of restaurant two two is 01223 351880.
- x::address and post code too
- >> Restaurant two two is on 22 chesterton road chesterton, c.b 4, 3 a.x.
- x::ok, thank you!
- >> Restaurant two two serves french food in the expensive price range.
- x::and if i wanted an italian restuarant?
- 2018-05-16 12:30:15.98 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'italian'}, got 4 results.
- >> Frankie and bennys serves italian food in the expensive price range.
- x::more cheaply priced?
- 2018-05-16 12:30:36.560 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap', 'food': 'italian'}, got 5 results.
- >> Da vinci pizzeria serves italian food in the cheap price range.
- x::ok, give me their phone number and address, please
- >> The phone number of da vinci pizzeria is 01223 351707 and it is on 20 milton road chesterton.
- x::thanks
- >> Da vinci pizzeria serves italian food in the cheap price range.
- x::bye
- >> You are welcome!
-
-
-Quick Demo
-============
-
-To quickly try out the Go-Bot capabilities you can use one of the two available pretrained models for DSTC2 dataset (English). Check them out by running this code:
-
-.. code:: python
-
- from deeppavlov import build_model, configs
-
- bot1 = build_model(configs.go_bot.gobot_dstc2, download=True)
-
- bot1(['hi, i want restaurant in the cheap pricerange'])
- bot1(['bye'])
-
- bot2 = build_model(configs.go_bot.gobot_dstc2_best, download=True)
-
- bot2(['hi, i want chinese restaurant'])
- bot2(['bye'])
-
-If some required packages are missing, install all the requirements by running in command line:
-
-.. code:: bash
-
- python -m deeppavlov install gobot_dstc2
-
-How Do I: Build Go-Bot with DSTC2
-===================================
-DSTC is a set of competitions originally known as "Dialog State Tracking Challenges" (DSTC, for short). First challenge
-was organized in 2012-2013. Starting as an initiative to provide a common testbed for the task of Dialog State Tracking,
-the first Dialog State Tracking Challenge (DSTC) was organized in 2013, followed by DSTC2&3 in 2014, DSTC4 in 2015,
-and DSTC5 in 2016. Given the remarkable success of the first five editions, and understanding both, the complexity
-of the dialog phenomenon and the interest of the research community in a wider variety of dialog related problems,
-the DSTC rebranded itself as "Dialog System Technology Challenges" for its sixth edition. Then, DSTC6 and DSTC7 have
-been completed in 2017 and 2018, respectively.
-
-DSTC-2 released a large number of training dialogs related to restaurant search. Compared to DSTC (which was in the bus
-timetables domain), DSTC 2 introduced changing user goals, tracking 'requested slots' as well as the new Restaurants domain.
-
-Historically, DeepPavlov's Go-Bot used this DSTC-2 approach to defining domain model and behavior of the goal-oriented bots.
-In this section you will learn how to use this approach to build a DSTC-2-based Go-Bot.
-
-Requirements
-^^^^^^^^^^^^
-
-**TO TRAIN** a go\_bot model you should have:
-
-1. (*optional, but recommended*) pretrained named entity recognition model (NER)
-
- - config :config:`configs/ner/slotfill_dstc2.json