diff --git a/examples/text_classification/__init__.py b/examples/text_classification/__init__.py
new file mode 100644
index 000000000..a5dd21c1f
--- /dev/null
+++ b/examples/text_classification/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/text_classification/config_classifier.py b/examples/text_classification/config_classifier.py
new file mode 100644
index 000000000..3000603ec
--- /dev/null
+++ b/examples/text_classification/config_classifier.py
@@ -0,0 +1,11 @@
+name = "bert_classifier"
+hidden_size = 768
+clas_strategy = "cls_time"
+dropout = 0.1
+num_classes = 2
+
+# These hyperparams are used in the bert_with_hypertuning_main.py example
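+# Each entry maps a nested hyperparameter path to a search range [start, end]
+# and the dtype to sample (int or float).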
+hyperparams = {
+ "optimizer.warmup_steps": {"start": 10000, "end": 20000, "dtype": int},
+ "optimizer.static_lr": {"start": 1e-3, "end": 1e-2, "dtype": float}
+}
diff --git a/examples/text_classification/config_data.py b/examples/text_classification/config_data.py
new file mode 100644
index 000000000..cf7648787
--- /dev/null
+++ b/examples/text_classification/config_data.py
@@ -0,0 +1,68 @@
+pickle_data_dir = "data/IMDB"
+max_seq_length = 128 # the input will be truncated to max_seq_length
+num_classes = 2
+num_train_data = 25000
+
+# used for bert executor example
+max_batch_tokens = 128
+
+train_batch_size = 24
+max_train_epoch = 5
+display_steps = 50 # Print training loss every display_steps; -1 to disable
+
+# tbx config
+tbx_logging_steps = 5 # log the metrics for tbX visualization
+tbx_log_dir = "runs/"
+exp_number = 1 # experiment number
+
+eval_steps = 100 # Eval on the dev set every eval_steps; -1 to disable
+# Proportion of training to perform linear learning rate warmup for.
+# E.g., 0.1 = 10% of training.
+warmup_proportion = 0.1
+eval_batch_size = 8
+test_batch_size = 8
+
+feature_types = {
+ # Reading features from pickled data file.
+ # E.g., Reading feature "input_ids" as dtype `int64`;
+ # "FixedLenFeature" indicates its length is fixed for all data instances;
+ # and the sequence length is limited by `max_seq_length`.
+ "input_ids": ["int64", "stacked_tensor", max_seq_length],
+ "input_mask": ["int64", "stacked_tensor", max_seq_length],
+ "segment_ids": ["int64", "stacked_tensor", max_seq_length],
+ "label_ids": ["int64", "stacked_tensor"]
+}
+
+train_hparam = {
+ "allow_smaller_final_batch": False,
+ "batch_size": train_batch_size,
+ "dataset": {
+ "data_name": "data",
+ "feature_types": feature_types,
+ "files": "{}/train.pkl".format(pickle_data_dir)
+ },
+ "shuffle": True,
+ "shuffle_buffer_size": None
+}
+
+eval_hparam = {
+ "allow_smaller_final_batch": True,
+ "batch_size": eval_batch_size,
+ "dataset": {
+ "data_name": "data",
+ "feature_types": feature_types,
+ "files": "{}/eval.pkl".format(pickle_data_dir)
+ },
+ "shuffle": False
+}
+
+test_hparam = {
+ "allow_smaller_final_batch": True,
+ "batch_size": test_batch_size,
+ "dataset": {
+ "data_name": "data",
+ "feature_types": feature_types,
+ "files": "{}/predict.pkl".format(pickle_data_dir)
+ },
+ "shuffle": False
+}
diff --git a/examples/text_classification/download_imdb.py b/examples/text_classification/download_imdb.py
new file mode 100644
index 000000000..4dc8da9aa
--- /dev/null
+++ b/examples/text_classification/download_imdb.py
@@ -0,0 +1,29 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Download IMDB dataset.
+"""
+from forte.data.data_utils import maybe_download
+
+
+def main():
+ download_path = "data/IMDB_raw"
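+    # maybe_download is expected to skip the download if the archive already
+    # exists and, with extract=True, to unpack it under data/IMDB_raw.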
+ maybe_download(urls=[
+ "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"],
+ path=download_path,
+ extract=True)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/text_classification/main.py b/examples/text_classification/main.py
new file mode 100644
index 000000000..d3fcc1086
--- /dev/null
+++ b/examples/text_classification/main.py
@@ -0,0 +1,34 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+import config_data
+import config_classifier
+
+from forte.models.imdb_text_classifier.model import IMDBClassifier
+
+
+def main():
+ model = IMDBClassifier(config_data, config_classifier)
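+    # Build the pickled feature files only when they are missing; later runs
+    # reuse the cached train/eval/predict pickles under data/IMDB.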
+ if not os.path.isfile("data/IMDB/train.pkl")\
+ or not os.path.isfile("data/IMDB/eval.pkl")\
+ or not os.path.isfile("data/IMDB/predict.pkl"):
+ model.prepare_data("data/IMDB")
+ model.run(do_train=True, do_eval=True, do_test=False)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/text_classification/preprocess_pipeline.py b/examples/text_classification/preprocess_pipeline.py
new file mode 100644
index 000000000..ef925a2e5
--- /dev/null
+++ b/examples/text_classification/preprocess_pipeline.py
@@ -0,0 +1,67 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Read all data in IMDB and merge them to a csv file."""
+import os
+
+from forte.data.caster import MultiPackBoxer
+from forte.data.multi_pack import MultiPack
+from forte.data.readers import LargeMovieReader
+from forte.pipeline import Pipeline
+from forte.utils.utils_io import maybe_create_dir
+from ft.onto.base_ontology import Document
+
+
+def main():
+ pipeline = Pipeline[MultiPack]()
+ reader = LargeMovieReader()
+ pipeline.set_reader(reader)
+ pipeline.add(MultiPackBoxer())
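+    # MultiPackBoxer wraps each DataPack from the reader into a MultiPack
+    # (under the pack name "default"), which the loop below iterates over.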
+
+ pipeline.initialize()
+
+ dataset_path = "data/IMDB_raw/aclImdb/"
+ input_file_path = {
+ "train": os.path.join(dataset_path, "train"),
+ "test": os.path.join(dataset_path, "test")
+ }
+ output_path = "data/IMDB/"
+ maybe_create_dir(output_path)
+ output_file_path = {
+ "train": os.path.join(output_path, "train.csv"),
+ "test": os.path.join(output_path, "test.csv")
+ }
+ set_labels = {
+ "train": ["pos", "neg", "unsup"],
+ "test": ["pos", "neg"],
+ }
+
+ for split in ["train", "test"]:
+ with open(output_file_path[split], "w", encoding="utf-8")\
+ as output_file:
+ output_file.write("\t".join(["content", "label", "id"]) + "\n")
+ for label in set_labels[split]:
+ data_packs = \
+ pipeline.process_dataset(
+ os.path.join(input_file_path[split], label))
+ for pack in data_packs:
+ example_id = pack.get_pack('default').pack_name
+ for pack_name in pack.pack_names:
+ p = pack.get_pack(pack_name)
+ for doc in p.get(Document):
+ output_file.write(
+ "\t".join([doc.text, label, example_id]) + "\n")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/text_classification/run.sh b/examples/text_classification/run.sh
new file mode 100644
index 000000000..dba7e72d5
--- /dev/null
+++ b/examples/text_classification/run.sh
@@ -0,0 +1,3 @@
+python download_imdb.py
+python preprocess_pipeline.py
+python main.py
\ No newline at end of file
diff --git a/forte/models/imdb_text_classifier/__init__.py b/forte/models/imdb_text_classifier/__init__.py
new file mode 100644
index 000000000..a5dd21c1f
--- /dev/null
+++ b/forte/models/imdb_text_classifier/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/forte/models/imdb_text_classifier/config_classifier.py b/forte/models/imdb_text_classifier/config_classifier.py
new file mode 100644
index 000000000..3000603ec
--- /dev/null
+++ b/forte/models/imdb_text_classifier/config_classifier.py
@@ -0,0 +1,11 @@
+name = "bert_classifier"
+hidden_size = 768
+clas_strategy = "cls_time"
+dropout = 0.1
+num_classes = 2
+
+# These hyperparams are used in the bert_with_hypertuning_main.py example
+hyperparams = {
+ "optimizer.warmup_steps": {"start": 10000, "end": 20000, "dtype": int},
+ "optimizer.static_lr": {"start": 1e-3, "end": 1e-2, "dtype": float}
+}
diff --git a/forte/models/imdb_text_classifier/config_data.py b/forte/models/imdb_text_classifier/config_data.py
new file mode 100644
index 000000000..cf7648787
--- /dev/null
+++ b/forte/models/imdb_text_classifier/config_data.py
@@ -0,0 +1,68 @@
+pickle_data_dir = "data/IMDB"
+max_seq_length = 128 # the input will be truncated to max_seq_length
+num_classes = 2
+num_train_data = 25000
+
+# used for bert executor example
+max_batch_tokens = 128
+
+train_batch_size = 24
+max_train_epoch = 5
+display_steps = 50 # Print training loss every display_steps; -1 to disable
+
+# tbx config
+tbx_logging_steps = 5 # log the metrics for tbX visualization
+tbx_log_dir = "runs/"
+exp_number = 1 # experiment number
+
+eval_steps = 100 # Eval on the dev set every eval_steps; -1 to disable
+# Proportion of training to perform linear learning rate warmup for.
+# E.g., 0.1 = 10% of training.
+warmup_proportion = 0.1
+eval_batch_size = 8
+test_batch_size = 8
+
+feature_types = {
+ # Reading features from pickled data file.
+ # E.g., Reading feature "input_ids" as dtype `int64`;
+ # "FixedLenFeature" indicates its length is fixed for all data instances;
+ # and the sequence length is limited by `max_seq_length`.
+ "input_ids": ["int64", "stacked_tensor", max_seq_length],
+ "input_mask": ["int64", "stacked_tensor", max_seq_length],
+ "segment_ids": ["int64", "stacked_tensor", max_seq_length],
+ "label_ids": ["int64", "stacked_tensor"]
+}
+
+train_hparam = {
+ "allow_smaller_final_batch": False,
+ "batch_size": train_batch_size,
+ "dataset": {
+ "data_name": "data",
+ "feature_types": feature_types,
+ "files": "{}/train.pkl".format(pickle_data_dir)
+ },
+ "shuffle": True,
+ "shuffle_buffer_size": None
+}
+
+eval_hparam = {
+ "allow_smaller_final_batch": True,
+ "batch_size": eval_batch_size,
+ "dataset": {
+ "data_name": "data",
+ "feature_types": feature_types,
+ "files": "{}/eval.pkl".format(pickle_data_dir)
+ },
+ "shuffle": False
+}
+
+test_hparam = {
+ "allow_smaller_final_batch": True,
+ "batch_size": test_batch_size,
+ "dataset": {
+ "data_name": "data",
+ "feature_types": feature_types,
+ "files": "{}/predict.pkl".format(pickle_data_dir)
+ },
+ "shuffle": False
+}
diff --git a/forte/models/imdb_text_classifier/data/download_imdb.py b/forte/models/imdb_text_classifier/data/download_imdb.py
new file mode 100644
index 000000000..faefbac4a
--- /dev/null
+++ b/forte/models/imdb_text_classifier/data/download_imdb.py
@@ -0,0 +1,34 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import sys
+import subprocess
+
+
+def main():
+ if not os.path.exists("data/IMDB_raw"):
+ subprocess.run("mkdir data/IMDB_raw", shell=True, check=True)
+ # pylint: disable=line-too-long
+ subprocess.run(
+ 'wget -P data/IMDB_raw/ https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
+ shell=True, check=True)
+ subprocess.run(
+ 'tar xzvf data/IMDB_raw/aclImdb_v1.tar.gz -C data/IMDB_raw/ && rm data/IMDB_raw/aclImdb_v1.tar.gz',
+ shell=True, check=True)
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/forte/models/imdb_text_classifier/model.py b/forte/models/imdb_text_classifier/model.py
new file mode 100644
index 000000000..815570a04
--- /dev/null
+++ b/forte/models/imdb_text_classifier/model.py
@@ -0,0 +1,250 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import logging
+import os
+
+import torch
+import torch.nn.functional as F
+import texar.torch as tx
+
+# pylint: disable=no-name-in-module
+from forte.models.imdb_text_classifier.utils import data_utils, model_utils
+
+
+class IMDBClassifier:
+ """
+ A baseline text classifier for the IMDB dataset.
+    The input data should be in tab-separated CSV format with the columns
+    (content, label, id).
+    An example usage can be found at examples/text_classification.
+ """
+
+ def __init__(self, config_data, config_classifier, checkpoint=None,
+ pretrained_model_name="bert-base-uncased"):
+ """Constructs the text classifier.
+ Args:
+ config_data: string, data config file.
+ """
+ self.config_data = config_data
+ self.config_classifier = config_classifier
+ self.checkpoint = checkpoint
+ self.pretrained_model_name = pretrained_model_name
+
+ def prepare_data(self, csv_data_dir):
+ """Prepares data.
+ """
+ logging.info("Loading data")
+
+ if self.config_data.pickle_data_dir is None:
+ output_dir = csv_data_dir
+ else:
+ output_dir = self.config_data.pickle_data_dir
+ tx.utils.maybe_create_dir(output_dir)
+
+ processor = data_utils.IMDbProcessor()
+
+ num_classes = len(processor.get_labels())
+ num_train_data = len(processor.get_train_examples(csv_data_dir))
+ logging.info(
+ 'num_classes:%d; num_train_data:%d', num_classes, num_train_data)
+
+ tokenizer = tx.data.BERTTokenizer(
+ pretrained_model_name=self.pretrained_model_name)
+
+ data_utils.prepare_record_data(
+ processor=processor,
+ tokenizer=tokenizer,
+ data_dir=csv_data_dir,
+ max_seq_length=self.config_data.max_seq_length,
+ output_dir=output_dir,
+ feature_types=self.config_data.feature_types)
+
+ def run(self, do_train, do_eval, do_test, output_dir="output/"):
+ """
+ Builds the model and runs.
+ """
+ tx.utils.maybe_create_dir(output_dir)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ logging.root.setLevel(logging.INFO)
+
+ # Loads data
+ num_train_data = self.config_data.num_train_data
+
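+        # Collect the plain classifier hyperparameters from config_classifier,
+        # dropping module builtins and the `hyperparams` tuning ranges.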
+ hparams = {
+ k: v for k, v in self.config_classifier.__dict__.items()
+ if not k.startswith('__') and k != "hyperparams"}
+
+ # Builds BERT
+ model = tx.modules.BERTClassifier(
+ pretrained_model_name=self.pretrained_model_name,
+ hparams=hparams)
+ model.to(device)
+
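+        # Total number of optimizer steps across all epochs; a fraction of
+        # them (warmup_proportion) is used for linear learning-rate warm-up.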
+ num_train_steps = int(num_train_data / self.config_data.train_batch_size
+ * self.config_data.max_train_epoch)
+ num_warmup_steps = int(num_train_steps
+ * self.config_data.warmup_proportion)
+
+ # Builds learning rate decay scheduler
+ static_lr = 2e-5
+
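+        # Standard BERT fine-tuning recipe: LayerNorm parameters and biases
+        # are excluded from weight decay.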
+ vars_with_decay = []
+ vars_without_decay = []
+ for name, param in model.named_parameters():
+ if 'layer_norm' in name or name.endswith('bias'):
+ vars_without_decay.append(param)
+ else:
+ vars_with_decay.append(param)
+
+ opt_params = [{
+ 'params': vars_with_decay,
+ 'weight_decay': 0.01,
+ }, {
+ 'params': vars_without_decay,
+ 'weight_decay': 0.0,
+ }]
+ optim = tx.core.BertAdam(
+ opt_params, betas=(0.9, 0.999), eps=1e-6, lr=static_lr)
+
+ scheduler = torch.optim.lr_scheduler.LambdaLR(
+ optim, functools.partial(model_utils.get_lr_multiplier,
+ total_steps=num_train_steps,
+ warmup_steps=num_warmup_steps))
+
+ train_dataset = tx.data.RecordData(
+ hparams=self.config_data.train_hparam, device=device)
+ eval_dataset = tx.data.RecordData(
+ hparams=self.config_data.eval_hparam, device=device)
+ test_dataset = tx.data.RecordData(
+ hparams=self.config_data.test_hparam, device=device)
+
+ iterator = tx.data.DataIterator(
+ {"train": train_dataset, "eval": eval_dataset, "test": test_dataset}
+ )
+
+ def _compute_loss(logits, labels):
+ r"""Compute loss.
+ """
+ if model.is_binary:
+ loss = F.binary_cross_entropy(
+ logits.view(-1), labels.view(-1), reduction='mean')
+ else:
+ loss = F.cross_entropy(
+ logits.view(-1, model.num_classes),
+ labels.view(-1), reduction='mean')
+ return loss
+
+ def _train_epoch():
+ r"""Trains on the training set, and evaluates on the dev set
+ periodically.
+ """
+ iterator.switch_to_dataset("train")
+ model.train()
+
+ for batch in iterator:
+ optim.zero_grad()
+ input_ids = batch["input_ids"]
+ segment_ids = batch["segment_ids"]
+ labels = batch["label_ids"]
+
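+                # Sequence length = number of non-padding (non-zero) token ids.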
+ input_length = (1 - (input_ids == 0).int()).sum(dim=1)
+
+ logits, _ = model(input_ids, input_length, segment_ids)
+
+ loss = _compute_loss(logits, labels)
+ loss.backward()
+ optim.step()
+ scheduler.step()
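+                # LambdaLR tracks the number of scheduler.step() calls in
+                # `last_epoch`, which serves as the global training step here.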
+ step = scheduler.last_epoch
+
+ dis_steps = self.config_data.display_steps
+ if dis_steps > 0 and step % dis_steps == 0:
+ logging.info("step: %d; loss: %f", step, loss)
+
+ eval_steps = self.config_data.eval_steps
+ if eval_steps > 0 and step % eval_steps == 0:
+ _eval_epoch()
+ model.train()
+
+ @torch.no_grad()
+ def _eval_epoch():
+ """Evaluates on the dev set.
+ """
+ iterator.switch_to_dataset("eval")
+ model.eval()
+
+ nsamples = 0
+ avg_rec = tx.utils.AverageRecorder()
+ for batch in iterator:
+ input_ids = batch["input_ids"]
+ segment_ids = batch["segment_ids"]
+ labels = batch["label_ids"]
+
+ input_length = (1 - (input_ids == 0).int()).sum(dim=1)
+
+ logits, preds = model(input_ids, input_length, segment_ids)
+
+ loss = _compute_loss(logits, labels)
+ accu = tx.evals.accuracy(labels, preds)
+ batch_size = input_ids.size()[0]
+ avg_rec.add([accu, loss], batch_size)
+ nsamples += batch_size
+ logging.info("eval accu: %.4f; loss: %.4f; nsamples: %d",
+ avg_rec.avg(0), avg_rec.avg(1), nsamples)
+
+ @torch.no_grad()
+ def _test_epoch():
+ """Does predictions on the test set.
+ """
+ iterator.switch_to_dataset("test")
+ model.eval()
+
+ _all_preds = []
+ for batch in iterator:
+ input_ids = batch["input_ids"]
+ segment_ids = batch["segment_ids"]
+
+ input_length = (1 - (input_ids == 0).int()).sum(dim=1)
+
+ _, preds = model(input_ids, input_length, segment_ids)
+
+ _all_preds.extend(preds.tolist())
+
+ output_file = os.path.join(output_dir, "test_results.tsv")
+ with open(output_file, "w+") as writer:
+ writer.write("\n".join(str(p) for p in _all_preds))
+ logging.info("test output written to %s", output_file)
+
+ if self.checkpoint:
+ ckpt = torch.load(self.checkpoint)
+ model.load_state_dict(ckpt['model'])
+ optim.load_state_dict(ckpt['optimizer'])
+ scheduler.load_state_dict(ckpt['scheduler'])
+ if do_train:
+ for _ in range(self.config_data.max_train_epoch):
+ _train_epoch()
+ states = {
+ 'model': model.state_dict(),
+ 'optimizer': optim.state_dict(),
+ 'scheduler': scheduler.state_dict(),
+ }
+ torch.save(states, os.path.join(output_dir, 'model.ckpt'))
+
+ if do_eval:
+ _eval_epoch()
+
+ if do_test:
+ _test_epoch()
diff --git a/forte/models/imdb_text_classifier/utils/__init__.py b/forte/models/imdb_text_classifier/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/forte/models/imdb_text_classifier/utils/data_utils.py b/forte/models/imdb_text_classifier/utils/data_utils.py
new file mode 100644
index 000000000..b933605f0
--- /dev/null
+++ b/forte/models/imdb_text_classifier/utils/data_utils.py
@@ -0,0 +1,662 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This is the data loading pipeline for the sentence classifier task, adapted from:
+ `https://github.com/google-research/bert/blob/master/run_classifier.py`
+"""
+
+import copy
+import os
+import logging
+import math
+import random
+
+import numpy as np
+import texar.torch as tx
+
+
+class InputExample():
+ """A single training/test example for simple sequence classification."""
+
+ def __init__(self, guid, text_a, text_b=None, label=None):
+ """Constructs a InputExample.
+ Args:
+ guid: Unique id for the example.
+ text_a: string. The untokenized text of the first sequence.
+ For single sequence tasks, only this sequence must be specified.
+ text_b: (Optional) string. The untokenized text of the second
+ sequence. Only must be specified for sequence pair tasks.
+ label: (Optional) string. The label of the example. This should be
+ specified for train and dev examples, but not for test examples.
+ """
+ self.guid = guid
+ self.text_a = text_a
+ self.text_b = text_b
+ self.label = label
+
+
+class InputFeatures:
+ """A single set of features of data."""
+
+ def __init__(self, input_ids, input_mask, segment_ids, label_id):
+ self.input_ids = input_ids
+ self.input_mask = input_mask
+ self.segment_ids = segment_ids
+ self.label_id = label_id
+
+
+class DataProcessor():
+ """Base class for data converters for sequence classification data sets."""
+
+ def get_train_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for the train set."""
+ raise NotImplementedError()
+
+ def get_dev_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for the dev set."""
+ raise NotImplementedError()
+
+ def get_test_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for prediction."""
+ raise NotImplementedError()
+
+ def get_labels(self):
+ """Gets the list of labels for this data set."""
+ raise NotImplementedError()
+
+ @classmethod
+ def _read_tsv(cls, input_file, quotechar=None): # pylint: disable=unused-argument
+ """Reads a tab separated value file."""
+ with open(input_file, "r", encoding="utf-8") as f:
+ # reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+ lines = []
+ for line in f.readlines():
+ lines.append(line.split('\t'))
+ return lines
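+        # Note: despite the name and the `quotechar` argument, this simply
+        # splits each line on tab characters; the csv module is not used.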
+
+
+def clean_web_text(st):
+ """clean text."""
+    st = st.replace("<br />", " ")
+    st = st.replace("&quot;", "\"")
+    st = st.replace("<p>", " ")
+    if "<a href=" in st:
+        # print("before:\n", st)
+        while "<a href=" in st:
+            start_pos = st.find("<a href=")
+            end_pos = st.find(">", start_pos)
+            if end_pos != -1:
+                st = st[:start_pos] + st[end_pos + 1:]
+            else:
+                print("incomplete href")
+                print("before", st)
+                st = st[:start_pos] + st[start_pos + len("<a href="):]
+                print("after", st)
+        st = st.replace("</a>", "")
+    # print("after\n", st)
+    # print("")
+    st = st.replace("\\n", " ")
+    st = st.replace("\\", " ")
+    # while "  " in st:
+    #     st = st.replace("  ", " ")
+    return st
+
+
+class IMDbProcessor(DataProcessor):
+    """Processor for the IMDB data set."""
+
+    def get_train_examples(self, raw_data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(raw_data_dir, "train.csv"),
+                           quotechar='"'), "train")
+
+    def get_dev_examples(self, raw_data_dir):
+        """The IMDB dataset does not have a dev set so we just use test set"""
+        return self._create_examples(
+            self._read_tsv(os.path.join(raw_data_dir, "test.csv"),
+                           quotechar='"'), "test")
+
+    def get_test_examples(self, raw_data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(raw_data_dir, "test.csv"),
+                           quotechar='"'), "test")
+
+    def get_unsup_examples(self, raw_data_dir, unsup_set):
+        """See base class."""
+        if unsup_set == "unsup_ext":
+            return self._create_examples(
+                self._read_tsv(os.path.join(raw_data_dir, "unsup_ext.csv"),
+                               quotechar='"'), "unsup_ext", skip_unsup=False)
+        elif unsup_set == "unsup_in":
+            return self._create_examples(
+                self._read_tsv(
+                    os.path.join(raw_data_dir, "unsup.csv"), quotechar='"'),
+                "unsup_in", skip_unsup=False)
+
+    def get_unsup_aug_examples(self, raw_data_dir, unsup_set):
+        """See base class."""
+        if unsup_set == "unsup_ext":
+            return self._create_examples(
+                self._read_tsv(os.path.join(raw_data_dir, "unsup_ext.csv"),
+                               quotechar='"'), "unsup_ext", skip_unsup=False)
+        elif unsup_set == "unsup_in":
+            return self._create_examples(
+                self._read_tsv(os.path.join(raw_data_dir, "train_aug.csv"),
+                               quotechar='"'), "unsup_in", skip_unsup=False)
+
+    def get_labels(self):
+        """See base class."""
+        return ["pos", "neg"]
+
+    def _create_examples(self, lines, set_type, skip_unsup=True):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        print(len(lines))
+        for (i, line) in enumerate(lines):
+            if i == 0 or len(line) == 1:  # newline
+                continue
+            if skip_unsup and line[-2] == "unsup":
+                continue
+            # Original UDA implementation
+            # if line[-2] == "unsup" and len(line[0]) < 500:
+            #     tf.logging.info("skipping short samples:{:s}".format(line[0]))
+            #     continue
+            guid = "%s-%s" % (set_type, line[-1])
+            text_a = " ".join(line[:-2])
+            label = line[-2]
+            if label not in ["pos", "neg", "unsup"]:
+                print(line)
+            text_a = clean_web_text(text_a)
+            examples.append(InputExample(guid=guid, text_a=text_a,
+                                         text_b=None, label=label))
+        return examples
+
+    def get_train_size(self):
+        return 25000
+
+    def get_dev_size(self):
+        return 25000
+
+
+class SSTProcessor(DataProcessor):
+    """Processor for the SST data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    @staticmethod
+    def _create_examples(lines, set_type):
"""Creates examples for the training and dev sets.""" + examples = [] + if set_type in ('train', 'dev'): + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tx.utils.compat_as_text(line[0]) + # Single sentence classification, text_b doesn't exist + text_b = None + label = tx.utils.compat_as_text(line[1]) + examples.append(InputExample(guid=guid, text_a=text_a, + text_b=text_b, label=label)) + if set_type == 'test': + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tx.utils.compat_as_text(line[1]) + # Single sentence classification, text_b doesn't exist + text_b = None + label = '0' # arbitrary set as 0 + examples.append(InputExample(guid=guid, text_a=text_a, + text_b=text_b, label=label)) + return examples + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "multinli", + "multinli.train.%s.tsv" % self.language)) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = tx.utils.compat_as_text(line[0]) + text_b = tx.utils.compat_as_text(line[1]) + label = tx.utils.compat_as_text(line[2]) + if label == tx.utils.compat_as_text("contradictory"): + label = tx.utils.compat_as_text("contradiction") + examples.append(InputExample(guid=guid, text_a=text_a, + text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = tx.utils.compat_as_text(line[0]) + if language != tx.utils.compat_as_text(self.language): + continue + text_a = tx.utils.compat_as_text(line[6]) + text_b = tx.utils.compat_as_text(line[7]) + label = tx.utils.compat_as_text(line[1]) + examples.append(InputExample(guid=guid, text_a=text_a, + text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), + "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def _create_examples(lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, + tx.utils.compat_as_text(line[0])) + text_a = tx.utils.compat_as_text(line[8]) + text_b = tx.utils.compat_as_text(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = tx.utils.compat_as_text(line[-1]) + examples.append(InputExample(guid=guid, text_a=text_a, + text_b=text_b, label=label)) + return examples + + +class 
+    """Processor for the MRPC data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")),
+            "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")),
+            "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "test.tsv")),
+            "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    @staticmethod
+    def _create_examples(lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = tx.utils.compat_as_text(line[3])
+            text_b = tx.utils.compat_as_text(line[4])
+            if set_type == "test":
+                label = "0"
+            else:
+                label = tx.utils.compat_as_text(line[0])
+            examples.append(InputExample(guid=guid, text_a=text_a,
+                                         text_b=text_b, label=label))
+        return examples
+
+
+class ColaProcessor(DataProcessor):
+    """Processor for the CoLA data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")),
+            "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")),
+            "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "test.tsv")),
+            "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    @staticmethod
+    def _create_examples(lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            # Only the test set has a header
+            if set_type == "test" and i == 0:
+                continue
+            guid = "%s-%s" % (set_type, i)
+            if set_type == "test":
+                text_a = tx.utils.compat_as_text(line[1])
+                label = "0"
+            else:
+                text_a = tx.utils.compat_as_text(line[3])
+                label = tx.utils.compat_as_text(line[1])
+            examples.append(InputExample(guid=guid, text_a=text_a,
+                                         text_b=None, label=label))
+        return examples
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+                           tokenizer):
+    r"""Converts a single `InputExample` into a single `InputFeatures`."""
+    label_map = {}
+    for (i, label) in enumerate(label_list):
+        label_map[label] = i
+
+    input_ids, segment_ids, input_mask = \
+        tokenizer.encode_text(text_a=example.text_a,
+                              text_b=example.text_b,
+                              max_seq_length=max_seq_length)
+
+    label_id = label_map[example.label]
+
+    # here we disable the verbose printing of the data
+    if ex_index < 0:
+        logging.info("*** Example ***")
+        logging.info("guid: %s", example.guid)
+        logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
+        logging.info("input_ids length: %d", len(input_ids))
+        logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
+        logging.info("segment_ids: %s",
+                     " ".join([str(x) for x in segment_ids]))
+        logging.info("label: %s (id = %d)", example.label, label_id)
+
+    feature = InputFeatures(input_ids=input_ids,
+                            input_mask=input_mask,
+                            segment_ids=segment_ids,
+                            label_id=label_id)
+    return feature
+
+
+def convert_examples_to_features_and_output_to_files(
+        examples, label_list, max_seq_length, tokenizer, output_file,
+        feature_types):
+    r"""Convert a set of `InputExample`s to a pickled file."""
+
+    with tx.data.RecordData.writer(output_file, feature_types) as writer:
+        for (ex_index, example) in enumerate(examples):
+            feature = convert_single_example(ex_index, example, label_list,
+                                             max_seq_length, tokenizer)
+
+            features = {
+                "input_ids": feature.input_ids,
+                "input_mask": feature.input_mask,
+                "segment_ids": feature.segment_ids,
+                "label_ids": feature.label_id
+            }
+            writer.write(features)
+
+
+def convert_unsup_examples_to_features_and_output_to_files(
+        examples, aug_examples, label_list, max_seq_length, tokenizer,
+        output_file, feature_types):
+    r"""Convert a set of `InputExample`s to a pickled file."""
+
+    with tx.data.RecordData.writer(output_file, feature_types) as writer:
+        print(len(examples), "unsup examples")
+        print(len(aug_examples), "augmented unsup examples")
+        assert len(examples) == len(aug_examples)
+        for (ex_index, (example, aug_example)) in \
+                enumerate(zip(examples, aug_examples)):
+            feature = convert_single_example(ex_index, example, label_list,
+                                             max_seq_length, tokenizer)
+            aug_feature = convert_single_example(ex_index, aug_example,
+                label_list, max_seq_length, tokenizer)
+
+            features = {
+                "input_ids": feature.input_ids,
+                "input_mask": feature.input_mask,
+                "segment_ids": feature.segment_ids,
+                "label_ids": feature.label_id,
+                "aug_input_ids": aug_feature.input_ids,
+                "aug_input_mask": aug_feature.input_mask,
+                "aug_segment_ids": aug_feature.segment_ids,
+                "aug_label_ids": aug_feature.label_id,
+            }
+            writer.write(features)
+
+
+def replace_with_length_check(
+        ori_text, new_text,
+        use_min_length,
+        use_max_length_diff_ratio):
+    """Use new_text if the text length satisfies several constraints."""
+    if len(ori_text) < use_min_length or len(new_text) < use_min_length:
+        if random.random() < 0.001:
+            print("not replacing due to short text: \n\t"
+                  "ori: {:s}\n\tnew: {:s}\n".format(ori_text, new_text))
+        return ori_text
+    length_diff_ratio = 1.0 * (len(new_text) - len(ori_text)) / len(ori_text)
+    if math.fabs(length_diff_ratio) > use_max_length_diff_ratio:
+        if random.random() < 0.001:
+            print("not replacing due to too different text length:\n"
+                  "\tori: {:s}\n\tnew: {:s}\n".format(
+                      ori_text,
+                      new_text))
+        return ori_text
+    return new_text
+
+
+def back_translation(examples, back_translation_file, data_total_size):
+    """Run back translation."""
+    use_min_length = 10
+    use_max_length_diff_ratio = 0.5
+    logging.info("running bt augmentation")
+
+    text_per_example = 1
+
+    with open(back_translation_file, encoding='utf-8') as inf:
+        paraphrases = inf.readlines()
+    for i in range(len(paraphrases)):  # pylint: disable=consider-using-enumerate
+        paraphrases[i] = paraphrases[i].strip()
+    assert len(paraphrases) == data_total_size
+
+    aug_examples = []
+    aug_cnt = 0
+    for i in range(len(examples)):  # pylint: disable=consider-using-enumerate
+        ori_example = examples[i]
+        text_a = replace_with_length_check(
+            ori_example.text_a,
+            paraphrases[i * text_per_example],
+            use_min_length,
+            use_max_length_diff_ratio,
+        )
+        if text_a == paraphrases[i * text_per_example]:
+            aug_cnt += 1
+        if ori_example.text_b is not None:
+            text_b = replace_with_length_check(
+                ori_example.text_b,
+                paraphrases[i * text_per_example + 1],
+                use_min_length,
+                use_max_length_diff_ratio,
+            )
+        else:
+            text_b = None
+
+        example = InputExample(
+            guid=ori_example.guid,
+            text_a=text_a,
+            text_b=text_b,
+            label=ori_example.label)
+        aug_examples += [example]
+        if np.random.random() < 0.0001:
+            pass
tf.logging.info("\tori:\n\t\t{:s}\n\t\t{:s}\n\t\t{:s}\n".format( + # ori_example.text_a, ori_example.text_b, ori_example.label)) + # tf.logging.info("\tnew:\n\t\t{:s}\n\t\t{:s}\n\t\t{:s}\n".format( + # example.text_a, example.text_b, example.label)) + if i % 10000 == 0: + print("processing example # {:d}".format(i)) + logging.info("applied back translation for {:.1f} percent of data".format( + aug_cnt * 1. / len(examples) * 100)) + logging.info("finishing running back translation augmentation") + return aug_examples + + +def prepare_record_data(processor, tokenizer, + data_dir, max_seq_length, output_dir, + feature_types, unsup_feature_types=None, + sup_size_limit=None, unsup_bt_file=None): + r"""Prepare record data. + Args: + processor: Data Preprocessor, which must have get_labels, + get_train/dev/test/examples methods defined. + tokenizer: The Sentence Tokenizer. Generally should be + SentencePiece Model. + data_dir: The input data directory. + max_seq_length: Max sequence length. + output_dir: The directory to save the pickled file in. + feature_types: The original type of the feature. + """ + label_list = processor.get_labels() + + train_file = os.path.join(output_dir, "train.pkl") + if not os.path.isfile(train_file): + train_examples = processor.get_train_examples(data_dir) + if sup_size_limit is not None: + train_examples = get_data_by_size_lim( + train_examples, processor, sup_size_limit) + convert_examples_to_features_and_output_to_files( + train_examples, label_list, max_seq_length, + tokenizer, train_file, feature_types) + + eval_file = os.path.join(output_dir, "eval.pkl") + if not os.path.isfile(eval_file): + eval_examples = processor.get_dev_examples(data_dir) + convert_examples_to_features_and_output_to_files( + eval_examples, label_list, + max_seq_length, tokenizer, eval_file, feature_types) + + test_file = os.path.join(output_dir, "predict.pkl") + if not os.path.isfile(test_file): + test_examples = processor.get_test_examples(data_dir) + convert_examples_to_features_and_output_to_files( + test_examples, label_list, + max_seq_length, tokenizer, test_file, feature_types) + + if unsup_feature_types is not None: + unsup_file = os.path.join(output_dir, "unsup.pkl") + if not os.path.isfile(unsup_file): + unsup_label_list = label_list + ["unsup"] + unsup_examples = processor.get_unsup_examples(data_dir, "unsup_in") + unsup_aug_examples = copy.deepcopy(unsup_examples) + unsup_aug_examples = back_translation(unsup_aug_examples, + unsup_bt_file, len(unsup_aug_examples)) + convert_unsup_examples_to_features_and_output_to_files( + unsup_examples, unsup_aug_examples, unsup_label_list, + max_seq_length, tokenizer, unsup_file, unsup_feature_types) + + +def get_data_by_size_lim(train_examples, processor, sup_size): + """Deterministicly get a dataset with only sup_size examples.""" + # Assuming sup_size < number of labeled data and + # that there are same number of examples for each category + assert sup_size % len(processor.get_labels()) == 0 + per_label_size = sup_size // len(processor.get_labels()) + per_label_examples = {} + for i in range(len(train_examples)): # pylint: disable=consider-using-enumerate + label = train_examples[i].label + if label not in per_label_examples: + per_label_examples[label] = [] + per_label_examples[label] += [train_examples[i]] + + for label in processor.get_labels(): + assert len(per_label_examples[label]) >= per_label_size, ( + "label {} only has {} examples while the limit" + "is {}".format( + label, len(per_label_examples[label]), per_label_size)) + + 
+    new_train_examples = []
+    for i in range(per_label_size):
+        for label in processor.get_labels():
+            new_train_examples += [per_label_examples[label][i]]
+    train_examples = new_train_examples
+    return train_examples
diff --git a/forte/models/imdb_text_classifier/utils/model_utils.py b/forte/models/imdb_text_classifier/utils/model_utils.py
new file mode 100644
index 000000000..2e53492d8
--- /dev/null
+++ b/forte/models/imdb_text_classifier/utils/model_utils.py
@@ -0,0 +1,19 @@
+"""
+Model utility functions
+"""
+
+
+def get_lr_multiplier(step: int, total_steps: int, warmup_steps: int) -> float:
+    r"""Calculate the learning rate multiplier given current step and the number
+    of warm-up steps. The learning rate schedule follows a linear warm-up and
+    linear decay.
+    """
+    step = min(step, total_steps)
+
+    multiplier = (1 - (step - warmup_steps) / (total_steps - warmup_steps))
+
+    if warmup_steps > 0 and step < warmup_steps:
+        warmup_percent_done = step / warmup_steps
+        multiplier = warmup_percent_done
+
+    return multiplier
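+
+
+# Example: with total_steps=1000 and warmup_steps=100, the multiplier grows
+# linearly from 0 to 1 over the first 100 steps and then decays linearly
+# back to 0 at step 1000.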