asyml · jrxk · Dec 1, 2020 · Dec 1, 2020 · Dec 6, 2020 · Dec 14, 2020
diff --git a/examples/text_classification/config_data.py b/examples/text_classification/config_data.py
@@ -6,7 +6,7 @@
 # used for bert executor example
 max_batch_tokens = 128
 
-train_batch_size = 32
+train_batch_size = 24
 max_train_epoch = 5
 display_steps = 50  # Print training loss every display_steps; -1 to disable
 

diff --git a/examples/text_classification/download_imdb.py b/examples/text_classification/download_imdb.py
@@ -11,27 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
-import os
-import sys
-import subprocess
+"""
+Download the IMDB (aclImdb_v1) archive into data/IMDB_raw and extract it.
+"""
+from forte.data.data_utils import maybe_download
 
 
 def main():
-    if not os.path.exists("data/IMDB_raw"):
-        subprocess.run("mkdir data/IMDB_raw", shell=True, check=True)
-    # pylint: disable=line-too-long
-    subprocess.run(
-        'wget -P data/IMDB_raw/ https://github.com/google-research/uda/blob/master/text/data/IMDB_raw/train_id_list.txt',
-        shell=True, check=True)
-    subprocess.run(
-        'wget -P data/IMDB_raw/ https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
-        shell=True, check=True)
-    subprocess.run(
-        'tar xzvf data/IMDB_raw/aclImdb_v1.tar.gz -C data/IMDB_raw/ && rm data/IMDB_raw/aclImdb_v1.tar.gz',
-        shell=True, check=True)
+    download_path = "data/IMDB_raw"  # directory handed to maybe_download
+    maybe_download(urls=[  # only the aclImdb archive is fetched here
+        "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"],
+        path=download_path,
+        extract=True)  # NOTE(review): presumably unpacks the archive - confirm
 
 
 if __name__ == '__main__':
-    sys.exit(main())
+    main()
diff --git a/examples/text_classification/main.py b/examples/text_classification/main.py
@@ -15,10 +15,11 @@
 
 import os
 
-from forte.models.imdb_text_classifier.model import IMDBClassifier
 import config_data
 import config_classifier
 
+from forte.models.imdb_text_classifier.model import IMDBClassifier
+
 
 def main():
     model = IMDBClassifier(config_data, config_classifier)

diff --git a/examples/text_classification/preprocess_pipeline.py b/examples/text_classification/preprocess_pipeline.py
@@ -0,0 +1,67 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Read the IMDB train/test splits and write each to a tab-separated file."""
+import os
+
+from forte.data.caster import MultiPackBoxer
+from forte.data.multi_pack import MultiPack
+from forte.data.readers import LargeMovieReader
+from forte.pipeline import Pipeline
+from forte.utils.utils_io import maybe_create_dir
+from ft.onto.base_ontology import Document
+
+
+def main():  # Dump each IMDB split to data/IMDB/<split>.csv (tab-separated)
+    pipeline = Pipeline[MultiPack]()
+    reader = LargeMovieReader()
+    pipeline.set_reader(reader)
+    pipeline.add(MultiPackBoxer())  # presumably boxes packs into MultiPacks
+
+    pipeline.initialize()
+
+    dataset_path = "data/IMDB_raw/aclImdb/"  # raw input root
+    input_file_path = {
+        "train": os.path.join(dataset_path, "train"),
+        "test": os.path.join(dataset_path, "test")
+    }
+    output_path = "data/IMDB/"
+    maybe_create_dir(output_path)
+    output_file_path = {
+        "train": os.path.join(output_path, "train.csv"),
+        "test": os.path.join(output_path, "test.csv")
+    }
+    set_labels = {  # sub-directory names, also written as the label
+        "train": ["pos", "neg", "unsup"],
+        "test": ["pos", "neg"],
+    }
+
+    for split in ["train", "test"]:  # one output file per split
+        with open(output_file_path[split], "w", encoding="utf-8")\
+            as output_file:
+            output_file.write("\t".join(["content", "label", "id"]) + "\n")
+            for label in set_labels[split]:  # one input directory per label
+                data_packs = \
+                    pipeline.process_dataset(
+                        os.path.join(input_file_path[split], label))
+                for pack in data_packs:
+                    example_id = pack.get_pack('default').pack_name  # row id
+                    for pack_name in pack.pack_names:
+                        p = pack.get_pack(pack_name)
+                        for doc in p.get(Document):  # one row per Document
+                            output_file.write(  # NOTE: text is not escaped
+                                "\t".join([doc.text, label, example_id]) + "\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/text_classification/run.sh b/examples/text_classification/run.sh
@@ -1,3 +1,3 @@
 python download_imdb.py
-python utils/imdb_format.py --raw_data_dir=data/IMDB_raw/aclImdb --train_id_path=data/IMDB_raw/train_id_list.txt --output_dir=data/IMDB
+python preprocess_pipeline.py
 python main.py
diff --git a/examples/text_classification/utils/imdb_format.py b/examples/text_classification/utils/imdb_format.py
diff --git a/forte/models/imdb_text_classifier/config_data.py b/forte/models/imdb_text_classifier/config_data.py
@@ -6,7 +6,7 @@
 # used for bert executor example
 max_batch_tokens = 128
 
-train_batch_size = 32
+train_batch_size = 24
 max_train_epoch = 5
 display_steps = 50  # Print training loss every display_steps; -1 to disable