log bash pipe faults

facebookresearch · Jul 13, 2022 · 460ee99 · 460ee99
1 parent 0484d42
commit 460ee99
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ be found in [10], together with an experimental evaluation.
 * [NumPy](http://www.numpy.org/), tested with 1.15.4
 * [Cython](https://pypi.org/project/Cython/), needed by Python wrapper of FastBPE, tested with 0.29.6
 * [Faiss](https://github.com/facebookresearch/faiss), for fast similarity search and bitext mining
-* [transliterate 1.10.2](https://pypi.org/project/transliterate), only used for Greek (`pip install transliterate`)
+* [transliterate 1.10.2](https://pypi.org/project/transliterate) (`pip install transliterate`)
 * [jieba 0.39](https://pypi.org/project/jieba/), Chinese segmenter (`pip install jieba`)
 * [mecab 0.996](https://pypi.org/project/JapaneseTokenizer/), Japanese segmenter
 * tokenization from the Moses encoder (installed automatically)

diff --git a/source/lib/text_processing.py b/source/lib/text_processing.py
@@ -18,9 +18,8 @@
 import sys
 import logging
 from pathlib import Path
-import fastBPE
 import numpy as np
-from subprocess import run, check_output, DEVNULL
+from subprocess import run, check_output, CalledProcessError, DEVNULL
 
 logging.basicConfig(
     stream=sys.stdout,
@@ -42,7 +41,7 @@
 SPM_DIR = LASER + '/tools-external/sentencepiece-master/build/src/'
 SPM = 'LD_LIBRARY_PATH=' + SPM_DIR + ' ' + SPM_DIR + '/spm_encode --output_format=piece'
 
-# Romanization (Greek only)
+# Romanization (and lower casing)
 ROMAN_LC = 'python3 ' + LASER + '/source/lib/romanize_lc.py -l '
 
 # Mecab tokenizer for Japanese
@@ -134,33 +133,24 @@ def SPMApply(inp_fname, out_fname, spm_model, lang='en',
                          '(de-escaped)' if descape else ''))
 
         assert os.path.isfile(spm_model), f'SPM model {spm_model} not found'
-        check_output(cat + inp_fname
+        command = (cat + inp_fname
             + '|' + REM_NON_PRINT_CHAR
             + '|' + NORM_PUNC + lang
             + ('|' + DESCAPE if descape else '')
             + '|' + ROMAN_LC + 'none'
             + '|' + SPM + " --model=" + spm_model
-            + ' > ' + out_fname,
-            shell=True, stderr=DEVNULL)
+            + ' > ' + out_fname)
+        try:
+            run(["/bin/bash", "-o", "pipefail", "-c", command], check=True, capture_output=True)
+        except CalledProcessError as e:
+            logger.error(e.stderr.decode().strip())
+            sys.exit(1)
+
     elif not over_write and verbose:
         logger.info('SPM encoded file {} exists already'
               .format(os.path.basename(out_fname)))
 
 
-###############################################################################
-#
-# Apply FastBPE on one line of text
-#
-###############################################################################
-
-def BPEfastLoad(line, bpe_codes):
-    bpe_vocab = bpe_codes.replace('fcodes', 'fvocab')
-    return fastBPE.fastBPE(bpe_codes, bpe_vocab)
-
-def BPEfastApplyLine(line, bpe):
-    return bpe.apply([line])[0]
-
-
 ###############################################################################
 #
 # Apply FastBPE on a whole file