Skip to content

Commit

Permalink
log bash pipe faults
Browse files Browse the repository at this point in the history
  • Loading branch information
heffernankevin committed Jul 13, 2022
1 parent 0484d42 commit 460ee99
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ be found in [10], together with an experimental evaluation.
* [NumPy](http://www.numpy.org/), tested with 1.15.4
* [Cython](https://pypi.org/project/Cython/), needed by Python wrapper of FastBPE, tested with 0.29.6
* [Faiss](https://github.com/facebookresearch/faiss), for fast similarity search and bitext mining
* [transliterate 1.10.2](https://pypi.org/project/transliterate), only used for Greek (`pip install transliterate`)
* [transliterate 1.10.2](https://pypi.org/project/transliterate) (`pip install transliterate`)
* [jieba 0.39](https://pypi.org/project/jieba/), Chinese segmenter (`pip install jieba`)
* [mecab 0.996](https://pypi.org/project/JapaneseTokenizer/), Japanese segmenter
* tokenization from the Moses encoder (installed automatically)
Expand Down
30 changes: 10 additions & 20 deletions source/lib/text_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
import sys
import logging
from pathlib import Path
import fastBPE
import numpy as np
from subprocess import run, check_output, DEVNULL
from subprocess import run, check_output, CalledProcessError, DEVNULL

logging.basicConfig(
stream=sys.stdout,
Expand All @@ -42,7 +41,7 @@
SPM_DIR = LASER + '/tools-external/sentencepiece-master/build/src/'
SPM = 'LD_LIBRARY_PATH=' + SPM_DIR + ' ' + SPM_DIR + '/spm_encode --output_format=piece'

# Romanization (Greek only)
# Romanization (and lower casing)
ROMAN_LC = 'python3 ' + LASER + '/source/lib/romanize_lc.py -l '

# Mecab tokenizer for Japanese
Expand Down Expand Up @@ -134,33 +133,24 @@ def SPMApply(inp_fname, out_fname, spm_model, lang='en',
'(de-escaped)' if descape else ''))

assert os.path.isfile(spm_model), f'SPM model {spm_model} not found'
check_output(cat + inp_fname
command = (cat + inp_fname
+ '|' + REM_NON_PRINT_CHAR
+ '|' + NORM_PUNC + lang
+ ('|' + DESCAPE if descape else '')
+ '|' + ROMAN_LC + 'none'
+ '|' + SPM + " --model=" + spm_model
+ ' > ' + out_fname,
shell=True, stderr=DEVNULL)
+ ' > ' + out_fname)
try:
run(["/bin/bash", "-o", "pipefail", "-c", command], check=True, capture_output=True)
except CalledProcessError as e:
logger.error(e.stderr.decode().strip())
sys.exit(1)

elif not over_write and verbose:
logger.info('SPM encoded file {} exists already'
.format(os.path.basename(out_fname)))


###############################################################################
#
# Apply FastBPE on one line of text
#
###############################################################################

def BPEfastLoad(line, bpe_codes):
    """Create a fastBPE encoder from a codes file and its matching vocabulary.

    The vocabulary path is obtained from ``bpe_codes`` by substituting every
    occurrence of ``'fcodes'`` with ``'fvocab'``.

    Args:
        line: Not used by this function; kept so the signature stays the same.
        bpe_codes: Path (string) of the BPE codes file.

    Returns:
        The object built by ``fastBPE.fastBPE(bpe_codes, <vocab path>)``.
    """
    # Codes and vocabulary files are expected to differ only by this token.
    vocab_path = bpe_codes.replace('fcodes', 'fvocab')
    encoder = fastBPE.fastBPE(bpe_codes, vocab_path)
    return encoder

def BPEfastApplyLine(line, bpe):
    """Apply BPE to a single line of text.

    Args:
        line: The text to encode.
        bpe: Object exposing ``apply(list)`` that returns an indexable result
            (presumably the encoder returned by ``BPEfastLoad`` -- confirm
            against callers).

    Returns:
        The first element of ``bpe.apply`` called on a one-element list
        containing ``line``.
    """
    # The encoder works on batches, so wrap the line and unwrap the result.
    batch = [line]
    encoded = bpe.apply(batch)
    return encoded[0]


###############################################################################
#
# Apply FastBPE on a whole file
Expand Down

0 comments on commit 460ee99

Please sign in to comment.