From 5882dc51724b25d41799cad16a7c06c52a259503 Mon Sep 17 00:00:00 2001
From: DongjiGao
Date: Wed, 29 Jan 2020 03:51:03 -0500
Subject: [PATCH] [egs] gale_arabic: add python script to process xml file (#3886)

---
Review notes (text between the three-dash line and the diff is not applied):
 * Line breaks and indentation of this patch were rebuilt from a copy in
   which all whitespace had been collapsed.  Context and removed lines may
   differ in whitespace from the tree (try "git apply --ignore-whitespace"),
   and the usage strings in mgb_data_prep.sh were restored by hand; verify
   both against the tree.  The index lines still carry the blob ids of the
   original commit.
 * gale_arabic prepare_data.sh: the python branch passed $train_dir, which
   nothing in this patch sets for that script, to add_to_datadir.py; it now
   passes $output_dir like the xml branch.  $process_xml is quoted so an
   empty value reaches the "invalid option" message without a test syntax
   error, and a failing check_tools.sh now stops the script.
 * gale_arabic run.sh: the process_xml variable was declared but the call
   hard-coded "python"; the variable now defaults to "python" and is passed.
 * mgb2_arabic mgb_data_prep.sh: the third argument was required but never
   assigned to process_xml, and the python branch did not write wav.scp.
 * process_xml.py: reads only the transcript_align segments (as the xml
   command does), no longer closes sys.stdout, and opens the xml in binary
   mode inside a with block.
 * NOTE(review): $db_dir in gale_arabic prepare_data.sh is carried over from
   the removed lines; confirm that script defines it.

 egs/gale_arabic/s5d/local/check_tools.sh  | 15 ++++++
 egs/gale_arabic/s5d/local/prepare_data.sh | 43 ++++++++++-----
 egs/gale_arabic/s5d/local/process_xml.py  | 57 ++++++++++++++++++++
 egs/gale_arabic/s5d/run.sh                |  5 +-
 egs/mgb2_arabic/s5/local/check_tools.sh   | 15 ++++++
 egs/mgb2_arabic/s5/local/mgb_data_prep.sh | 45 +++++++++++-----
 egs/mgb2_arabic/s5/local/process_xml.py   | 57 ++++++++++++++++++++
 egs/mgb2_arabic/s5/run.sh                 |  5 +-
 8 files changed, 215 insertions(+), 27 deletions(-)
 create mode 100755 egs/gale_arabic/s5d/local/check_tools.sh
 create mode 100755 egs/gale_arabic/s5d/local/process_xml.py
 create mode 100755 egs/mgb2_arabic/s5/local/check_tools.sh
 create mode 100755 egs/mgb2_arabic/s5/local/process_xml.py

diff --git a/egs/gale_arabic/s5d/local/check_tools.sh b/egs/gale_arabic/s5d/local/check_tools.sh
new file mode 100755
index 00000000000..448a6536946
--- /dev/null
+++ b/egs/gale_arabic/s5d/local/check_tools.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# check whether bs4 and lxml are installed
+if ! python3 -c "import bs4" 2>/dev/null; then
+  echo "$0: BeautifulSoup4 not installed, you can install it by 'pip install beautifulsoup4' if you prefer to use python to process xml file"
+  exit 1;
+fi
+
+if ! python3 -c "import lxml" 2>/dev/null; then
+  echo "$0: lxml not installed, you can install it by 'pip install lxml' if you prefer to use python to process xml file"
+  exit 1;
+fi
+
+echo "both BeautifulSoup4 and lxml are installed in python"
+exit 0
diff --git a/egs/gale_arabic/s5d/local/prepare_data.sh b/egs/gale_arabic/s5d/local/prepare_data.sh
index 820f938b267..d09ff00acd0 100755
--- a/egs/gale_arabic/s5d/local/prepare_data.sh
+++ b/egs/gale_arabic/s5d/local/prepare_data.sh
@@ -31,6 +31,7 @@ dir10=/export/corpora/LDC/LDC2018S05/
 text10=/export/corpora/LDC/LDC2018T14/
 
 mgb2_dir=""
+process_xml=""
 mer=80
 
 . ./utils/parse_options.sh
@@ -108,13 +109,6 @@ cd $top_pwd
 # prepare MGB2 data
 if [ ! -z $mgb2_dir ]; then
   echo "preparing MGB2 data"
-  # check xml
-  if [ -z $(which xml) ]; then
-    echo "$0: Could not find tool xml"
-    echo "$0: To use MGB2 you must have xml installed"
-    echo "$0: Download and install it from xmlstar.sourceforge.net"
-    exit 1
-  fi
 
   xmldir=$mgb2_dir/train/xml/bw
   output_dir=$gale_data/mgb2
@@ -126,12 +120,35 @@ if [ ! -z $mgb2_dir ]; then
     mv $output_dir/mgb2 ${output_dir}/.backup
   fi
 
-  ls $mgb2_dir/train/wav/ | while read name; do
-    basename=`basename -s .wav $name`
-    [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
-    xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $output_dir $mer
-    echo $basename $db_dir/train/wav/$basename.wav >> $output_dir/wav.scp
-  done
+  if [ "$process_xml" == 'python' ]; then
+    echo "using python to process xml file"
+    # check if bs4 and lxml are installed in python
+    local/check_tools.sh || exit 1
+    ls $mgb2_dir/train/wav/ | while read name; do
+      basename=`basename -s .wav $name`
+      [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
+      local/process_xml.py $xmldir/$basename.xml - | local/add_to_datadir.py $basename $output_dir $mer
+      echo $basename $db_dir/train/wav/$basename.wav >> $output_dir/wav.scp
+    done
+  elif [ "$process_xml" == 'xml' ]; then
+    # check if xml binary exists
+    if command -v xml >/dev/null 2>/dev/null; then
+      echo "using xml"
+      ls $mgb2_dir/train/wav/ | while read name; do
+        basename=`basename -s .wav $name`
+        [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
+        xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $output_dir $mer
+        echo $basename $db_dir/train/wav/$basename.wav >> $output_dir/wav.scp
+      done
+    else
+      echo "xml not found, you may use python by '--process-xml python'"
+      exit 1;
+    fi
+  else
+    # invalid option
+    echo "$0: invalid option for --process-xml, choose from 'xml' or 'python'"
+    exit 1;
+  fi
 
   # add mgb2 data to training data (GALE/all and wav.scp)
   mv $gale_data/all $gale_data/all.gale
diff --git a/egs/gale_arabic/s5d/local/process_xml.py b/egs/gale_arabic/s5d/local/process_xml.py
new file mode 100755
index 00000000000..3c6eed452ac
--- /dev/null
+++ b/egs/gale_arabic/s5d/local/process_xml.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+"""Print one "who starttime endtime WMER text" line per aligned xml segment."""
+
+import argparse
+import sys
+
+from bs4 import BeautifulSoup
+
+
+def get_args():
+    """Parse the positional arguments: input xml file and output file."""
+    parser = argparse.ArgumentParser(description="""This script process xml file.""")
+    parser.add_argument("xml", type=str, help="""Input xml file""")
+    parser.add_argument("output", type=str, help="""output text file""")
+    args = parser.parse_args()
+    return args
+
+
+def process_xml(xml_handle, output_handle):
+    """Write the segments read from xml_handle to output_handle.
+
+    Only segments inside a "segments" element whose annotation_id is
+    "transcript_align" are written, which is the selection the xmlstarlet
+    command in the data preparation script makes.  Both handles are left
+    open; closing them is up to the caller.
+    """
+    soup = BeautifulSoup(xml_handle, "xml")
+    aligned = soup.find_all("segments", attrs={"annotation_id": "transcript_align"})
+    for segments in aligned:
+        for segment in segments.find_all("segment"):
+            who = segment["who"]
+            starttime = segment["starttime"]
+            endtime = segment["endtime"]
+            WMER = segment["WMER"]
+            words = [element.string for element in segment.find_all("element")
+                     if element.string is not None]
+            text = " ".join(words)
+            output_handle.write("{} {} {} {} {}\n".format(who, starttime, endtime, WMER, text))
+
+
+def main():
+    """Convert the xml file named on the command line ('-' writes to stdout)."""
+    args = get_args()
+
+    # binary mode, so the parser can use the encoding declared in the xml file
+    with open(args.xml, "rb") as xml_handle:
+        if args.output == "-":
+            # sys.stdout is not closed here; the interpreter flushes it at exit
+            process_xml(xml_handle, sys.stdout)
+        else:
+            with open(args.output, "w") as output_handle:
+                process_xml(xml_handle, output_handle)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/gale_arabic/s5d/run.sh b/egs/gale_arabic/s5d/run.sh
index 68d15c106dc..f8fdafe0a77 100755
--- a/egs/gale_arabic/s5d/run.sh
+++ b/egs/gale_arabic/s5d/run.sh
@@ -46,6 +46,9 @@ galeData=GALE
 mgb2_dir=""
 giga_dir=""
 
+# preference on how to process xml file (use xml binary or python)
+process_xml="python"
+
 run_rnnlm=false
 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.
@@ -64,7 +67,7 @@ if [ $stage -le 0 ]; then
 
   echo "$0: Preparing data..."
   options=""
-  [ ! -z $mgb2_dir ] && options="--mgb2-dir $mgb2_dir"
+  [ ! -z $mgb2_dir ] && options="--process-xml $process_xml --mgb2-dir $mgb2_dir"
   local/prepare_data.sh $options
 
   echo "$0: Preparing lexicon and LM..."
diff --git a/egs/mgb2_arabic/s5/local/check_tools.sh b/egs/mgb2_arabic/s5/local/check_tools.sh
new file mode 100755
index 00000000000..448a6536946
--- /dev/null
+++ b/egs/mgb2_arabic/s5/local/check_tools.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# check whether bs4 and lxml are installed
+if ! python3 -c "import bs4" 2>/dev/null; then
+  echo "$0: BeautifulSoup4 not installed, you can install it by 'pip install beautifulsoup4' if you prefer to use python to process xml file"
+  exit 1;
+fi
+
+if ! python3 -c "import lxml" 2>/dev/null; then
+  echo "$0: lxml not installed, you can install it by 'pip install lxml' if you prefer to use python to process xml file"
+  exit 1;
+fi
+
+echo "both BeautifulSoup4 and lxml are installed in python"
+exit 0
diff --git a/egs/mgb2_arabic/s5/local/mgb_data_prep.sh b/egs/mgb2_arabic/s5/local/mgb_data_prep.sh
index afd7f92b7cf..681894a9e29 100755
--- a/egs/mgb2_arabic/s5/local/mgb_data_prep.sh
+++ b/egs/mgb2_arabic/s5/local/mgb_data_prep.sh
@@ -4,8 +4,8 @@
 # 2016-2019 Vimal Manohar
 # 2019 Dongji Gao
 
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <DB-dir> <mer-sel>"
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <DB-dir> <mer-sel> <process-xml>"
   exit 1;
 fi
 
@@ -23,12 +23,6 @@ for x in $train_dir $dev_dir; do
   fi
 done
 
-if [ -z $(which xml) ]; then
-  echo "$0: Could not find tool xml"
-  echo "$0: Download and install it from xmlstar.sourceforge.net"
-  exit 1
-fi
-
 find $db_dir/train/wav -type f -name "*.wav" | \
   awk -F/ '{print $NF}' | perl -pe 's/\.wav//g' > \
   $train_dir/wav_list
@@ -39,11 +33,38 @@ head -500 $train_dir/wav_list > $train_dir/wav_list.short
 set -e -o pipefail
 
 xmldir=$db_dir/train/xml/bw
-cat $train_dir/wav_list | while read basename; do
+
+# how to process the xml files, 'python' or 'xml' (third positional argument)
+process_xml=$3
+
+if [ "$process_xml" == "python" ]; then
+  echo "using python to process xml file"
+  # check if bs4 and lxml are installed in python
+  local/check_tools.sh
+  # process xml file using python
+  cat $train_dir/wav_list | while read basename; do
   [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
-  xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $train_dir $mer
-  echo $basename $wavDir/$basename.wav >> $train_dir/wav.scp
-done
+    local/process_xml.py $xmldir/$basename.xml - | local/add_to_datadir.py $basename $train_dir $mer
+    echo $basename $wavDir/$basename.wav >> $train_dir/wav.scp
+  done
+elif [ "$process_xml" == 'xml' ]; then
+  # check if xml binary exists
+  if command -v xml >/dev/null 2>/dev/null; then
+    echo "using xml"
+    cat $train_dir/wav_list | while read basename; do
+      [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
+      xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $train_dir $mer
+      echo $basename $wavDir/$basename.wav >> $train_dir/wav.scp
+    done
+  else
+    echo "xml not found, you may use python by '--process-xml python'"
+    exit 1;
+  fi
+else
+  # invalid option
+  echo "$0: invalid option for --process-xml, choose from 'xml' or 'python'"
+  exit 1;
+fi
 
 for x in text segments; do
   cp $db_dir/dev/${x}.all $dev_dir/${x}
diff --git a/egs/mgb2_arabic/s5/local/process_xml.py b/egs/mgb2_arabic/s5/local/process_xml.py
new file mode 100755
index 00000000000..3c6eed452ac
--- /dev/null
+++ b/egs/mgb2_arabic/s5/local/process_xml.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+"""Print one "who starttime endtime WMER text" line per aligned xml segment."""
+
+import argparse
+import sys
+
+from bs4 import BeautifulSoup
+
+
+def get_args():
+    """Parse the positional arguments: input xml file and output file."""
+    parser = argparse.ArgumentParser(description="""This script process xml file.""")
+    parser.add_argument("xml", type=str, help="""Input xml file""")
+    parser.add_argument("output", type=str, help="""output text file""")
+    args = parser.parse_args()
+    return args
+
+
+def process_xml(xml_handle, output_handle):
+    """Write the segments read from xml_handle to output_handle.
+
+    Only segments inside a "segments" element whose annotation_id is
+    "transcript_align" are written, which is the selection the xmlstarlet
+    command in the data preparation script makes.  Both handles are left
+    open; closing them is up to the caller.
+    """
+    soup = BeautifulSoup(xml_handle, "xml")
+    aligned = soup.find_all("segments", attrs={"annotation_id": "transcript_align"})
+    for segments in aligned:
+        for segment in segments.find_all("segment"):
+            who = segment["who"]
+            starttime = segment["starttime"]
+            endtime = segment["endtime"]
+            WMER = segment["WMER"]
+            words = [element.string for element in segment.find_all("element")
+                     if element.string is not None]
+            text = " ".join(words)
+            output_handle.write("{} {} {} {} {}\n".format(who, starttime, endtime, WMER, text))
+
+
+def main():
+    """Convert the xml file named on the command line ('-' writes to stdout)."""
+    args = get_args()
+
+    # binary mode, so the parser can use the encoding declared in the xml file
+    with open(args.xml, "rb") as xml_handle:
+        if args.output == "-":
+            # sys.stdout is not closed here; the interpreter flushes it at exit
+            process_xml(xml_handle, sys.stdout)
+        else:
+            with open(args.output, "w") as output_handle:
+                process_xml(xml_handle, output_handle)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/mgb2_arabic/s5/run.sh b/egs/mgb2_arabic/s5/run.sh
index a763869903c..e4192c067b0 100755
--- a/egs/mgb2_arabic/s5/run.sh
+++ b/egs/mgb2_arabic/s5/run.sh
@@ -6,6 +6,9 @@
 
 stage=-1
 
+# preference on how to process xml file [python, xml]
+process_xml="python"
+
 . ./cmd.sh
 if [ -f ./path.sh ]; then . ./path.sh; fi
 . utils/parse_options.sh
@@ -50,7 +53,7 @@ fi
 if [ $stage -le 1 ]; then
   #DATA PREPARATION
   echo "Preparing training data"
-  local/mgb_data_prep.sh DB $mer
+  local/mgb_data_prep.sh DB $mer $process_xml
 fi
 
 if [ $stage -le 2 ]; then