Skip to content

Commit

Permalink
[egs] gale_arabic: add python script to process xml file (#3886)
Browse files Browse the repository at this point in the history
  • Loading branch information
DongjiGao authored Jan 29, 2020
1 parent 1121c31 commit 5882dc5
Show file tree
Hide file tree
Showing 8 changed files with 166 additions and 27 deletions.
15 changes: 15 additions & 0 deletions egs/gale_arabic/s5d/local/check_tools.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

# Check whether the python3 modules bs4 (BeautifulSoup4) and lxml are
# importable. These are what the python way of processing the xml files needs.
#
# Exit status: 0 when both imports succeed, 1 otherwise (an installation hint
# is printed to stderr in that case).

# Without an interpreter both probes below would fail and wrongly blame the
# packages, so report that situation separately.
if ! command -v python3 >/dev/null 2>&1; then
  echo "$0: python3 not found, it is required if you prefer to use python to process xml file" >&2
  exit 1
fi

# Only the exit status of each probe matters; the traceback is discarded.
if ! python3 -c "import bs4" 2>/dev/null; then
  echo "$0: BeautifulSoup4 not installed, you can install it by 'pip install beautifulsoup4' if you prefer to use python to process xml file" >&2
  exit 1
fi

if ! python3 -c "import lxml" 2>/dev/null; then
  echo "$0: lxml not installed, you can install it by 'pip install lxml' if you prefer to use python to process xml file" >&2
  exit 1
fi

echo "both BeautifulSoup4 and lxml are installed in python"
exit 0
43 changes: 30 additions & 13 deletions egs/gale_arabic/s5d/local/prepare_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dir10=/export/corpora/LDC/LDC2018S05/
text10=/export/corpora/LDC/LDC2018T14/

mgb2_dir=""
process_xml=""
mer=80

. ./utils/parse_options.sh
Expand Down Expand Up @@ -108,13 +109,6 @@ cd $top_pwd
# prepare MGB2 data
if [ ! -z $mgb2_dir ]; then
echo "preparing MGB2 data"
# check xml
if [ -z $(which xml) ]; then
echo "$0: Could not find tool xml"
echo "$0: To use MGB2 you must have xml installed"
echo "$0: Download and install it from xmlstar.sourceforge.net"
exit 1
fi

xmldir=$mgb2_dir/train/xml/bw
output_dir=$gale_data/mgb2
Expand All @@ -126,12 +120,35 @@ if [ ! -z $mgb2_dir ]; then
mv $output_dir/mgb2 ${output_dir}/.backup
fi

ls $mgb2_dir/train/wav/ | while read name; do
basename=`basename -s .wav $name`
[ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $output_dir $mer
echo $basename $db_dir/train/wav/$basename.wav >> $output_dir/wav.scp
done
# Convert every MGB2 xml transcript into entries of $output_dir.
# $process_xml selects the extractor: 'python' (local/process_xml.py) or
# 'xml' (the xml binary). Any other value, including empty, is rejected.
# NOTE(review): wav.scp entries are built from $db_dir while the wav files are
# listed from $mgb2_dir -- confirm $db_dir is set to the intended location.
if [ "$process_xml" == 'python' ]; then
  echo "using python to process xml file"
  # check if bs4 and lxml are installed in python; stop if they are not
  local/check_tools.sh || exit 1
  ls "$mgb2_dir/train/wav/" | while read -r name; do
    basename=$(basename -s .wav "$name")
    [ ! -e "$xmldir/$basename.xml" ] && echo "Missing $xmldir/$basename.xml" && exit 1
    # write into $output_dir, the same directory the xml branch and wav.scp use
    local/process_xml.py "$xmldir/$basename.xml" - | local/add_to_datadir.py "$basename" "$output_dir" "$mer"
    echo "$basename" "$db_dir/train/wav/$basename.wav" >> "$output_dir/wav.scp"
  done || exit 1  # the loop runs in a subshell: propagate its 'exit 1'
elif [ "$process_xml" == 'xml' ]; then
  # check if xml binary exists
  if command -v xml >/dev/null 2>&1; then
    echo "using xml"
    ls "$mgb2_dir/train/wav/" | while read -r name; do
      basename=$(basename -s .wav "$name")
      [ ! -e "$xmldir/$basename.xml" ] && echo "Missing $xmldir/$basename.xml" && exit 1
      xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" "$xmldir/$basename.xml" | local/add_to_datadir.py "$basename" "$output_dir" "$mer"
      echo "$basename" "$db_dir/train/wav/$basename.wav" >> "$output_dir/wav.scp"
    done || exit 1  # the loop runs in a subshell: propagate its 'exit 1'
  else
    echo "xml not found, you may use python by '--process-xml python'"
    exit 1;
  fi
else
  # invalid option
  echo "$0: invalid option for --process-xml, choose from 'xml' or 'python'"
  exit 1;
fi

# add mgb2 data to training data (GALE/all and wav.scp)
mv $gale_data/all $gale_data/all.gale
Expand Down
35 changes: 35 additions & 0 deletions egs/gale_arabic/s5d/local/process_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python3

from bs4 import BeautifulSoup
import sys
import argparse

def get_args():
    """Parse the command line of this script.

    Returns:
        argparse.Namespace with two string attributes, ``xml`` (the input xml
        file) and ``output`` (the output text file).
    """
    cli = argparse.ArgumentParser(description="This script process xml file.")
    positionals = (
        ("xml", "Input xml file"),
        ("output", "output text file"),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, type=str, help=arg_help)
    return cli.parse_args()

def process_xml(xml_handle, output_handle):
    """Write one text line for every <segment> tag of an xml document.

    Each line has the form "who starttime endtime WMER text". The first four
    fields are the segment attributes of the same name; ``text`` is the
    space-joined ``.string`` of the segment's <element> tags, skipping those
    whose ``.string`` is None.

    Args:
        xml_handle: open handle on the xml document. Always closed before
            returning, also when parsing or writing fails.
        output_handle: writable text handle. Closed before returning unless
            it is ``sys.stdout``, which is only flushed so the caller's
            standard output stays usable.
    """
    try:
        soup = BeautifulSoup(xml_handle, "xml")
        # NOTE(review): the xml-binary command used by the calling recipe only
        # visits segments below <segments annotation_id="transcript_align">,
        # while this loop visits every <segment> -- confirm that the inputs
        # carry no other segment lists.
        for segment in soup.find_all("segment"):
            who = segment["who"]
            starttime = segment["starttime"]
            endtime = segment["endtime"]
            WMER = segment["WMER"]
            words = [
                element.string
                for element in segment.find_all("element")
                if element.string is not None
            ]
            output_handle.write(
                "{} {} {} {} {}\n".format(who, starttime, endtime, WMER, " ".join(words))
            )
    finally:
        xml_handle.close()
        if output_handle is sys.stdout:
            output_handle.flush()
        else:
            output_handle.close()

def main():
    """Script entry point.

    Opens the xml file named by the first positional argument for reading and
    the second positional argument for writing ('-' selects sys.stdout), then
    hands both handles to process_xml.
    """
    cli = get_args()

    source = open(cli.xml, 'r')
    if cli.output == '-':
        sink = sys.stdout
    else:
        sink = open(cli.output, 'w')

    process_xml(source, sink)


if __name__ == "__main__":
    main()
5 changes: 4 additions & 1 deletion egs/gale_arabic/s5d/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ galeData=GALE
mgb2_dir=""
giga_dir=""

# preference on how to process xml file (use xml binary or python)
process_xml=""

run_rnnlm=false
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
Expand All @@ -64,7 +67,7 @@ if [ $stage -le 0 ]; then
echo "$0: Preparing data..."

# Extra options are passed only when MGB2 data is requested. The xml
# processing method is taken from $process_xml and falls back to python when
# that variable is empty.
options=""
if [ -n "$mgb2_dir" ]; then
  options="--process-xml ${process_xml:-python} --mgb2-dir $mgb2_dir"
fi
# $options is deliberately unquoted so that it splits into separate arguments.
local/prepare_data.sh $options

echo "$0: Preparing lexicon and LM..."
Expand Down
15 changes: 15 additions & 0 deletions egs/mgb2_arabic/s5/local/check_tools.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

# Check whether the python3 modules bs4 (BeautifulSoup4) and lxml are
# importable. These are what the python way of processing the xml files needs.
#
# Exit status: 0 when both imports succeed, 1 otherwise (an installation hint
# is printed to stderr in that case).

# Without an interpreter both probes below would fail and wrongly blame the
# packages, so report that situation separately.
if ! command -v python3 >/dev/null 2>&1; then
  echo "$0: python3 not found, it is required if you prefer to use python to process xml file" >&2
  exit 1
fi

# Only the exit status of each probe matters; the traceback is discarded.
if ! python3 -c "import bs4" 2>/dev/null; then
  echo "$0: BeautifulSoup4 not installed, you can install it by 'pip install beautifulsoup4' if you prefer to use python to process xml file" >&2
  exit 1
fi

if ! python3 -c "import lxml" 2>/dev/null; then
  echo "$0: lxml not installed, you can install it by 'pip install lxml' if you prefer to use python to process xml file" >&2
  exit 1
fi

echo "both BeautifulSoup4 and lxml are installed in python"
exit 0
40 changes: 28 additions & 12 deletions egs/mgb2_arabic/s5/local/mgb_data_prep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
# 2016-2019 Vimal Manohar
# 2019 Dongji Gao

if [ $# -ne 2 ]; then
echo "Usage: $0 <DB-dir> <mer-sel>"
if [ $# -ne 3 ]; then
echo "Usage: $0 <DB-dir> <mer-sel> <process-xml>"
exit 1;
fi

Expand All @@ -23,12 +23,6 @@ for x in $train_dir $dev_dir; do
fi
done

if [ -z $(which xml) ]; then
echo "$0: Could not find tool xml"
echo "$0: Download and install it from xmlstar.sourceforge.net"
exit 1
fi

find $db_dir/train/wav -type f -name "*.wav" | \
awk -F/ '{print $NF}' | perl -pe 's/\.wav//g' > \
$train_dir/wav_list
Expand All @@ -39,11 +33,33 @@ head -500 $train_dir/wav_list > $train_dir/wav_list.short
set -e -o pipefail

xmldir=$db_dir/train/xml/bw
cat $train_dir/wav_list | while read basename; do
# Fill $train_dir from the per-recording xml transcripts listed in
# $train_dir/wav_list, using the extractor selected by $process_xml
# ('python' or 'xml'); any other value prints an error and exits 1.
# NOTE(review): the 'python' branch below also runs "xml sel" inside its loop
# and carries a second "done" after local/process_xml.py. This looks like old
# and new lines of a change shown together -- confirm against the real script.
if [ $process_xml == "python" ]; then
echo "using python to process xml file"
# check if bs4 and lxml are installed in python
local/check_tools.sh
# process xml file using python
cat $train_dir/wav_list | while read basename; do
[ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $train_dir $mer
echo $basename $wavDir/$basename.wav >> $train_dir/wav.scp
done
local/process_xml.py $xmldir/$basename.xml - | local/add_to_datadir.py $basename $train_dir $mer
done
elif [ $process_xml == 'xml' ]; then
# check if xml binary exists
if command -v xml >/dev/null 2>/dev/null; then
echo "using xml"
cat $train_dir/wav_list | while read basename; do
[ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $train_dir $mer
echo $basename $wavDir/$basename.wav >> $train_dir/wav.scp
done
else
echo "xml not found, you may use python by '--process-xml python'"
exit 1;
fi
else
# invalid option
echo "$0: invalid option for --process-xml, choose from 'xml' or 'python'"
exit 1;
fi

for x in text segments; do
cp $db_dir/dev/${x}.all $dev_dir/${x}
Expand Down
35 changes: 35 additions & 0 deletions egs/mgb2_arabic/s5/local/process_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python3

from bs4 import BeautifulSoup
import sys
import argparse

def get_args():
    """Parse the command line of this script.

    Returns:
        argparse.Namespace with two string attributes, ``xml`` (the input xml
        file) and ``output`` (the output text file).
    """
    cli = argparse.ArgumentParser(description="This script process xml file.")
    positionals = (
        ("xml", "Input xml file"),
        ("output", "output text file"),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, type=str, help=arg_help)
    return cli.parse_args()

def process_xml(xml_handle, output_handle):
    """Write one text line for every <segment> tag of an xml document.

    Each line has the form "who starttime endtime WMER text". The first four
    fields are the segment attributes of the same name; ``text`` is the
    space-joined ``.string`` of the segment's <element> tags, skipping those
    whose ``.string`` is None.

    Args:
        xml_handle: open handle on the xml document. Always closed before
            returning, also when parsing or writing fails.
        output_handle: writable text handle. Closed before returning unless
            it is ``sys.stdout``, which is only flushed so the caller's
            standard output stays usable.
    """
    try:
        soup = BeautifulSoup(xml_handle, "xml")
        # NOTE(review): the xml-binary command used by the calling recipe only
        # visits segments below <segments annotation_id="transcript_align">,
        # while this loop visits every <segment> -- confirm that the inputs
        # carry no other segment lists.
        for segment in soup.find_all("segment"):
            who = segment["who"]
            starttime = segment["starttime"]
            endtime = segment["endtime"]
            WMER = segment["WMER"]
            words = [
                element.string
                for element in segment.find_all("element")
                if element.string is not None
            ]
            output_handle.write(
                "{} {} {} {} {}\n".format(who, starttime, endtime, WMER, " ".join(words))
            )
    finally:
        xml_handle.close()
        if output_handle is sys.stdout:
            output_handle.flush()
        else:
            output_handle.close()

def main():
    """Script entry point.

    Opens the xml file named by the first positional argument for reading and
    the second positional argument for writing ('-' selects sys.stdout), then
    hands both handles to process_xml.
    """
    cli = get_args()

    source = open(cli.xml, 'r')
    if cli.output == '-':
        sink = sys.stdout
    else:
        sink = open(cli.output, 'w')

    process_xml(source, sink)


if __name__ == "__main__":
    main()
5 changes: 4 additions & 1 deletion egs/mgb2_arabic/s5/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

stage=-1

# preference on how to process xml file [python, xml]
process_xml="python"

. ./cmd.sh
if [ -f ./path.sh ]; then . ./path.sh; fi
. utils/parse_options.sh
Expand Down Expand Up @@ -50,7 +53,7 @@ fi
if [ "$stage" -le 1 ]; then
  #DATA PREPARATION
  echo "Preparing training data"
  # local/mgb_data_prep.sh takes exactly three arguments
  # (<DB-dir> <mer-sel> <process-xml>); a two-argument call only prints its
  # usage and fails, so the script is invoked once with all three.
  local/mgb_data_prep.sh DB "$mer" "$process_xml" || exit 1
fi

if [ $stage -le 2 ]; then
Expand Down

0 comments on commit 5882dc5

Please sign in to comment.