Skip to content

Commit

Permalink
use GT from bags in assets instead of obsolete zip on webpage
Browse files Browse the repository at this point in the history
  • Loading branch information
bertsky committed Feb 20, 2020
1 parent c377fde commit f38420f
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 54 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "repo/assets"]
path = repo/assets
url = https://github.com/OCR-D/assets
16 changes: 12 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,16 @@ test: test/assets deps-test
$(PYTHON) -m pytest test $(PYTEST_ARGS)

# prepare test assets
test/assets:
# TODO: instead of this, use bag repos, or add something useful to OCR-D/assets
test/prepare_gt.bash $@
# Build the test/assets directory from the repo/assets checkout (the first
# prerequisite, referenced as $<): create the directory, clone the
# kant_aufklaerung_1784 workspace from its mets.xml into
# $@/kant_aufklaerung_1784, then run test/prepare_gt.bash on the directory.
test/assets: repo/assets
mkdir -p $@
ocrd workspace clone $</data/kant_aufklaerung_1784/data/mets.xml -a $@/kant_aufklaerung_1784
bash test/prepare_gt.bash $@

.PHONY: help deps deps-test install test
# Sync and initialise/update the repo/assets git submodule.
# The always-update prerequisite is declared .PHONY, so this recipe is re-run
# whenever repo/assets is needed, even if the directory already exists.
repo/assets: always-update
git submodule sync $@
git submodule update --init $@

# Remove the test/assets directory (recursively) and the model_dta_test.h5 file.
clean:
$(RM) -r test/assets model_dta_test.h5

.PHONY: help deps deps-test install test clean always-update
1 change: 1 addition & 0 deletions repo/assets
Submodule assets added at 923efb
51 changes: 2 additions & 49 deletions test/prepare_gt.bash
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ GT_FILES="kant_aufklaerung_1784 loeber_heuschrecken_1693"

trap "rm -fr '$TMP_DIR'" ERR

test ! -e "$1" # target path must not exist yet
test -d "$1" # target directory must already exist

test -d "$CACHE_DIR" || mkdir -p "$CACHE_DIR"

Expand All @@ -18,54 +18,7 @@ for GT_FILE in $GT_FILES; do
sed -e '/ /d;/^[[].*[]]$/d' < "$CACHE_DIR/$GT_FILE" > "$TMP_DIR/${GT_FILE}.txt"
done

cat <<EOF > "$TMP_DIR/page-extract-imagefilename.xsl"
<xsl:stylesheet
version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15">
<!-- unfortunately, this totally depends on the exact namespace string, i.e. version -->
<!-- rid of xml syntax: -->
<xsl:output
method="text"
standalone="yes"
omit-xml-declaration="yes"/>
<!-- get imageFilename attribute verbatim: -->
<xsl:template match="pc:PcGts/pc:Page">
<xsl:value-of select="@imageFilename" disable-output-escaping="yes"/>
<xsl:apply-templates/>
</xsl:template>
<!-- override implicit rules copying elements and attributes: -->
<xsl:template match="text()"/>
</xsl:stylesheet>
EOF

for GT_FILE in $GT_FILES; do
test -f "$CACHE_DIR/${GT_FILE}.zip" ||
wget -P "$CACHE_DIR" http://www.ocr-d.de/sites/all/GTDaten/${GT_FILE}.zip
mkdir -p "$TMP_DIR/$GT_FILE"
unzip -jod "$TMP_DIR/$GT_FILE/data" "$CACHE_DIR/${GT_FILE}.zip"
pushd "$TMP_DIR/$GT_FILE/data"
ocrd workspace init .
mkdir -p OCR-D-IMG OCR-D-GT-PAGE
ZEROS=0000
i=0
for PAGE_FILE in *.xml; do
test "x$PAGE_FILE" = xmets.xml && continue
i=$((i+1))
ID=${ZEROS:0:$((4-${#i}))}$i
IMG_FILE=$(xsltproc "$TMP_DIR/page-extract-imagefilename.xsl" "$PAGE_FILE")
test -f "$IMG_FILE"
mv "$IMG_FILE" OCR-D-IMG/
ocrd workspace add -G OCR-D-IMG -i OCR-D-IMG_$ID -g phys_$ID -m image/tiff "OCR-D-IMG/$IMG_FILE"
# workaround for OCR-D/core#176 (still true for ocrd v1.0.0)
sed -i -e 's|imageFilename="|imageFilename="OCR-D-IMG/|' "$PAGE_FILE"
mv "$PAGE_FILE" OCR-D-GT-PAGE/
ocrd workspace add -G OCR-D-GT-PAGE -i OCR-D-GT-PAGE_$ID -g phys_$ID -m application/vnd.prima.page+xml "OCR-D-GT-PAGE/$PAGE_FILE"
done
popd
done

mv "$TMP_DIR" "$1" # atomic
mv "$TMP_DIR"/*.txt "$1"



Expand Down
2 changes: 1 addition & 1 deletion test/test_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def setUp(self):

def runTest(self):
resolver = Resolver()
workspace = resolver.workspace_from_url('test/assets/kant_aufklaerung_1784/data/mets.xml',
workspace = resolver.workspace_from_url('test/assets/kant_aufklaerung_1784/mets.xml',
dst_dir=WORKSPACE_DIR, download=True)
self.assertIsNotNone(workspace)
#
Expand Down

0 comments on commit f38420f

Please sign in to comment.