From f38420fe627e17bce9e902a4204417ecfc62d08a Mon Sep 17 00:00:00 2001 From: bertsky Date: Mon, 17 Feb 2020 01:11:47 +0100 Subject: [PATCH] use GT from bags in assets instead of obsolete zip on webpage --- .gitmodules | 3 +++ Makefile | 16 ++++++++++---- repo/assets | 1 + test/prepare_gt.bash | 51 ++------------------------------------------ test/test_wrapper.py | 2 +- 5 files changed, 19 insertions(+), 54 deletions(-) create mode 100644 .gitmodules create mode 160000 repo/assets diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5b24fbb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "repo/assets"] + path = repo/assets + url = https://github.com/OCR-D/assets diff --git a/Makefile b/Makefile index e0ca12c..cc4e1ec 100644 --- a/Makefile +++ b/Makefile @@ -38,8 +38,16 @@ test: test/assets deps-test $(PYTHON) -m pytest test $(PYTEST_ARGS) # prepare test assets -test/assets: - # TODO: instead of this, use bag repos, or add something useful to OCR-D/assets - test/prepare_gt.bash $@ +test/assets: repo/assets + mkdir -p $@ + ocrd workspace clone $ "$TMP_DIR/${GT_FILE}.txt" done -cat < "$TMP_DIR/page-extract-imagefilename.xsl" - - - - - - - - - - - - -EOF - -for GT_FILE in $GT_FILES; do - test -f "$CACHE_DIR/${GT_FILE}.zip" || - wget -P "$CACHE_DIR" http://www.ocr-d.de/sites/all/GTDaten/${GT_FILE}.zip - mkdir -p "$TMP_DIR/$GT_FILE" - unzip -jod "$TMP_DIR/$GT_FILE/data" "$CACHE_DIR/${GT_FILE}.zip" - pushd "$TMP_DIR/$GT_FILE/data" - ocrd workspace init . - mkdir -p OCR-D-IMG OCR-D-GT-PAGE - ZEROS=0000 - i=0 - for PAGE_FILE in *.xml; do - test "x$PAGE_FILE" = xmets.xml && continue - i=$((i+1)) - ID=${ZEROS:0:$((4-${#i}))}$i - IMG_FILE=$(xsltproc "$TMP_DIR/page-extract-imagefilename.xsl" "$PAGE_FILE") - test -f "$IMG_FILE" - mv "$IMG_FILE" OCR-D-IMG/ - ocrd workspace add -G OCR-D-IMG -i OCR-D-IMG_$ID -g phys_$ID -m image/tiff "OCR-D-IMG/$IMG_FILE" - # workaround for OCR-D/core#176 (still true for ocrd v1.0.0) - sed -i -e 's|imageFilename="|imageFilename="OCR-D-IMG/|' "$PAGE_FILE" - mv "$PAGE_FILE" OCR-D-GT-PAGE/ - ocrd workspace add -G OCR-D-GT-PAGE -i OCR-D-GT-PAGE_$ID -g phys_$ID -m application/vnd.prima.page+xml "OCR-D-GT-PAGE/$PAGE_FILE" - done - popd -done - -mv "$TMP_DIR" "$1" # atomic +mv "$TMP_DIR"/*.txt "$1" diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 4bd9922..1dcc811 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -21,7 +21,7 @@ def setUp(self): def runTest(self): resolver = Resolver() - workspace = resolver.workspace_from_url('test/assets/kant_aufklaerung_1784/data/mets.xml', + workspace = resolver.workspace_from_url('test/assets/kant_aufklaerung_1784/mets.xml', dst_dir=WORKSPACE_DIR, download=True) self.assertIsNotNone(workspace) #