diff --git a/.gitignore b/.gitignore index 60fb0416a..295e976e4 100644 --- a/.gitignore +++ b/.gitignore @@ -133,6 +133,7 @@ venv cert* *.bk .ruby-version +.vscode # Sphinx documentation docs/_build/ diff --git a/.pylintrc b/.pylintrc index c158d2b8d..505260c8f 100644 --- a/.pylintrc +++ b/.pylintrc @@ -3,7 +3,7 @@ # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code -extension-pkg-whitelist= +extension-pkg-whitelist=lxml # Add files or directories to the blacklist. They should be base names, not # paths. diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 841ee4132..1a6cef5ae 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,14 @@ CHANGELOG ========= + +Release 2.2.0 +--------------------- +* Stream ingest uploads to S3 +* Adds status records for ingest tasks +* Adds bulk ingest +* Adds email notifications for ingest success and failure + Release 2.1.1 --------------------- * Fixes migration conflicts diff --git a/apps/__init__.py b/apps/__init__.py index 055908fac..c88b4e185 100644 --- a/apps/__init__.py +++ b/apps/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.0.0" +__version__ = "2.2.0" __version_info__ = tuple( [ int(num) if num.isdigit() else num diff --git a/apps/cms/wagtail_hooks.py b/apps/cms/wagtail_hooks.py index 8b423953c..c227923c0 100644 --- a/apps/cms/wagtail_hooks.py +++ b/apps/cms/wagtail_hooks.py @@ -1,5 +1,5 @@ """Add custom .css hook""" -from django.contrib.staticfiles.templatetags.staticfiles import static +from django.templatetags.static import static from django.utils.html import format_html from wagtail.core import hooks diff --git a/apps/iiif/annotations/models.py b/apps/iiif/annotations/models.py index da29e442d..9d9a6c465 100644 --- a/apps/iiif/annotations/models.py +++ b/apps/iiif/annotations/models.py @@ -5,7 +5,7 @@ from django.db.models import signals from django.core.exceptions import ValidationError from 
django.dispatch import receiver -from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import gettext as _ from django.contrib.auth import get_user_model from abc import abstractmethod from bs4 import BeautifulSoup diff --git a/apps/iiif/canvases/fixtures/alto.xml b/apps/iiif/canvases/fixtures/alto.xml old mode 100644 new mode 100755 index 13e26bf3c..482eddd0c --- a/apps/iiif/canvases/fixtures/alto.xml +++ b/apps/iiif/canvases/fixtures/alto.xml @@ -1,132 +1,41 @@ - - - - - Af-beeldinghe van d'eerste eeuwe der Societeyt Iesu voor ooghen ghestelt door de Duyts-Nederlantsche provincie der seluer societeyt., p. 10 - - - Emory University Library and Information Technology Services - - -

Abbyy file derived from OCR of Bolland, Johannes, 1596-1665, Henschenius, Godefridus, 1601-1681, Tollenaere, Jean de, 1582-1643, Poirters, Adrien, 1605-1674, Galle, Cornelis, 1576-1650,, Natalis, Michel, 1610-1668,, Diepenbeeck, Abraham van, 1596-1675,, Plantijnsche Drukkerij. Af-beeldinghe van d'eerste eeuwe der Societeyt Iesu voor ooghen ghestelt door de Duyts-Nederlantsche provincie der seluer societeyt., ['1640'].

-
-
-
- - - - - - mm - - - - - AEN DEN LESIIU - - - tnaken, om te fchijnen bouen alle andere te kracycn, en die te mer- - - - drucken ? - - - Of ty dit ergbens in'tbeleydt man dit heel ftuck^, met de minfee - - - merfmacdehjckheyt man eenighe andere Orden oft Religte, ghedaen - - - hebben, datftellen ley ten oordeele manden onpartijdighen Lepr; - - - den Tvelcken bier minden fal d'af-beeldingbe mande eerfie eeulve on- - - - fer S octet eyt, die "toy met on fen H. Vader gbeerne kennen de laetfle - - - en de minfee te %jjn, onderfoo mele oude ende treffelijcke Or dens Van - - - S.Augufinus, Beneditlus, Bernardus, Norbertus ,Domimcwi, Fran- - - - cifciis, ende meer andere , die met mcerdere mrucht en glorie inde - - - H.Kercke merkeert hebben. 'tis defen gheoorloft ghelveeil 'tvocdt - - - gberucbt,dathen naeghingb,en noch heden-fdaeghs molght, als eenen - - - toet-feen man bun innerhjcl^ yvefen, aende "Svereldt, nu mondeltjck^ - - - inde predtkatien, nu fchrtftelijck_ inde gbedruckfe boecken, moor oo- - - - ghen te ftellen, om daer aen het goudt manden ijuer en liefde te keu- - - - ren, met de loelckefy de glorie Godts en des naefeenfaligbeyt, neffens - - - hunne eygbene molmaecktheyt ghetracht hebben te moorderen. Soa - - - *n magb het ons dan oock_noch tot blaeme noch tot phande ghedijen, - - - dat Ivy onfe meeder de Socteteyt, die ons iuffchen feo meel drucks - - - ■en lijdens, foo mele opmallen ende ouerlafeen , feo mele merVolghin- - - - gben en martehenfihterals eene nae-vrucht op'teynde der Tvereldt, - - - aen de H.Kercke ghebaert heeft, met eene lof-rijeke danckbaerheyt - - - oppellen: te mm0ds "dry d'eere ende de glorie man alle haere daden aen - - - Godt den Hecre a/leen, en met aen oris feluen, toe en eyghenen. - - - Daerotn pet ghy de Socteteyt m't moor- bladt man dit Boeck. 
in - - - pnnte gbeflelt met d'ooghen opTvaerts ten bemel gbeflaghen, tvaerfy - - - met een' oprechte meymngbe Tvederom benen phickt, al datfe man - - - daer ontfangben heeft, als ofse op alles loaer medefy biergheprefen en - - - *verciert "ioordt, met een' ingbekeertbeyt en Tveer-flagh des herten, - - - (lommelingb andnvoordde, datfe allefftns moor heeft, Tot meerdere - - - eere ende glorie Godts. Inde rechte handt houdtfe onfe Conftitu- - - - tien ende Regbelen; indeflmcke op eenen dry-meet bet kruya met de - - - bernende ~totrcldt} in de Tpelcke den mierighen ijuer Van S.Ignatius, - - - Xaue- - - - - -
\ No newline at end of file + + + + pixel + + ./P100.tif + + + + + tesseract 4.0.0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apps/iiif/canvases/fixtures/bad_hocr.hocr b/apps/iiif/canvases/fixtures/bad_hocr.hocr new file mode 100755 index 000000000..995ee0d83 --- /dev/null +++ b/apps/iiif/canvases/fixtures/bad_hocr.hocr @@ -0,0 +1,33 @@ + + + + + + + + + + +
+
+

+ + MAGNA + CAMPI + MARTII + + + + ICHNOGRAPHIA + DESCRIPTA + SV + NT + +

+
+
+
+
+ + diff --git a/apps/iiif/canvases/fixtures/bad_alto.xml b/apps/iiif/canvases/fixtures/bad_tei.xml similarity index 100% rename from apps/iiif/canvases/fixtures/bad_alto.xml rename to apps/iiif/canvases/fixtures/bad_tei.xml diff --git a/apps/iiif/canvases/fixtures/hocr.hocr b/apps/iiif/canvases/fixtures/hocr.hocr new file mode 100755 index 000000000..d04d8fb6c --- /dev/null +++ b/apps/iiif/canvases/fixtures/hocr.hocr @@ -0,0 +1,33 @@ + + + + + + + + + + +
+
+

+ + MAGNA + CAMPI + MARTII + + + + ICHNOGRAPHIA + DESCRIPTA + SV + NT + +

+
+
+
+
+ + diff --git a/apps/iiif/canvases/fixtures/hops.xml b/apps/iiif/canvases/fixtures/hops.xml new file mode 100644 index 000000000..e5d5b8f51 --- /dev/null +++ b/apps/iiif/canvases/fixtures/hops.xml @@ -0,0 +1,105 @@ + + + + + Cascade + 1 + US + 5.50 + 0.0000000 + Boil + + Use For: American ales and lagers +Aroma: Strong spicy, floral, grapefriut character +Substitutes: Centennial +Examples: Sierra Nevade Pale Ale, Anchor Liberty Ale +A hops with Northern Brewers Heritage + Both +
Pellet
+ 6.00 + 50.0 + 0.00 oz + 0.00 oz + - +
+ + Galena + 1 + US + 13.00 + 0.0000000 + Boil + + Use for: General bittering hops for all beers +Aroma: Strong, clean, balanced bittering +Substitute: Eroica, Northern Brewer, Cluster, Chinook +Examples: Catamount Porter + Bittering +
Pellet
+ 7.50 + 15.0 + 0.00 oz + 0.00 oz + - +
+ + Goldings, B.C. + 1 + Canada + 5.00 + 0.0000000 + Boil + + Used for: Bittering and finishing British ales, bitters, porters and stouts. +Aroma: Spicy, floral, rounded mild aroma. +Substitutes: East Kent Goldings, Fuggles + Aroma +
Pellet
+ 3.20 + 40.0 + 0.00 oz + 0.00 oz + - +
+ + Northern Brewer + 1 + Germany + 8.50 + 0.0000000 + Boil + + Also called Hallertauer Northern Brewers +Use for: Bittering and finishing both ales and lagers of all kinds +Aroma: Fine, dry, clean bittering hop. Unique flavor. +Substitute: Hallertauer Mittelfrueh, Hallertauer +Examples: Anchor Steam, Old Peculiar, + Both +
Pellet
+ 4.00 + 35.0 + 0.00 oz + 0.00 oz + - +
+ + Tettnang + 1 + Germany + 4.50 + 0.0000000 + Boil + + Use for: German ales, lagers and wheat beer +Aroma: Noble, mild, fine, slightly spicy +Substitutes: Saaz, Spalt +Examples: Sam Adams Octoberfest, Anderson Valley ESB + Aroma +
Pellet
+ 3.50 + 40.0 + 0.00 oz + 0.00 oz + - +
+
diff --git a/apps/iiif/canvases/fixtures/tei.xml b/apps/iiif/canvases/fixtures/tei.xml new file mode 100644 index 000000000..13e26bf3c --- /dev/null +++ b/apps/iiif/canvases/fixtures/tei.xml @@ -0,0 +1,132 @@ + + + + + Af-beeldinghe van d'eerste eeuwe der Societeyt Iesu voor ooghen ghestelt door de Duyts-Nederlantsche provincie der seluer societeyt., p. 10 + + + Emory University Library and Information Technology Services + + +

Abbyy file derived from OCR of Bolland, Johannes, 1596-1665, Henschenius, Godefridus, 1601-1681, Tollenaere, Jean de, 1582-1643, Poirters, Adrien, 1605-1674, Galle, Cornelis, 1576-1650,, Natalis, Michel, 1610-1668,, Diepenbeeck, Abraham van, 1596-1675,, Plantijnsche Drukkerij. Af-beeldinghe van d'eerste eeuwe der Societeyt Iesu voor ooghen ghestelt door de Duyts-Nederlantsche provincie der seluer societeyt., ['1640'].

+
+
+
+ + + + + + mm + + + + + AEN DEN LESIIU + + + tnaken, om te fchijnen bouen alle andere te kracycn, en die te mer- + + + drucken ? + + + Of ty dit ergbens in'tbeleydt man dit heel ftuck^, met de minfee + + + merfmacdehjckheyt man eenighe andere Orden oft Religte, ghedaen + + + hebben, datftellen ley ten oordeele manden onpartijdighen Lepr; + + + den Tvelcken bier minden fal d'af-beeldingbe mande eerfie eeulve on- + + + fer S octet eyt, die "toy met on fen H. Vader gbeerne kennen de laetfle + + + en de minfee te %jjn, onderfoo mele oude ende treffelijcke Or dens Van + + + S.Augufinus, Beneditlus, Bernardus, Norbertus ,Domimcwi, Fran- + + + cifciis, ende meer andere , die met mcerdere mrucht en glorie inde + + + H.Kercke merkeert hebben. 'tis defen gheoorloft ghelveeil 'tvocdt + + + gberucbt,dathen naeghingb,en noch heden-fdaeghs molght, als eenen + + + toet-feen man bun innerhjcl^ yvefen, aende "Svereldt, nu mondeltjck^ + + + inde predtkatien, nu fchrtftelijck_ inde gbedruckfe boecken, moor oo- + + + ghen te ftellen, om daer aen het goudt manden ijuer en liefde te keu- + + + ren, met de loelckefy de glorie Godts en des naefeenfaligbeyt, neffens + + + hunne eygbene molmaecktheyt ghetracht hebben te moorderen. Soa + + + *n magb het ons dan oock_noch tot blaeme noch tot phande ghedijen, + + + dat Ivy onfe meeder de Socteteyt, die ons iuffchen feo meel drucks + + + ■en lijdens, foo mele opmallen ende ouerlafeen , feo mele merVolghin- + + + gben en martehenfihterals eene nae-vrucht op'teynde der Tvereldt, + + + aen de H.Kercke ghebaert heeft, met eene lof-rijeke danckbaerheyt + + + oppellen: te mm0ds "dry d'eere ende de glorie man alle haere daden aen + + + Godt den Hecre a/leen, en met aen oris feluen, toe en eyghenen. + + + Daerotn pet ghy de Socteteyt m't moor- bladt man dit Boeck. 
in + + + pnnte gbeflelt met d'ooghen opTvaerts ten bemel gbeflaghen, tvaerfy + + + met een' oprechte meymngbe Tvederom benen phickt, al datfe man + + + daer ontfangben heeft, als ofse op alles loaer medefy biergheprefen en + + + *verciert "ioordt, met een' ingbekeertbeyt en Tveer-flagh des herten, + + + (lommelingb andnvoordde, datfe allefftns moor heeft, Tot meerdere + + + eere ende glorie Godts. Inde rechte handt houdtfe onfe Conftitu- + + + tien ende Regbelen; indeflmcke op eenen dry-meet bet kruya met de + + + bernende ~totrcldt} in de Tpelcke den mierighen ijuer Van S.Ignatius, + + + Xaue- + + + + +
\ No newline at end of file diff --git a/apps/iiif/canvases/management/commands/rebuild_ocr.py b/apps/iiif/canvases/management/commands/rebuild_ocr.py index a07212328..b345470f4 100644 --- a/apps/iiif/canvases/management/commands/rebuild_ocr.py +++ b/apps/iiif/canvases/management/commands/rebuild_ocr.py @@ -91,8 +91,6 @@ def __rebuild(self, canvas, testing=False): self.stdout.write('Adding OCR for canvas {c}'.format(c=canvas.pid)) with Bar('Processing', max=len(ocr)) as prog_bar: for word in ocr: - if word['content'] == 'Dope': - print(word) if ( word == '' or 'content' not in word or @@ -111,9 +109,6 @@ def __rebuild(self, canvas, testing=False): canvas=canvas ) except Annotation.DoesNotExist: - if word['content'] == 'Dope': - for _ in range(10): - print('DANG') anno = Annotation( w=word['w'], h=word['h'], diff --git a/apps/iiif/canvases/migrations/0006_canvas_ocr_file_path.py b/apps/iiif/canvases/migrations/0006_canvas_ocr_file_path.py index af5d9ba2d..f5631b2a7 100644 --- a/apps/iiif/canvases/migrations/0006_canvas_ocr_file_path.py +++ b/apps/iiif/canvases/migrations/0006_canvas_ocr_file_path.py @@ -15,4 +15,11 @@ class Migration(migrations.Migration): name='ocr_file_path', field=models.FilePathField(allow_folders=True, blank=True, null=True, path='/tmp', recursive=True), ), + migrations.RemoveField( + model_name='canvas', + name='IIIF_IMAGE_SERVER_BASE', + ), + migrations.DeleteModel( + name='IServer', + ), ] diff --git a/apps/iiif/canvases/migrations/0010_auto_20210928_1301.py b/apps/iiif/canvases/migrations/0010_auto_20210928_1301.py new file mode 100644 index 000000000..11e5c0e30 --- /dev/null +++ b/apps/iiif/canvases/migrations/0010_auto_20210928_1301.py @@ -0,0 +1,31 @@ +# Generated by Django 2.2.23 on 2021-09-28 13:01 + +from django.db import migrations, models +import django.db.models.deletion +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0021_auto_20210928_1301'), + ('canvases', 
'0009_auto_20210819_1508'), + ] + + operations = [ + migrations.AddField( + model_name='canvas', + name='image_server', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, to='manifests.ImageServer'), + ), + migrations.AlterField( + model_name='canvas', + name='id', + field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name='canvas', + name='pid', + field=models.CharField(default='2qkkkqds', help_text="Unique ID. Do not use -'s or spaces in the pid.", max_length=255), + ) + ] diff --git a/apps/iiif/canvases/migrations/0011_auto_20211007_2031.py b/apps/iiif/canvases/migrations/0011_auto_20211007_2031.py new file mode 100644 index 000000000..56d66f2fe --- /dev/null +++ b/apps/iiif/canvases/migrations/0011_auto_20211007_2031.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.24 on 2021-10-07 20:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('canvases', '0010_auto_20210928_1301'), + ] + + operations = [ + migrations.AlterField( + model_name='canvas', + name='pid', + field=models.CharField(default='2qkqfvwv', help_text="Unique ID. Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/canvases/migrations/0012_auto_20211012_1612.py b/apps/iiif/canvases/migrations/0012_auto_20211012_1612.py new file mode 100644 index 000000000..c33467a6c --- /dev/null +++ b/apps/iiif/canvases/migrations/0012_auto_20211012_1612.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.24 on 2021-10-12 16:12 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('canvases', '0011_auto_20211007_2031'), + ] + + operations = [ + migrations.AlterField( + model_name='canvas', + name='pid', + field=models.CharField(default='2qmwbpfz', help_text="Unique ID. 
Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/canvases/migrations/0013_auto_20211018_1913.py b/apps/iiif/canvases/migrations/0013_auto_20211018_1913.py new file mode 100644 index 000000000..d342814f9 --- /dev/null +++ b/apps/iiif/canvases/migrations/0013_auto_20211018_1913.py @@ -0,0 +1,23 @@ +# Generated by Django 2.2.24 on 2021-10-18 19:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('canvases', '0012_auto_20211012_1612'), + ] + + operations = [ + migrations.AlterField( + model_name='canvas', + name='label', + field=models.CharField(max_length=1000), + ), + migrations.AlterField( + model_name='canvas', + name='pid', + field=models.CharField(default='2qnj9psx', help_text="Unique ID. Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/canvases/models.py b/apps/iiif/canvases/models.py index b5cd34e17..9e6f60500 100644 --- a/apps/iiif/canvases/models.py +++ b/apps/iiif/canvases/models.py @@ -1,48 +1,29 @@ """Django models representing IIIF canvases and IIIF image server info.""" from genericpath import exists -import uuid import os from boto3 import resource from bs4 import BeautifulSoup -import tempfile from urllib.parse import quote import config.settings.local as settings -from django.apps import apps from django.db import models -from django.dispatch import receiver from django.contrib.auth import get_user_model -from ..manifests.models import Manifest +from ..models import IiifBase +from ..manifests.models import Manifest, ImageServer from ..annotations.models import Annotation from . import services USER = get_user_model() -# TODO: This has moved to Manifest. Remove one everyone has migrated. -class IServer(models.Model): - """Django model for IIIF image server info. 
Each canvas has one IServer""" - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - IIIF_IMAGE_SERVER_BASE = models.CharField( - max_length=255, - default=settings.IIIF_IMAGE_SERVER_BASE - ) - - def __str__(self): - return "%s" % (self.IIIF_IMAGE_SERVER_BASE) - -class Canvas(models.Model): +class Canvas(IiifBase): """Django model for IIIF Canvas objects.""" - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - label = models.CharField(max_length=255) - pid = models.CharField(max_length=255) summary = models.TextField(blank=True, null=True) manifest = models.ForeignKey(Manifest, on_delete=models.CASCADE) + image_server = models.ForeignKey(ImageServer, on_delete=models.DO_NOTHING, null=True) position = models.IntegerField() height = models.IntegerField(default=0) width = models.IntegerField(default=0) ocr_offset = models.IntegerField(default=0) resource = models.TextField(blank=True, null=True) - # TODO: This has moved to Manifest. Remove one everyone has migrated. 
- # IIIF_IMAGE_SERVER_BASE = models.ForeignKey(IServer, on_delete=models.CASCADE, null=True) is_starting_page = models.BooleanField(default=False) preferred_ocr = ( ('word', 'word'), @@ -69,16 +50,25 @@ def identifier(self): @property def service_id(self): """Concatenated property to represent IIIF service id.""" + self.__check_image_server() + if self.image_server is None: + + return None + return '{h}/{c}'.format( - h=self.manifest.image_server.server_base, + h=self.image_server.server_base, c=quote(self.pid) ) @property def resource_id(self): - """Concatenated propert to represent IIIF resource id.""" + """Concatenated property to represent IIIF resource id.""" + self.__check_image_server() + if self.image_server is None: + return None + return '{h}/{r}'.format( - h=self.manifest.image_server.server_base, + h=self.image_server.server_base, r=self.resource or self.pid ) @@ -100,33 +90,30 @@ def image_info(self): def thumbnail(self): """Concatenated property to represent IIIF thumbnail link.""" return self.resource_id + '/full/200,/0/default.jpg' - # return '{h}/{c}/full/200,/0/default.jpg'.format( - # h=self.manifest.image_server.server_base, - # c=self.resource - # ) @property def social_media(self): """Concatenated property to represent IIIF image link for use in Open Graph metadata.""" + self.__check_image_server() + + if self.image_server is None: + return None + return '{h}/{c}/full/600,/0/default.jpg'.format( - h=self.manifest.image_server.server_base, + h=self.image_server.server_base, c=self.resource ) @property def twitter_media1(self): """Concatenated property for twitter cards and Open Graph metadata.""" - # TODO: shouldn't this use `self.manifest.image_server.server_base` + # TODO: shouldn't this use `self.image_server.server_base` return f'{self.resource_id}/full/600,/0/default.jpg' @property def twitter_media2(self): """Concatenated property for twitter cards and Open Graph metadata.""" return f'{self.resource_id}/full/600,/0/default.jpg' - # return 
'{h}/{c}/full/600,/0/default.jpg'.format( - # h=self.manifest.image_server.server_base, - # c=self.resource - # ) @property def uri(self): @@ -142,16 +129,8 @@ def thumbnail_crop_landscape(self): if self.height > self.width: # portrait return f'{self.resource_id}/full/,250/0/default.jpg' - # return '{h}/{c}/full/,250/0/default.jpg'.format( - # h=self.manifest.image_server.server_base, - # c=self.resource - # ) # landscape return f'{self.resource_id}/pct:25,0,50,100/,250/0/default.jpg' - # return '{h}/{c}/pct:25,0,50,100/,250/0/default.jpg'.format( - # h=self.manifest.image_server.server_base, - # c=self.resource - # ) @property def thumbnail_crop_tallwide(self): @@ -159,13 +138,8 @@ def thumbnail_crop_tallwide(self): if self.height > self.width: # portrait return f'{self.resource_id}/pct:5,5,90,90/,250/0/default.jpg' - # return '{h}/{c}/pct:5,5,90,90/,250/0/default.jpg'.format( - # h=self.manifest.image_server.server_base, - # c=self.resource - # ) # landscape return f'{self.resource_id}/pct:5,5,90,90/250,/0/default.jpg' - # return "%s/%s/pct:5,5,90,90/250,/0/default.jpg" % (self.manifest.image_server.server_base, self.resource) @property def thumbnail_crop_volume(self): @@ -173,16 +147,8 @@ def thumbnail_crop_volume(self): if self.height > self.width: # portrait return f'{self.resource_id}/pct:15,15,70,70/,600/0/default.jpg' - # return '{h}/{c}/pct:15,15,70,70/,600/0/default.jpg'.format( - # h=self.manifest.image_server.server_base, - # c=self.resource - # ) # landscape return f'{self.resource_id}/pct:25,15,50,85/,600/0/default.jpg' - # return '{h}/{c}/pct:25,15,50,85/,600/0/default.jpg'.format( - # h=self.manifest.image_server.server_base, - # c=self.resource - # ) @property def result(self): @@ -199,17 +165,29 @@ def result(self): def save(self, *args, **kwargs): # pylint: disable = signature-differs """ Override save function to set `resource_id` add OCR, - and set as manifest's `start_canvas` if manifest does not have one. 
+ set as manifest's `start_canvas` if manifest does not have one, + and set """ - if self.image_info: - self.width = self.image_info['width'] - self.height = self.image_info['height'] + self.__check_image_server() - if self.resource is None: - self.resource = self.pid + if self.manifest and self.position is None: + self.position = self.manifest.canvas_set.count() + 1 + + if self.image_info: + # TODO: Consider changing the default value for height and width + # so we don't have to check for 0 in addition to None. + if self.width == 0 or self.height == 0: + self.width = None + self.height = None + if self.width is None and self.height is None: + self.width = self.image_info['width'] + self.height = self.image_info['height'] super().save(*args, **kwargs) + if self.resource is None: + self.resource = self.pid + self.save() if self.manifest and self.manifest.start_canvas is None: self.manifest.save() @@ -218,17 +196,17 @@ def delete(self, *args, **kwargs): """ Override the delete function to clean up files. """ - if self.manifest.image_server.storage_service == 's3': + if self.image_server.storage_service == 's3': s3 = resource('s3') - s3.Object(self.manifest.image_server.storage_path, self.file_name).delete() + s3.Object(self.image_server.storage_path, self.file_name).delete() if self.ocr_file_path: ocr_file = self.ocr_file_path.split("/")[-1] key = f'{self.manifest.pid}/_*ocr*_/{ocr_file}' - s3.Object(self.manifest.image_server.storage_path, key).delete() + s3.Object(self.image_server.storage_path, key).delete() else: try: - os.remove(os.path.join(self.manifest.image_server.storage_path, self.file_name)) + os.remove(os.path.join(self.image_server.storage_path, self.file_name)) except (FileNotFoundError, TypeError): pass try: @@ -239,14 +217,29 @@ def delete(self, *args, **kwargs): super().delete(*args, **kwargs) + # TODO: The way we construct PIDs for Canvas objects might need some + # rethinking. 
+ def clean_pid(self): + """ Override the `__clean_pid` method that replaces underscores (_). + Canvas PIDs are combonation of the Manifest PID and the Canvase's + file name, seperated by an underscore. This is how Cantaloupe finds + the image file. """ + pass + def __str__(self): return str(self.pid) + def __check_image_server(self): + try: + if self.image_server is None and self.manifest.image_server is not None: + self.image_server = self.manifest.image_server + except Manifest.DoesNotExist: + return None + class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring ordering = ['position'] class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring - # Translators: admin:skip verbose_name = 'canvas' # Translators: admin:skip verbose_name_plural = 'canvases' diff --git a/apps/iiif/canvases/services.py b/apps/iiif/canvases/services.py index 5e6042d33..157046be5 100644 --- a/apps/iiif/canvases/services.py +++ b/apps/iiif/canvases/services.py @@ -1,10 +1,15 @@ # pylint: disable=invalid-name """Module to provide some common functions for Canvas objects.""" import csv -from os import environ, path -from xml.etree import ElementTree -import httpretty +from io import BytesIO +import json +from os import environ, path, unlink +import re +import tempfile +from hocr_spec import HocrValidator +from lxml import etree from django.conf import settings +import httpretty from apps.iiif.annotations.models import Annotation from apps.utils.fetch import fetch_url @@ -15,8 +20,12 @@ class IncludeQuotesDialect(csv.Dialect): # pylint: disable=too-few-public-method delimiter = '\t' quoting = csv.QUOTE_NONE # perform no special processing of quote characters -@httpretty.activate -def get_fake_canvas_info(canvas): +class HocrValidationError(Exception): + """Exception for hOCR validation errors.""" + pass # pylint: disable=unnecessary-pass + +# @httpretty.activate +def activate_fake_canvas_info(canvas): """Function to mock a response for testing. 
:param canvas: Canvas object @@ -27,20 +36,6 @@ def get_fake_canvas_info(canvas): with open('apps/iiif/canvases/fixtures/info.json', 'r') as file: iiif_image_info = file.read().replace('\n', '') httpretty.register_uri(httpretty.GET, canvas.service_id, body=iiif_image_info) - response = fetch_url( - canvas.resource_id, - timeout=settings.HTTP_REQUEST_TIMEOUT, - data_format='json' - ) - return response - -def get_fake_ocr(): - """Generate fake OCR data for testing. - - :return: OCR data - :rtype: dict - """ - return def get_ocr(canvas): """Function to determine method for fetching OCR for a canvas. @@ -51,8 +46,8 @@ def get_ocr(canvas): :rtype: list """ if canvas.default_ocr == "line": - result = fetch_alto_ocr(canvas) - return add_alto_ocr(result) + result = fetch_tei_ocr(canvas) + return parse_tei_ocr(result) result = fetch_positional_ocr(canvas) return add_positional_ocr(canvas, result) @@ -67,8 +62,9 @@ def get_canvas_info(canvas): """ # If testing, just fake it. if environ['DJANGO_ENV'] == 'test': - response = get_fake_canvas_info(canvas) - return response + httpretty.enable(allow_net_connect=False) + activate_fake_canvas_info(canvas) + # return response response = fetch_url( canvas.resource_id, @@ -77,6 +73,22 @@ def get_canvas_info(canvas): ) return response +def fetch_tei_ocr(canvas): + """Function to fetch TEI OCR data for a given canvas. + + :param canvas: Canvas object + :type canvas: apps.iiif.canvases.models.Canvas + :return: Positional OCR data + :rtype: requests.models.Response + """ + if 'archivelab' in canvas.manifest.image_server.server_base: + return None + url = "{p}{c}/datastreams/tei/content".format( + p=settings.DATASTREAM_PREFIX, + c=canvas.pid.replace('fedora:', '') + ) + + return fetch_url(url, data_format='text/plain') # TODO: Maybe add "OCR Source" and "OCR Type" attributes to the manifest model. That might # help make this more universal. 
@@ -127,32 +139,69 @@ def fetch_positional_ocr(canvas): data_format='text' ) - if canvas.ocr_file_path.startswith('https') and 's3' in canvas.ocr_file_path: - return fetch_url(canvas.ocr_file_path, data_format='text') - # Not sure we will need this. Leaving just as a reminder. - # else: - # file = open(canvas.ocr_file_path) - # data = file.read() - # file.close() - # return data - url = "{p}{c}{s}".format( p=settings.DATASTREAM_PREFIX, c=canvas.pid.replace('fedora:', ''), s=settings.DATASTREAM_SUFFIX ) - if environ['DJANGO_ENV'] == 'test': - fake_alto = open(path.join(settings.APPS_DIR, 'iiif/canvases/fixtures/ocr_words.json')) - words = fake_alto.read() + if ( + environ['DJANGO_ENV'] == 'test' + and 'images.readux.ecds.emory' not in canvas.manifest.image_server.server_base + and canvas.ocr_file_path is None + ): + fake_json = open(path.join(settings.APPS_DIR, 'iiif/canvases/fixtures/ocr_words.json')) + words = fake_json.read() httpretty.enable() httpretty.register_uri(httpretty.GET, url, body=words) + if canvas.ocr_file_path is not None: + if canvas.image_server.storage_service == 's3': + return canvas.image_server.bucket.Object(canvas.ocr_file_path).get()['Body'].read() + # if canvas.image_server.storage_service == 'sftp': + # Do something different return fetch_url(url, data_format='text/plain') +def is_json(to_test): + """Function to test if data is shaped like JSON. + + :param to_test: String or bytes + :type to_test: requests.models.Response + :return: True if shaped like JSON, False if not. + :rtype: bool + """ + if isinstance(to_test, bytes): + as_str = to_test.decode('utf-8') + else: + as_str = str(to_test) + try: + json.loads(as_str) + except ValueError: + return False + return True + +def is_tsv(to_test): + """Function to test if data is shaped like a TSV. + + :param to_test: String or bytes + :type to_test: requests.models.Response + :return: True if shaped like a TSV, False if not. 
+ :rtype: bool + """ + if isinstance(to_test, bytes): + as_str = to_test.decode('utf-8') + as_list = as_str.splitlines() + else: + as_str = str(to_test) + as_list = as_str.split('\n') + if len(as_list) > 1: + if len(as_str.split('\t')) > 1: + return True + return False + def add_positional_ocr(canvas, result): - """Function to parse fetched OCR data for a canvas. + """Function to disambiguate and parse fetched OCR data for a canvas. :param canvas: Canvas object :type canvas: apps.iiif.canvases.models.Canvas @@ -161,100 +210,170 @@ def add_positional_ocr(canvas, result): :return: List of dicts of parsed OCR data. :rtype: list """ + if result is None: + return None + if canvas.ocr_file_path is None: + if isinstance(result, dict) or is_json(result): + ocr = parse_dict_ocr(result) + elif is_tsv(result) and isinstance(result, bytes): + if result.decode('utf-8') == result.decode('UTF-8-sig'): + ocr = parse_tsv_ocr(result) + else: + ocr = parse_fedora_ocr(result) + elif is_tsv(result): + ocr = parse_tsv_ocr(result) + elif canvas.ocr_file_path.endswith('.json'): + ocr = parse_dict_ocr(result) + elif canvas.ocr_file_path.endswith('.tsv') or canvas.ocr_file_path.endswith('.tab'): + ocr = parse_tsv_ocr(result) + elif canvas.ocr_file_path.endswith('.xml'): + ocr = parse_xml_ocr(result) + elif canvas.ocr_file_path.endswith('.hocr'): + ocr = parse_hocr_ocr(result) + if ocr: + return ocr + return None + +def parse_alto_ocr(result): + """Function to parse fetched ALTO OCR data for a given canvas. 
+ + :param result: Fetched ALTO OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ if result is None: return None ocr = [] - if 'archivelab' in canvas.manifest.image_server.server_base: - if result is not None and 'ocr' in result and result['ocr'] is not None: - for index, word in enumerate(result['ocr']): # pylint: disable=unused-variable - if len(word) > 0: - for w in word: - ocr.append({ - 'content': w[0], - 'w': (w[1][2] - w[1][0]), - 'h': (w[1][1] - w[1][3]), - 'x': w[1][0], - 'y': w[1][3] - }) - elif 'images.readux.ecds.emory' in canvas.manifest.image_server.server_base: - - lines = result.split('\n') - # if (lines[0].startswith(content)): - # lines.pop(0) - # Sometimes the TSV has some extra tabs at the beginign and the end. These have - # to be cleaned out. It gets complicatied. - for index, line in enumerate(lines): - print('----') - print(line) - print('----') - # First we remove any leading column that is empty. - line = line.strip() - lines[index] = line - # It might be true that the "content" column is empty. However, we just - # removed it. So we have to add it back. 
- if lines[index].count('\t') == 3: - lines[index] = ' \t' + lines[index] - - reader = csv.DictReader(lines, dialect=IncludeQuotesDialect) - - for row in reader: - content = row['content'] - w = int(row['w']) - h = int(row['h']) - x = int(row['x']) - y = int(row['y']) - ocr.append({ - 'content': content, - 'w': w, - 'h': h, - 'x': x, - 'y': y, - }) + unvalidated_root = etree.fromstring(result) + if 'ns-v2' in unvalidated_root.tag: + schema_file = 'xml_schema/alto-2-1.xsd' + elif 'ns-v3' in unvalidated_root.tag: + schema_file = 'xml_schema/alto-3-1.xsd' + elif 'ns-v4' in unvalidated_root.tag: + schema_file = 'xml_schema/alto-4-2.xsd' else: - if result is not None: - # What comes back from fedora is 8-bit bytes - for index, word in enumerate(result.decode('UTF-8-sig').strip().split('\r\n')): - if len(word.split('\t')) == 5: - ocr.append({ - 'content': word.split('\t')[4], - 'w': int(word.split('\t')[2]), - 'h': int(word.split('\t')[3]), - 'x': int(word.split('\t')[0]), - 'y': int(word.split('\t')[1]) - }) + schema_file = 'xml_schema/alto-1-4.xsd' + parser = etree.XMLParser(schema = etree.XMLSchema(file=schema_file)) + # The following will raise etree.XMLSyntaxError if invalid + root = etree.fromstring(result, parser=parser) + strings = root.findall('.//String') + if not strings: + strings = root.findall('.//{*}String') + for string in strings: + attrib = {k.lower(): v for k, v in string.attrib.items()} + ocr.append({ + 'content': attrib['content'], + 'h': int(attrib['height']), + 'w': int(attrib['width']), + 'x': int(attrib['hpos']), + 'y': int(attrib['vpos']) + }) if ocr: return ocr return None -def fetch_alto_ocr(canvas): - """Function to fetch Alto OCR data for a given canvas. +def parse_hocr_ocr(result): + """Function to parse fetched hOCR data for a given canvas. 
- :param canvas: Canvas object - :type canvas: apps.iiif.canvases.models.Canvas - :return: Positional OCR data - :rtype: requests.models.Response + :param result: Fetched hOCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list """ - if 'archivelab' in canvas.manifest.image_server.server_base: - return None - url = "{p}{c}/datastreams/tei/content".format( - p=settings.DATASTREAM_PREFIX, - c=canvas.pid.replace('fedora:', '') + if isinstance(result, bytes): + as_string = result.decode('utf-8') + else: + as_string = str(result) + # Regex to ignore x_size, x_ascenders, x_descenders. this is a known issue with + # tesseract producded hOCR: https://github.com/tesseract-ocr/tesseract/issues/3303 + result_without_invalid = re.sub( + r'([ ;]+)(x_size [0-9\.\-;]+)|( x_descenders [0-9\.\-;]+)|( x_ascenders [0-9\.\-;]+)', + repl='', string=as_string ) + file_like_hocr = BytesIO(result_without_invalid.encode('utf-8')) + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: + file_like_hocr.seek(0) + tmp_file.write(file_like_hocr.read()) + tmp_file.flush() + temp_file_name = tmp_file.name + validator = HocrValidator(profile='relaxed') + report = validator.validate(source=temp_file_name) + is_valid = report.format('bool') + if not is_valid: + report_text = report.format('text') + unlink(temp_file_name) + raise HocrValidationError(str(report_text)) + unlink(temp_file_name) + ocr = [] + file_like_hocr.seek(0) + tree = etree.parse(file_like_hocr) + words = tree.findall(".//span[@class]") + if not words: + words = tree.findall(".//{*}span[@class]") + for word in words: + if word.attrib['class'] == 'ocrx_word': + all_attribs = word.attrib['title'].split(';') + bbox = next((attrib for attrib in all_attribs if 'bbox' in attrib), '') + # Splitting 'bbox x0 y0 x1 y1' + bbox_attribs = bbox.split(' ') + if len(bbox_attribs) == 5: + ocr.append({ + 'content': word.text, + 'h': int(bbox_attribs[4]) - int(bbox_attribs[2]), + 'w': int(bbox_attribs[3]) - 
int(bbox_attribs[1]), + 'x': int(bbox_attribs[1]), + 'y': int(bbox_attribs[2]) + }) + if ocr: + return ocr + return None - return fetch_url(url, data_format='text/plain') +def parse_dict_ocr(result): + """Function to parse dict or JSON OCR data. -def add_alto_ocr(result): - """Function to add fetched Alto OCR data for a given canvas. + :param result: Fetched dict OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + ocr = [] + if isinstance(result, bytes): + as_string = result.decode('utf-8') + as_dict = json.loads(as_string) + elif isinstance(result, str): + as_dict = json.loads(result) + else: + as_dict = result + if 'ocr' in as_dict and as_dict['ocr'] is not None: + for index, word in enumerate(as_dict['ocr']): # pylint: disable=unused-variable + if len(word) > 0: + for w in word: + ocr.append({ + 'content': w[0], + 'w': (w[1][2] - w[1][0]), + 'h': (w[1][1] - w[1][3]), + 'x': w[1][0], + 'y': w[1][3], + }) + if ocr: + return ocr + return None - :param result: Fetched OCR data +def parse_tei_ocr(result): + """Function to parse fetched TEI OCR data for a given canvas. 
+ + :param result: Fetched TEI OCR data :type result: requests.models.Response - :return: Parsed Alto OCR data + :return: Parsed OCR data :rtype: list """ if result is None: return None ocr = [] - surface = ElementTree.fromstring(result)[-1][0] + parser = etree.XMLParser(schema = etree.XMLSchema(file='xml_schema/tei_all.xsd')) + # The following will raise etree.XMLSyntaxError if invalid + surface = etree.fromstring(result, parser=parser)[-1][0] for zones in surface: if 'zone' in zones.tag: for line in zones: @@ -262,15 +381,102 @@ def add_alto_ocr(result): # continue ocr.append({ 'content': line[-1].text, - 'h': int(line.attrib['lry']) - int(line.attrib['uly']), - 'w': int(line.attrib['lrx']) - int(line.attrib['ulx']), - 'x': int(line.attrib['ulx']), - 'y': int(line.attrib['uly']) + 'h': int(line.get('lry')) - int(line.get('uly')), + 'w': int(line.get('lrx')) - int(line.get('ulx')), + 'x': int(line.get('ulx')), + 'y': int(line.get('uly')) }) if ocr: return ocr return None +def parse_tsv_ocr(result): + """Function to parse fetched TSV OCR data for a given canvas. + + :param result: Fetched TSV OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + ocr = [] + if isinstance(result, bytes): + lines = result.decode('utf-8').splitlines() + else: + lines = str(result).split('\n') + + # Sometimes the TSV has some extra tabs at the beginning and the end. These have + # to be cleaned out. It gets complicatied. + for index, line in enumerate(lines): + # First we remove any leading column that is empty. + line = line.strip() + lines[index] = line + # It might be true that the "content" column is empty. However, we just + # removed it. So we have to add it back. 
+ if lines[index].count('\t') == 3: + lines[index] = ' \t' + lines[index] + + reader = csv.DictReader(lines, dialect=IncludeQuotesDialect) + + for row in reader: + content = row['content'] + w = int(row['w']) + h = int(row['h']) + x = int(row['x']) + y = int(row['y']) + ocr.append({ + 'content': content, + 'w': w, + 'h': h, + 'x': x, + 'y': y, + }) + if ocr: + return ocr + return None + +def parse_fedora_ocr(result): + """Function to parse fetched Fedora OCR data for a given canvas. + + :param result: Fetched Fedora OCR data (bytes) + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + ocr = [] + if isinstance(result, bytes): + # What comes back from fedora is 8-bit bytes + for _, word in enumerate(result.decode('UTF-8-sig').strip().split('\r\n')): + if len(word.split('\t')) == 5: + ocr.append({ + 'content': word.split('\t')[4], + 'w': int(word.split('\t')[2]), + 'h': int(word.split('\t')[3]), + 'x': int(word.split('\t')[0]), + 'y': int(word.split('\t')[1]) + }) + return ocr + +def parse_xml_ocr(result): + """Function to determine the flavor of XML OCR and then parse accordingly. 
+ + :param result: Fetched XML OCR data + :type result: requests.models.Response + :return: Parsed OCR data + :rtype: list + """ + root = etree.fromstring(result) + if ( + re.match(r'{[0-9A-Za-z.:/#-]+}alto|alto', root.tag) + or 'www.loc.gov/standards/alto' in root.find('.//*').tag + ): + return parse_alto_ocr(result) + if root.find('.//teiHeader') is not None or root.find('.//{*}teiHeader') is not None: + return parse_tei_ocr(result) + if root.find('.//div') is not None or root.find('.//{*}div') is not None: + # Fallback to hOCR if it looks like XHTML + return parse_hocr_ocr(result) + return None + def add_ocr_annotations(canvas, ocr): word_order = 1 for word in ocr: diff --git a/apps/iiif/canvases/tests/factories.py b/apps/iiif/canvases/tests/factories.py index d0eb28f3d..df384b574 100644 --- a/apps/iiif/canvases/tests/factories.py +++ b/apps/iiif/canvases/tests/factories.py @@ -4,10 +4,10 @@ import random from factory.django import DjangoModelFactory from factory import Faker +from apps.utils.noid import encode_noid from ..models import Canvas class CanvasFactory(DjangoModelFactory): - pid = str(random.randrange(2000, 5000) + random.randrange(200, 500)) label = Faker("name") height = random.randrange(200, 500) width = random.randrange(200, 500) @@ -15,6 +15,11 @@ class CanvasFactory(DjangoModelFactory): manifest = None ocr_file_path = None default_ocr = 'word' + pid = encode_noid() class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring model = Canvas + +class CanvasNoDimensionsFactory(CanvasFactory): + height = 0 + width = 0 diff --git a/apps/iiif/canvases/tests/test_model.py b/apps/iiif/canvases/tests/test_model.py index 6f2b5d6af..8dfed0c5d 100644 --- a/apps/iiif/canvases/tests/test_model.py +++ b/apps/iiif/canvases/tests/test_model.py @@ -7,7 +7,7 @@ from moto import mock_s3 from django.conf import settings from apps.iiif.manifests.tests.factories import ManifestFactory, ImageServerFactory -from .factories import CanvasFactory +from 
.factories import CanvasFactory, CanvasNoDimensionsFactory class TestCanvasModels(TestCase): fixtures = ['kollections.json', 'manifests.json', 'canvases.json', 'annotations.json'] @@ -82,3 +82,41 @@ def test_delete_canvas_from_s3(self): self.assertEqual(get_image_error, 'NoSuchKey') self.assertEqual(get_ocr_error, 'NoSuchKey') + def test_no_manifest(self): + canvas = Canvas() + assert canvas.service_id is None + assert canvas.resource_id is None + assert canvas.social_media is None + + def test_string_representation(self): + canvas = CanvasFactory.create(manifest=ManifestFactory.create()) + assert str(canvas) == canvas.pid + + def test_get_image_info(self): + image_server = ImageServerFactory.create(server_base='http://fake.info') + manifest = ManifestFactory.create(image_server=image_server) + canvas = CanvasFactory.create(manifest=manifest) + assert canvas.image_info['height'] == 3000 + assert canvas.image_info['width'] == 3000 + + def test_setting_height_width_from_iiif(self): + image_server = ImageServerFactory.create(server_base='http://fake.info') + manifest = ManifestFactory.create(image_server=image_server) + canvas = CanvasFactory.build() + canvas.height = None + canvas.width = None + assert canvas.height != 3000 + assert canvas.width != 3000 + canvas.manifest = manifest + canvas.save() + canvas.refresh_from_db() + assert canvas.height == 3000 + assert canvas.width == 3000 + + def test_setting_height_and_width(self): + canvas = CanvasNoDimensionsFactory.build(manifest=ManifestFactory.create()) + assert canvas.height == 0 + assert canvas.width == 0 + canvas.save() + assert canvas.height == 3000 + assert canvas.width == 3000 diff --git a/apps/iiif/canvases/tests/test_services.py b/apps/iiif/canvases/tests/test_services.py index cdf672246..3f081a98b 100644 --- a/apps/iiif/canvases/tests/test_services.py +++ b/apps/iiif/canvases/tests/test_services.py @@ -1,15 +1,16 @@ """ Test cases for :class:`apps.iiif.canvases` """ -from apps.iiif import manifests import 
json from os.path import join -from urllib.parse import quote -import httpretty +import boto3 +from moto import mock_s3 from django.test import TestCase, Client from django.urls import reverse +from lxml.etree import XMLSyntaxError import config.settings.local as settings from apps.iiif.manifests.tests.factories import ManifestFactory, ImageServerFactory +from apps.utils.noid import encode_noid from ..models import Canvas from .. import services from ..apps import CanvasesConfig @@ -28,6 +29,10 @@ def setUp(self): self.assumed_volume_pid = 'readux:st7r6' self.assumed_iiif_base = 'https://loris.library.emory.edu' + def set_up_mock_s3(self, manifest): + conn = boto3.resource('s3', region_name='us-east-1') + conn.create_bucket(Bucket=manifest.image_server.storage_path) + def test_app_config(self): assert CanvasesConfig.verbose_name == 'Canvases' assert CanvasesConfig.name == 'apps.iiif.canvases' @@ -104,23 +109,20 @@ def test_fedora_ocr_creation(self): assert isinstance(word['y'], int) assert isinstance(word['content'], str) - def test_ocr_from_alto(self): - alto = open('apps/iiif/canvases/fixtures/alto.xml', 'r').read() - ocr = services.add_alto_ocr(alto) + def test_ocr_from_tei(self): + tei = open('apps/iiif/canvases/fixtures/tei.xml', 'r').read() + ocr = services.parse_tei_ocr(tei) assert ocr[1]['content'] == 'AEN DEN LESIIU' assert ocr[1]['h'] == 28 assert ocr[1]['w'] == 461 assert ocr[1]['x'] == 814 assert ocr[1]['y'] == 185 - def test_line_by_line_from_alto(self): - # self.canvas.default_ocr = 'line' - # self.canvas.annotation_set.all().delete() - # self.canvas.save() + def test_line_by_line_from_tei(self): canvas = CanvasFactory.create(default_ocr='line', manifest=ManifestFactory.create()) - ocr_file = open(join(settings.APPS_DIR, 'iiif/canvases/fixtures/alto.xml'), 'r').read() - alto = services.add_alto_ocr(ocr_file) - services.add_ocr_annotations(canvas, alto) + ocr_file = open(join(settings.APPS_DIR, 'iiif/canvases/fixtures/tei.xml'), 'r').read() + tei = 
services.parse_tei_ocr(ocr_file) + services.add_ocr_annotations(canvas, tei) updated_canvas = Canvas.objects.get(pk=canvas.pk) ocr = updated_canvas.annotation_set.first() assert 'mm' in ocr.content @@ -133,9 +135,7 @@ def test_line_by_line_from_alto(self): assert anno.order == num def test_ocr_from_tsv(self): - iiif_server = ImageServerFactory(server_base='https://images.readux.ecds.emory.fake/') - self.canvas.manifest.image_server = iiif_server - self.canvas.manifest.save() + self.manifest.image_server = ImageServerFactory(server_base='https://images.readux.ecds.emory.fake/') canvas = CanvasFactory(manifest=self.canvas.manifest, pid='boo') canvas.save() # TODO: TOO MANY STEPS TO MAKE OCR???? @@ -156,14 +156,13 @@ def test_ocr_from_tsv(self): assert '> ' in ocr2.content assert canvas.annotation_set.all().count() == 5 - def test_no_alto_from_empty_result(self): - ocr = services.add_alto_ocr(None) + def test_no_tei_from_empty_result(self): + ocr = services.parse_tei_ocr(None) assert ocr is None - def test_from_bad_alto(self): - alto = open('apps/iiif/canvases/fixtures/bad_alto.xml', 'r').read() - ocr = services.add_alto_ocr(alto) - assert ocr is None + def test_from_bad_tei(self): + tei = open('apps/iiif/canvases/fixtures/bad_tei.xml', 'r').read() + self.assertRaises(XMLSyntaxError, services.parse_tei_ocr, tei) def test_canvas_detail(self): kwargs = {'manifest': self.manifest.pid, 'pid': self.canvas.pid} @@ -199,9 +198,9 @@ def test_wide_image_crops(self): def test_result_property(self): assert self.canvas.result == "a retto , dio Quef\u00eca de'" - def test_no_alto_for_internet_archive(self): + def test_no_tei_for_internet_archive(self): self.canvas.manifest.image_server.server_base = 'https://iiif.archivelab.org/iiif/' - assert services.fetch_alto_ocr(self.canvas) is None + assert services.fetch_tei_ocr(self.canvas) is None def test_fetch_positional_ocr(self): self.canvas.manifest.image_server.server_base = 'https://iiif.archivelab.org/iiif/' @@ -219,21 +218,201 @@ 
def test_fetch_positional_ocr_with_offset(self): # self.canvas.manifest.image_server.server_base = 'oxford' # assert services.fetch_positional_ocr(self.canvas) is None - def test_get_image_info(self): - image_server = ImageServerFactory.create(server_base='http://fake.info') - manifest = ManifestFactory.create(image_server=image_server) - canvas = CanvasFactory.create(manifest=manifest) - assert canvas.image_info['height'] == 3000 - assert canvas.image_info['width'] == 3000 - - def test_setting_height_width_from_iiif(self): - image_server = ImageServerFactory.create(server_base='http://fake.info') - manifest = ManifestFactory.create(image_server=image_server) - canvas = CanvasFactory.build() - assert canvas.height != 3000 - assert canvas.width != 3000 - canvas.manifest = manifest - canvas.save() - canvas.refresh_from_db() - assert canvas.height == 3000 - assert canvas.width == 3000 + @mock_s3 + def test_ocr_in_s3(self): + bucket_name = encode_noid() + manifest = ManifestFactory.create( + image_server = ImageServerFactory.create(storage_service = 's3', storage_path=bucket_name, server_base='images.readux.ecds.emory') + ) + self.set_up_mock_s3(manifest) + tsv_file_path = 'apps/iiif/canvases/fixtures/00000002.tsv' + canvas = manifest.canvas_set.first() + canvas.ocr_file_path = f'{manifest.pid}/_*ocr*_/00000002.tsv' + manifest.image_server.bucket.upload_file(tsv_file_path, f'{manifest.pid}/_*ocr*_/00000002.tsv') + fetched_ocr = services.fetch_positional_ocr(canvas) + assert open(tsv_file_path, 'rb').read() == fetched_ocr + + @mock_s3 + def test_fetched_ocr_result_is_string(self): + """ Test when fetched OCR is a string. 
""" + bucket_name = encode_noid() + manifest = ManifestFactory.create( + image_server = ImageServerFactory.create(storage_service = 's3', storage_path=bucket_name, server_base='images.readux.ecds.emory') + ) + self.set_up_mock_s3(manifest) + tsv_file_path = 'apps/iiif/canvases/fixtures/00000002.tsv' + canvas = manifest.canvas_set.first() + canvas.ocr_file_path = f'{manifest.pid}/_*ocr*_/00000002.tsv' + manifest.image_server.bucket.upload_file(tsv_file_path, f'{manifest.pid}/_*ocr*_/00000002.tsv') + ocr_result = open(tsv_file_path, 'r').read() + assert isinstance(ocr_result, str) + ocr = services.add_positional_ocr(canvas, ocr_result) + assert len(ocr) == 10 + assert ocr[0]['content'] == 'Manuscript' + + @mock_s3 + def test_fetched_ocr_result_is_bytes(self): + """ Test when fetched OCR is a bytes. """ + bucket_name = encode_noid() + manifest = ManifestFactory.create( + image_server = ImageServerFactory.create(storage_service = 's3', storage_path=bucket_name, server_base='images.readux.ecds.emory') + ) + self.set_up_mock_s3(manifest) + tsv_file_path = 'apps/iiif/canvases/fixtures/00000002.tsv' + canvas = manifest.canvas_set.first() + canvas.ocr_file_path = f'{manifest.pid}/_*ocr*_/00000002.tsv' + manifest.image_server.bucket.upload_file(tsv_file_path, f'{manifest.pid}/_*ocr*_/00000002.tsv') + ocr_result = open(tsv_file_path, 'rb').read() + assert isinstance(ocr_result, bytes) + ocr = services.add_positional_ocr(canvas, ocr_result) + assert len(ocr) == 10 + assert ocr[0]['content'] == 'Manuscript' + + def test_from_alto_ocr(self): + """ Test parsing ALTO OCR """ + alto = open('apps/iiif/canvases/fixtures/alto.xml', 'rb').read() + ocr = services.parse_alto_ocr(alto) + assert ocr[0]['content'] == 'MAGNA' + assert ocr[0]['h'] == 164 + assert ocr[0]['w'] == 758 + assert ocr[0]['x'] == 1894 + assert ocr[0]['y'] == 1787 + + def test_from_hocr(self): + """ Test parsing hOCR """ + hocr = open('apps/iiif/canvases/fixtures/hocr.hocr', 'rb').read() + ocr = 
services.parse_hocr_ocr(hocr) + assert ocr[0]['content'] == 'MAGNA' + assert ocr[0]['h'] == 164 + assert ocr[0]['w'] == 758 + assert ocr[0]['x'] == 1894 + assert ocr[0]['y'] == 1787 + + def test_from_bad_hocr(self): + """ Test parsing bad hOCR """ + bad_hocr = open('apps/iiif/canvases/fixtures/bad_hocr.hocr', 'rb').read() + self.assertRaises(services.HocrValidationError, services.parse_hocr_ocr, bad_hocr) + + def test_identifying_alto_xml(self): + """ Test identifying XML file as ALTO OCR """ + alto = open('apps/iiif/canvases/fixtures/alto.xml', 'rb').read() + ocr = services.parse_xml_ocr(alto) + assert ocr[0]['content'] == 'MAGNA' + assert ocr[0]['h'] == 164 + assert ocr[0]['w'] == 758 + assert ocr[0]['x'] == 1894 + assert ocr[0]['y'] == 1787 + + def test_identifying_hocr_xml(self): + """ Test identifying XML file as hOCR """ + hocr = open('apps/iiif/canvases/fixtures/hocr.hocr', 'rb').read() + ocr = services.parse_xml_ocr(hocr) + assert ocr[0]['content'] == 'MAGNA' + assert ocr[0]['h'] == 164 + assert ocr[0]['w'] == 758 + assert ocr[0]['x'] == 1894 + assert ocr[0]['y'] == 1787 + + def test_identifying_tei_xml(self): + """ Test identifying XML file as hOCR """ + tei = open('apps/iiif/canvases/fixtures/tei.xml', 'r').read() + ocr = services.parse_xml_ocr(tei) + assert ocr[1]['content'] == 'AEN DEN LESIIU' + assert ocr[1]['h'] == 28 + assert ocr[1]['w'] == 461 + assert ocr[1]['x'] == 814 + assert ocr[1]['y'] == 185 + + def test_identification_failure(self): + """ Test identifying XML on non-XML fails """ + tsv = open('apps/iiif/canvases/fixtures/sample.tsv', 'r').read() + self.assertRaises(XMLSyntaxError, services.parse_xml_ocr, tsv) + + def test_unidentifiable_xml(self): + """ Test identifying XML that is not TEI, ALTO, or hOCR """ + hops = open('apps/iiif/canvases/fixtures/hops.xml', 'rb').read() + ocr = services.parse_xml_ocr(hops) + assert ocr is None + + @mock_s3 + def test_add_alto_ocr_by_filename(self): + """ Test get_ocr when OCR is ALTO file (by filename). 
""" + bucket_name = encode_noid() + manifest = ManifestFactory.create( + image_server = ImageServerFactory.create( + storage_service = 's3', + storage_path=bucket_name, + server_base='images.readux.ecds.emory' + ) + ) + self.set_up_mock_s3(manifest) + tsv_file_path = 'apps/iiif/canvases/fixtures/alto.xml' + canvas = manifest.canvas_set.first() + canvas.ocr_file_path = f'{manifest.pid}/_*ocr*_/alto.xml' + manifest.image_server.bucket.upload_file(tsv_file_path, f'{manifest.pid}/_*ocr*_/alto.xml') + ocr = services.get_ocr(canvas) + assert ocr[0]['content'] == 'MAGNA' + assert ocr[0]['h'] == 164 + assert ocr[0]['w'] == 758 + assert ocr[0]['x'] == 1894 + assert ocr[0]['y'] == 1787 + + @mock_s3 + def test_add_hocr_by_filename(self): + """ Test get_ocr when OCR is hOCR file (by filename). """ + bucket_name = encode_noid() + manifest = ManifestFactory.create( + image_server = ImageServerFactory.create( + storage_service = 's3', + storage_path=bucket_name, + server_base='images.readux.ecds.emory' + ) + ) + self.set_up_mock_s3(manifest) + tsv_file_path = 'apps/iiif/canvases/fixtures/hocr.hocr' + canvas = manifest.canvas_set.first() + canvas.ocr_file_path = f'{manifest.pid}/_*ocr*_/hocr.hocr' + manifest.image_server.bucket.upload_file(tsv_file_path, f'{manifest.pid}/_*ocr*_/hocr.hocr') + ocr = services.get_ocr(canvas) + assert ocr[0]['content'] == 'MAGNA' + assert ocr[0]['h'] == 164 + assert ocr[0]['w'] == 758 + assert ocr[0]['x'] == 1894 + assert ocr[0]['y'] == 1787 + + @mock_s3 + def test_add_json_ocr_by_filename(self): + """ Test get_ocr when OCR is JSON file (by filename). 
""" + bucket_name = encode_noid() + manifest = ManifestFactory.create( + image_server = ImageServerFactory.create( + storage_service = 's3', + storage_path=bucket_name, + server_base='images.readux.ecds.emory' + ) + ) + self.set_up_mock_s3(manifest) + tsv_file_path = 'apps/iiif/canvases/fixtures/ocr_words.json' + canvas = manifest.canvas_set.first() + canvas.ocr_file_path = f'{manifest.pid}/_*ocr*_/ocr_words.json' + manifest.image_server.bucket.upload_file( + tsv_file_path, f'{manifest.pid}/_*ocr*_/ocr_words.json' + ) + ocr = services.get_ocr(canvas) + assert ocr[0]['content'] == 'Dope' + + def test_none_ocr(self): + """ Test add_positional_ocr when fetched OCR is None. """ + ocr = services.add_positional_ocr(self.canvas, None) + assert ocr is None + + def test_none_alto_ocr(self): + """ Test add_positional_ocr when fetched OCR is None. """ + ocr = services.parse_alto_ocr(None) + assert ocr is None + + def test_not_tsv(self): + """ Test is_tsv with something that is not TSV """ + not_tsv = 'test string' + is_tsv = services.is_tsv(not_tsv) + self.assertFalse(is_tsv) diff --git a/apps/iiif/canvases/urls.py b/apps/iiif/canvases/urls.py index 71c046205..09158b195 100644 --- a/apps/iiif/canvases/urls.py +++ b/apps/iiif/canvases/urls.py @@ -2,9 +2,10 @@ URL patterns for :class:`apps.iiif.canvases` """ from django.urls import path -from .views import IIIFV2Detail, IIIFV2List +from .views import IIIFV2Detail, IIIFV2List, CanvasResource urlpatterns = [ path('iiif//canvas', IIIFV2List.as_view(), name='RenderCanvasList'), path('iiif//canvas/', IIIFV2Detail.as_view(), name='RenderCanvasDetail'), + path('volume//page/iiif/resource/', CanvasResource.as_view(), name='ResourceResult') ] diff --git a/apps/iiif/canvases/views.py b/apps/iiif/canvases/views.py index 4454f26a8..9a000be3d 100644 --- a/apps/iiif/canvases/views.py +++ b/apps/iiif/canvases/views.py @@ -74,3 +74,14 @@ def get(self, request, *args, **kwargs): # pylint: disable = unused-argument ), safe=False ) + +class 
CanvasResource(View): + def get(self, *args, **kwargs): + canvas = Canvas.objects.get(pid=self.kwargs['pid']) + + return JsonResponse( + { + 'resource': canvas.resource_id + "/full/full/0/default.jpg", + 'text': canvas.result + } + ) diff --git a/apps/iiif/choices.py b/apps/iiif/choices.py index 16f647261..eb20b3d42 100644 --- a/apps/iiif/choices.py +++ b/apps/iiif/choices.py @@ -1,5 +1,5 @@ """ Collection of choices to be used in choice fields. """ -from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import gettext as _ class Choices(): """ Collection of choices to be used in choice fields. """ diff --git a/apps/iiif/kollections/migrations/0006_auto_20210928_1301.py b/apps/iiif/kollections/migrations/0006_auto_20210928_1301.py new file mode 100644 index 000000000..d5ff341a4 --- /dev/null +++ b/apps/iiif/kollections/migrations/0006_auto_20210928_1301.py @@ -0,0 +1,39 @@ +# Generated by Django 2.2.23 on 2021-09-28 13:01 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ('kollections', '0005_collection_updated_at'), + ] + + operations = [ + migrations.AlterField( + model_name='collection', + name='id', + field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name='collection', + name='label', + field=models.CharField(max_length=255), + ), + migrations.AlterField( + model_name='collection', + name='label_de', + field=models.CharField(max_length=255, null=True), + ), + migrations.AlterField( + model_name='collection', + name='label_en', + field=models.CharField(max_length=255, null=True), + ), + migrations.AlterField( + model_name='collection', + name='pid', + field=models.CharField(default='2qkkkqds', help_text="Unique ID. 
Do not use -'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/kollections/migrations/0007_auto_20211007_2031.py b/apps/iiif/kollections/migrations/0007_auto_20211007_2031.py new file mode 100644 index 000000000..3c52927fb --- /dev/null +++ b/apps/iiif/kollections/migrations/0007_auto_20211007_2031.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.24 on 2021-10-07 20:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('kollections', '0006_auto_20210928_1301'), + ] + + operations = [ + migrations.AlterField( + model_name='collection', + name='pid', + field=models.CharField(default='2qkqfvwv', help_text="Unique ID. Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/kollections/migrations/0008_auto_20211012_1612.py b/apps/iiif/kollections/migrations/0008_auto_20211012_1612.py new file mode 100644 index 000000000..ace99b5c1 --- /dev/null +++ b/apps/iiif/kollections/migrations/0008_auto_20211012_1612.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.24 on 2021-10-12 16:12 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('kollections', '0007_auto_20211007_2031'), + ] + + operations = [ + migrations.AlterField( + model_name='collection', + name='pid', + field=models.CharField(default='2qmwbpfz', help_text="Unique ID. 
Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/kollections/migrations/0009_auto_20211018_1913.py b/apps/iiif/kollections/migrations/0009_auto_20211018_1913.py new file mode 100644 index 000000000..f4bc2f7aa --- /dev/null +++ b/apps/iiif/kollections/migrations/0009_auto_20211018_1913.py @@ -0,0 +1,33 @@ +# Generated by Django 2.2.24 on 2021-10-18 19:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('kollections', '0008_auto_20211012_1612'), + ] + + operations = [ + migrations.AlterField( + model_name='collection', + name='label', + field=models.CharField(max_length=1000), + ), + migrations.AlterField( + model_name='collection', + name='label_de', + field=models.CharField(max_length=1000, null=True), + ), + migrations.AlterField( + model_name='collection', + name='label_en', + field=models.CharField(max_length=1000, null=True), + ), + migrations.AlterField( + model_name='collection', + name='pid', + field=models.CharField(default='2qnj9psx', help_text="Unique ID. Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/kollections/models.py b/apps/iiif/kollections/models.py index d3820dab7..c64ac7601 100644 --- a/apps/iiif/kollections/models.py +++ b/apps/iiif/kollections/models.py @@ -7,18 +7,13 @@ from django.contrib.postgres.fields import JSONField from django.core.files.base import ContentFile import config.settings.local as settings +from ..models import IiifBase -class Collection(models.Model): +class Collection(IiifBase): """Model for IIIF Collection.""" - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - label = models.CharField(max_length=255, help_text="Title of the collection.") summary = models.TextField( help_text="Description of the collection." ) - pid = models.CharField( - max_length=255, - help_text="Unique ID. Do not use -'s or spaces in the pid." 
- ) attribution = models.CharField( max_length=255, null=True, diff --git a/apps/iiif/kollections/tests/factories.py b/apps/iiif/kollections/tests/factories.py index 382861748..97a982b05 100644 --- a/apps/iiif/kollections/tests/factories.py +++ b/apps/iiif/kollections/tests/factories.py @@ -1,16 +1,16 @@ """ Factory to create collestions for testing. """ -import random from factory.django import DjangoModelFactory, ImageField from factory import Faker +from apps.utils.noid import encode_noid from ..models import Collection class CollectionFactory(DjangoModelFactory): """ Factory for mocking :class:`apps.iiif.kollections.models.Collection` objects. """ - pid = str(random.randrange(2000, 5000)) + pid = encode_noid() label = Faker("name") original = ImageField( width=1024, diff --git a/apps/iiif/manifests/migrations/0020_auto_20210915_1327.py b/apps/iiif/manifests/migrations/0020_auto_20210915_1327.py new file mode 100644 index 000000000..43e321179 --- /dev/null +++ b/apps/iiif/manifests/migrations/0020_auto_20210915_1327.py @@ -0,0 +1,19 @@ +# Generated by Django 2.2.23 on 2021-09-15 13:27 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0019_merge_20210913_1336'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='start_canvas', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='start_canvas', to='canvases.Canvas'), + ), + ] diff --git a/apps/iiif/manifests/migrations/0021_auto_20210928_1301.py b/apps/iiif/manifests/migrations/0021_auto_20210928_1301.py new file mode 100644 index 000000000..dae8f1c38 --- /dev/null +++ b/apps/iiif/manifests/migrations/0021_auto_20210928_1301.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.23 on 2021-09-28 13:01 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', 
'0020_auto_20210915_1327'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qkkkqds', help_text="Unique ID. Do not use -'s or spaces in the pid.", max_length=255, unique=True), + ), + ] diff --git a/apps/iiif/manifests/migrations/0022_auto_20210928_1322.py b/apps/iiif/manifests/migrations/0022_auto_20210928_1322.py new file mode 100644 index 000000000..f53fbca96 --- /dev/null +++ b/apps/iiif/manifests/migrations/0022_auto_20210928_1322.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.23 on 2021-09-28 13:22 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0021_auto_20210928_1301'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qkkn65b', help_text="Unique ID. Do not use -'s or spaces in the pid.", max_length=255, unique=True), + ), + ] diff --git a/apps/iiif/manifests/migrations/0023_auto_20210928_1325.py b/apps/iiif/manifests/migrations/0023_auto_20210928_1325.py new file mode 100644 index 000000000..74c72eebb --- /dev/null +++ b/apps/iiif/manifests/migrations/0023_auto_20210928_1325.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.23 on 2021-09-28 13:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0022_auto_20210928_1322'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qkknfvm', help_text="Unique ID. 
Do not use -'s or spaces in the pid.", max_length=255, unique=True), + ), + ] diff --git a/apps/iiif/manifests/migrations/0024_auto_20210928_1342.py b/apps/iiif/manifests/migrations/0024_auto_20210928_1342.py new file mode 100644 index 000000000..873d04528 --- /dev/null +++ b/apps/iiif/manifests/migrations/0024_auto_20210928_1342.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.23 on 2021-09-28 13:42 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0023_auto_20210928_1325'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qkkpmjn', help_text="Unique ID. Do not use -'s or spaces in the pid.", max_length=255, unique=True), + ), + ] diff --git a/apps/iiif/manifests/migrations/0025_auto_20210928_1427.py b/apps/iiif/manifests/migrations/0025_auto_20210928_1427.py new file mode 100644 index 000000000..accb63f02 --- /dev/null +++ b/apps/iiif/manifests/migrations/0025_auto_20210928_1427.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.23 on 2021-09-28 14:27 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0024_auto_20210928_1342'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qkksvw1', help_text="Unique ID. 
Do not use -'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/manifests/migrations/0026_auto_20210928_1447.py b/apps/iiif/manifests/migrations/0026_auto_20210928_1447.py new file mode 100644 index 000000000..0d34a2b66 --- /dev/null +++ b/apps/iiif/manifests/migrations/0026_auto_20210928_1447.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.23 on 2021-09-28 14:47 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0025_auto_20210928_1427'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qkkvbk3', help_text="Unique ID. Do not use -'s or spaces in the pid.", max_length=255, unique=True), + ), + ] diff --git a/apps/iiif/manifests/migrations/0027_auto_20210928_1450.py b/apps/iiif/manifests/migrations/0027_auto_20210928_1450.py new file mode 100644 index 000000000..dc97edfb9 --- /dev/null +++ b/apps/iiif/manifests/migrations/0027_auto_20210928_1450.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.23 on 2021-09-28 14:50 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0026_auto_20210928_1447'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qkkvj46', help_text="Unique ID. 
Do not use -'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/manifests/migrations/0028_auto_20211007_2031.py b/apps/iiif/manifests/migrations/0028_auto_20211007_2031.py new file mode 100644 index 000000000..a9d561217 --- /dev/null +++ b/apps/iiif/manifests/migrations/0028_auto_20211007_2031.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.24 on 2021-10-07 20:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0027_auto_20210928_1450'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qkqfvwv', help_text="Unique ID. Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/manifests/migrations/0029_auto_20211012_1612.py b/apps/iiif/manifests/migrations/0029_auto_20211012_1612.py new file mode 100644 index 000000000..c9454741b --- /dev/null +++ b/apps/iiif/manifests/migrations/0029_auto_20211012_1612.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.24 on 2021-10-12 16:12 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0028_auto_20211007_2031'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qmwbpfz', help_text="Unique ID. 
Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/manifests/migrations/0030_auto_20211018_1913.py b/apps/iiif/manifests/migrations/0030_auto_20211018_1913.py new file mode 100644 index 000000000..339b33927 --- /dev/null +++ b/apps/iiif/manifests/migrations/0030_auto_20211018_1913.py @@ -0,0 +1,23 @@ +# Generated by Django 2.2.24 on 2021-10-18 19:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0029_auto_20211012_1612'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='label', + field=models.CharField(max_length=1000), + ), + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default='2qnj9psx', help_text="Unique ID. Do not use _'s or spaces in the pid.", max_length=255), + ), + ] diff --git a/apps/iiif/manifests/models.py b/apps/iiif/manifests/models.py index 00480be62..98f6bd2ed 100644 --- a/apps/iiif/manifests/models.py +++ b/apps/iiif/manifests/models.py @@ -15,6 +15,7 @@ import config.settings.local as settings from ..choices import Choices from ..kollections.models import Collection +from..models import IiifBase JSONEncoder_olddefault = JSONEncoder.default # pylint: disable = invalid-name def JSONEncoder_newdefault(self, o): # pylint: disable = invalid-name @@ -46,7 +47,7 @@ def __str__(self): @property def bucket(self): - if self.storage_source == 's3': + if self.storage_service == 's3': s3 = resource('s3') return s3.Bucket(self.storage_path) return None @@ -73,16 +74,13 @@ def with_documents(self): vector = SearchVector(StringAgg('canvas__annotation__content', delimiter=' ')) return self.get_queryset().annotate(document=vector) -class Manifest(ClusterableModel): +class Manifest(IiifBase): """Model class for IIIF Manifest""" DIRECTIONS = ( ('left-to-right', 'Left to Right'), ('right-to-left', 'Right to Left') ) - id = models.UUIDField(primary_key=True, default=uuid4, 
editable=True) - pid = models.CharField(max_length=255) - label = models.CharField(max_length=255) summary = models.TextField() author = models.TextField(null=True) published_city = models.TextField(null=True) @@ -115,6 +113,7 @@ class Manifest(ClusterableModel): created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) autocomplete_search_field = 'label' + # TODO: This has to be removed/redone before we upgrade to Django 3 search_vector = SearchVectorField(null=True, editable=False) image_server = models.ForeignKey(ImageServer, on_delete=models.DO_NOTHING, null=True) objects = ManifestManager() @@ -125,7 +124,6 @@ class Manifest(ClusterableModel): blank=True, null=True ) - # image_server = models.ForeignKey(ImageServer, on_delete=models.CASCADE, null=True) def get_absolute_url(self): """Absolute URL for manifest @@ -203,22 +201,27 @@ def user_annotation_count(self, user=None): #update search_vector every time the entry updates def save(self, *args, **kwargs): # pylint: disable = arguments-differ - if '_' in self.pid: - self.pid = self.pid.replace('_', '-') + + if not self._state.adding and 'pid' in self.get_dirty_fields() and self.image_server and self.image_server.storage_service == 's3': + self.__rename_s3_objects() + + super().save(*args, **kwargs) + Canvas = apps.get_model('canvases.canvas') try: if self.start_canvas is None and hasattr(self, 'canvas_set') and self.canvas_set.exists(): - self.start_canvas = self.canvas_set.first() + print([c.position] for c in self.canvas_set.all()) + self.start_canvas = self.canvas_set.all().order_by('position').first() + self.save() except Canvas.DoesNotExist: self.start_canvas = None - super().save(*args, **kwargs) - if 'update_fields' not in kwargs or 'search_vector' not in kwargs['update_fields']: instance = self._meta.default_manager.with_documents().get(pk=self.pk) instance.search_vector = instance.document instance.save(update_fields=['search_vector']) + def delete(self, 
*args, **kwargs): """ When a manifest is delted, the related canvas objects are deleted (`on_delete`=models.CASCADE). @@ -231,6 +234,14 @@ def delete(self, *args, **kwargs): super().delete(*args, **kwargs) + def __rename_s3_objects(self): + original_pid = self.get_dirty_fields()['pid'] + keys = [f.key for f in self.image_server.bucket.objects.filter(Prefix=f'{original_pid}/')] + for key in keys: + obj = self.image_server.bucket.Object(key.replace(original_pid, self.pid)) + obj.copy({ 'Bucket': self.image_server.storage_path, 'Key': key }) + self.image_server.bucket.Object(key).delete() + # TODO: is this needed? class Note(models.Model): """Note for manifest""" diff --git a/apps/iiif/manifests/tests/factories.py b/apps/iiif/manifests/tests/factories.py index 7a5fd7c07..e93ed5975 100644 --- a/apps/iiif/manifests/tests/factories.py +++ b/apps/iiif/manifests/tests/factories.py @@ -2,6 +2,8 @@ from random import randrange from factory.django import DjangoModelFactory, ImageField from factory import Faker, RelatedFactory, SubFactory +from time import time +from apps.utils.noid import encode_noid from ..models import Manifest, ImageServer from ...canvases.tests.factories import CanvasFactory @@ -15,7 +17,7 @@ class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring class ManifestFactory(DjangoModelFactory): """Creates a Manifest object for testing.""" - pid = str(randrange(2000, 5000)) + pid = encode_noid() label = Faker("name") canvase = RelatedFactory(CanvasFactory, 'manifest') logo = ImageField(from_path='apps/iiif/canvases/tests/ecds.png') @@ -23,3 +25,13 @@ class ManifestFactory(DjangoModelFactory): class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring model = Manifest + +class EmptyManifestFactory(DjangoModelFactory): + """Creates a Manifest object for testing.""" + pid = encode_noid() + label = Faker("name") + logo = ImageField(from_path='apps/iiif/canvases/tests/ecds.png') + image_server = SubFactory(ImageServerFactory) 
+ + class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring + model = Manifest \ No newline at end of file diff --git a/apps/iiif/manifests/tests/test_model.py b/apps/iiif/manifests/tests/test_model.py index 680d935c1..28f9f6376 100644 --- a/apps/iiif/manifests/tests/test_model.py +++ b/apps/iiif/manifests/tests/test_model.py @@ -1,8 +1,11 @@ +from time import time +from apps.utils.noid import encode_noid from apps.iiif.canvases.models import Canvas from django.test import TestCase from apps.iiif.canvases.models import Canvas from apps.iiif.canvases.tests.factories import CanvasFactory -from .factories import ManifestFactory +from .factories import ManifestFactory, ImageServerFactory +from ..models import Manifest class TestManifestModel(TestCase): def test_delete_manifest_and_canvases(self): @@ -29,3 +32,33 @@ def test_delete_manifest_and_canvases(self): self.assertEqual(Canvas.objects.count(), initial_total_canvas_count) + def test_user_annotation_count_no_user(self): + """ Should return `None` when no user given. """ + manifest = ManifestFactory.create() + assert manifest.user_annotation_count() is None + + def test_no_canvases(self): + """ The start canvas should be `None` if no canvases. """ + manifest = Manifest() + manifest.save() + assert manifest.start_canvas is None + + def test_no_duplicate_pids(self): + """ Manifests should not be created with duplicate PIDs """ + for _ in range(0, 20): + Manifest.objects.create(pid='a-random-pid') + + pids = set([m.pid for m in Manifest.objects.all()]) + assert 'a-random-pid' in pids + assert len(pids) == Manifest.objects.all().count() + +class TestImageServerModel(TestCase): + def test_string_representation(self): + """ It should return teh `server_base` property when cast as `str`. """ + image_server = ImageServerFactory.create() + assert str(image_server) == image_server.server_base + + def test_bucket_is_none_when_not_s3(self): + """ Non-S3 image servers should not have a bucket. 
""" + image_server = ImageServerFactory.create() + assert image_server.bucket is None \ No newline at end of file diff --git a/apps/iiif/manifests/tests/test_views.py b/apps/iiif/manifests/tests/test_views.py index 7e3f59e23..673fc2120 100644 --- a/apps/iiif/manifests/tests/test_views.py +++ b/apps/iiif/manifests/tests/test_views.py @@ -2,6 +2,7 @@ ''' import json from datetime import datetime +from time import sleep from django.test import TestCase, Client from django.test import RequestFactory from django.conf import settings @@ -13,7 +14,7 @@ from ..views import ManifestSitemap, ManifestRis from ..models import Manifest from ..forms import JekyllExportForm -from .factories import ManifestFactory +from .factories import ManifestFactory, EmptyManifestFactory from ...canvases.models import Canvas from ...canvases.tests.factories import CanvasFactory @@ -29,16 +30,14 @@ class ManifestTests(TestCase): ] def setUp(self): - # fixtures = ['kollections.json', 'manifests.json', 'canvases.json', 'annotations.json'] self.user = get_user_model().objects.get(pk=111) self.factory = RequestFactory() self.client = Client() - # self.volume = Manifest.objects.get(pk='464d82f6-6ae5-4503-9afc-8e3cdd92a3f1') self.volume = ManifestFactory.create( publisher='ECDS', published_city='Atlanta' ) - for num in [1, 2, 3]: + for num in range(0, 3): CanvasFactory.create( manifest=self.volume, position=num @@ -53,11 +52,14 @@ def test_properties(self): assert self.volume.publisher_bib == 'Atlanta : ECDS' assert self.volume.thumbnail_logo.endswith("/media/logos/ecds.png") assert self.volume.baseurl.endswith("/iiif/v2/%s" % (self.volume.pid)) - assert self.volume.start_canvas.identifier.endswith("/iiif/%s/canvas/%s" % (self.volume.pid, self.start_canvas.pid)) + assert self.volume.start_canvas.identifier.endswith("/iiif/%s/canvas/%s" % (self.volume.pid, self.volume.start_canvas.pid)) def test_default_start_canvas(self): self.start_canvas.is_starting_page = False self.start_canvas.save() + 
self.volume.start_canvas = None + self.volume.save() + self.volume.refresh_from_db() assert self.volume.start_canvas.identifier.endswith("/iiif/%s/canvas/%s" % (self.volume.pid, self.default_start_canvas.pid)) def test_meta(self): @@ -97,15 +99,17 @@ def test_form_mode_choices_with_github(self): assert form.fields['mode'].choices[1][0] == 'github' def test_manifest_search_vector_exists(self): - assert self.volume.search_vector is None - self.volume.save() - self.volume.refresh_from_db() - assert self.volume.search_vector is not None + volume = ManifestFactory.create() + assert not self.volume.search_vector + volume.save() + volume.refresh_from_db() + assert volume.search_vector is not None def test_multiple_starting_canvases(self): - volume = ManifestFactory.create() - for num in range(4): - CanvasFactory.create(manifest=volume, is_starting_page=True) + volume = EmptyManifestFactory.create(canvas=None) + assert volume.canvas_set.exists() is False + for index, _ in enumerate(range(4)): + CanvasFactory.create(manifest=volume, is_starting_page=True, position=index+1) manifest = json.loads( serialize( 'manifest', @@ -116,6 +120,8 @@ def test_multiple_starting_canvases(self): ) ) first_canvas = volume.canvas_set.all().first() + assert volume.start_canvas.position <= 1 + assert first_canvas.position <= 1 assert first_canvas.pid in manifest['thumbnail']['@id'] def test_no_starting_canvases(self): diff --git a/apps/iiif/manifests/tests/tests.py b/apps/iiif/manifests/tests/tests.py index 8f09af93a..2d072cf6a 100644 --- a/apps/iiif/manifests/tests/tests.py +++ b/apps/iiif/manifests/tests/tests.py @@ -3,7 +3,10 @@ """ import json import random +import boto3 +from moto import mock_s3 from datetime import datetime +from time import sleep from django.test import TestCase, Client from django.test import RequestFactory from django.conf import settings @@ -12,6 +15,7 @@ from django.core.serializers import serialize from allauth.socialaccount.models import SocialAccount from 
iiif_prezi.loader import ManifestReader +from apps.utils.noid import encode_noid from ..views import ManifestSitemap, ManifestRis from ..models import Manifest, ImageServer, RelatedLink from ..forms import JekyllExportForm @@ -139,8 +143,10 @@ def test_manifest_search_vector_exists(self): def test_multiple_starting_canvases(self): volume = ManifestFactory.create() - for num in range(4): - CanvasFactory.create(manifest=volume, is_starting_page=True) + for index, _ in enumerate(range(4)): + CanvasFactory.create(manifest=volume, is_starting_page=True, position=index+1) + sleep(2) + # volume.refresh_from_db() manifest = json.loads( serialize( 'manifest', @@ -150,8 +156,8 @@ def test_multiple_starting_canvases(self): exportdate=datetime.utcnow() ) ) - first_canvas = volume.canvas_set.all().first() - assert first_canvas.pid in manifest['thumbnail']['@id'] + first_canvas = volume.canvas_set.all().order_by('position').first() + assert volume.start_canvas.pid in manifest['thumbnail']['@id'] def test_no_starting_canvases(self): manifest = ManifestFactory.create() @@ -159,12 +165,17 @@ def test_no_starting_canvases(self): manifest.canvas_set.all().get(is_starting_page=True) except Canvas.DoesNotExist as error: assert str(error) == 'Canvas matching query does not exist.' 
+ manifest.refresh_from_db() serialized_manifest = json.loads( serialize( 'manifest', [manifest] ) ) + print('*') + print(manifest.canvas_set.count()) + print([c.position for c in manifest.canvas_set.all()]) + print('*') assert manifest.canvas_set.all().first().pid in serialized_manifest['thumbnail']['@id'] def test_default_iiif_image_server_url(self): @@ -196,4 +207,21 @@ def test_serialized_related_links(self): assert 'seeAlso' in with_links.keys() assert isinstance(with_links['seeAlso'], list) assert len(with_links['seeAlso']) == 1 - assert with_links['seeAlso'][0] == 'images.org' \ No newline at end of file + assert with_links['seeAlso'][0] == 'images.org' + + @mock_s3 + def test_renameing_pid_when_images_are_in_s3(self): + """ It should copy the canvas files to a folder with new pid and delete old pid. """ + image_server = ImageServerFactory.create(storage_path='earthgang', storage_service='s3') + manifest = ManifestFactory(image_server=image_server) + conn = boto3.resource('s3', region_name='us-east-1') + conn.create_bucket(Bucket=image_server.storage_path) + original_pid = manifest.pid + image_server.bucket.upload_file('apps/iiif/canvases/fixtures/00000002.jpg', f'{manifest.pid}/00000002.jpg') + assert f'{manifest.pid}/00000002.jpg' in [f.key for f in image_server.bucket.objects.all()] + manifest.pid = encode_noid(0) + manifest.save() + manifest.refresh_from_db() + assert f'{original_pid}/00000002.jpg' not in [f.key for f in image_server.bucket.objects.all()] + assert original_pid not in [f.key for f in image_server.bucket.objects.all()] + assert f'{manifest.pid}/00000002.jpg' in [f.key for f in image_server.bucket.objects.all()] diff --git a/apps/iiif/models.py b/apps/iiif/models.py new file mode 100644 index 000000000..ba44646aa --- /dev/null +++ b/apps/iiif/models.py @@ -0,0 +1,37 @@ +from time import time +from uuid import uuid4, UUID +from dirtyfields import DirtyFieldsMixin +from django.db import models, IntegrityError +from modelcluster.models import 
ClusterableModel +from apps.utils.noid import encode_noid + +class IiifBase(DirtyFieldsMixin, ClusterableModel): + id = models.UUIDField(primary_key=True, default=uuid4, editable=True) + pid = models.CharField( + max_length=255, + default=encode_noid(), + blank=False, + help_text="Unique ID. Do not use _'s or spaces in the pid." + ) + label = models.CharField(max_length=1000) + + def save(self, *args, **kwargs): # pylint: disable = arguments-differ + self.clean_pid() + + if self._state.adding: + dup_pids = self.__class__.objects.filter(pid=self.pid) + if dup_pids.exists() or not self.pid: + self.dup_pid = self.pid + self.pid = encode_noid() + + + super().save(*args, **kwargs) + + def clean_pid(self): + """ Cantaloupe is generally configured substitute a slash (/) + with an underscore (_) for the file path of the images. """ + self.pid = self.pid.replace('_', '-') + + + class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring + abstract = True diff --git a/apps/ingest/admin.py b/apps/ingest/admin.py index e7448e7eb..4b29bb983 100644 --- a/apps/ingest/admin.py +++ b/apps/ingest/admin.py @@ -1,11 +1,18 @@ """[summary]""" import logging -from os import environ, path +from mimetypes import guess_type +from os import environ, path, remove, listdir, rmdir from django.contrib import admin +from django.core.files.storage import FileSystemStorage from django.shortcuts import redirect -import apps.ingest.tasks as tasks -from .models import Bulk, Local, Remote, Volume -from .services import create_manifest +from django.urls import reverse +from django.utils.html import format_html +from django_celery_results.models import TaskResult +from apps.ingest import tasks +from apps.ingest.storages import IngestStorage +from .models import Bulk, IngestTaskWatcher, Local, Remote +from .services import clean_metadata, create_manifest, get_associated_meta, get_metadata_from +from .forms import BulkVolumeUploadForm LOGGER = logging.getLogger(__name__) class 
LocalAdmin(admin.ModelAdmin): @@ -16,11 +23,24 @@ class LocalAdmin(admin.ModelAdmin): def save_model(self, request, obj, form, change): obj.save() obj.manifest = create_manifest(obj) + obj.creator = request.user obj.save() obj.refresh_from_db() super().save_model(request, obj, form, change) - if environ['DJANGO_ENV'] != 'test': - tasks.create_canvas_form_local_task.delay(obj.id) + if environ["DJANGO_ENV"] != 'test': + local_task_id = tasks.create_canvas_form_local_task.delay(obj.id) + local_task_result = TaskResult(task_id=local_task_id) + local_task_result.save() + file = request.FILES['bundle'] + IngestTaskWatcher.manager.create_watcher( + task_id=local_task_id, + task_result=local_task_result, + task_creator=request.user, + associated_manifest=obj.manifest, + filename=file.name + ) + else: + tasks.create_canvas_form_local_task(obj.id) def response_add(self, request, obj, post_url_continue=None): obj.refresh_from_db() @@ -40,27 +60,161 @@ def save_model(self, request, obj, form, change): obj.save() obj.refresh_from_db() super().save_model(request, obj, form, change) - if environ['DJANGO_ENV'] != 'test': - tasks.create_remote_canvases.delay(obj.id) + if environ["DJANGO_ENV"] != 'test': + remote_task_id = tasks.create_remote_canvases.delay(obj.id) + remote_task_result = TaskResult(task_id=remote_task_id) + remote_task_result.save() + IngestTaskWatcher.manager.create_watcher( + task_id=remote_task_id, + task_result=remote_task_result, + task_creator=request.user, + filename=obj.remote_url, + associated_manifest=obj.manifest + ) def response_add(self, request, obj, post_url_continue=None): obj.refresh_from_db() manifest_id = obj.manifest.id return redirect('/admin/manifests/manifest/{m}/change/'.format(m=manifest_id)) -class VolumeInline(admin.StackedInline): - model = Volume - extra = 1 - class BulkAdmin(admin.ModelAdmin): - inlines = [VolumeInline] + """Django admin ingest.models.bulk resource.""" + + form = BulkVolumeUploadForm def save_model(self, request, obj, 
form, change): + form.storage = IngestStorage() obj.save() + # Get files from multi upload form + files = request.FILES.getlist("volume_files") + # Find the metadata file and load it into list of dicts + all_metadata = get_metadata_from(files) + for file in files: + # Skip metadata file now + if 'metadata' in file.name.casefold() and 'zip' not in guess_type(file.name)[0]: + continue + + # Associate metadata with zipfile + if all_metadata is not None: + file_meta = clean_metadata(get_associated_meta(all_metadata, file)) + else: + file_meta = {} + + # Save in storage + bundle_path = form.storage.save( + path.join("bulk", str(obj.id), file.name), file + ) + + # Create local + new_local = Local.objects.create( + bulk=obj, + bundle=bundle_path, + image_server=obj.image_server, + metadata=file_meta, + creator=request.user + ) + new_local.save() + new_local.manifest = create_manifest(new_local) + new_local.save() + new_local.refresh_from_db() + if environ["DJANGO_ENV"] != 'test': + local_task_id = tasks.create_canvas_form_local_task.delay(new_local.id) + local_task_result = TaskResult(task_id=local_task_id) + local_task_result.save() + IngestTaskWatcher.manager.create_watcher( + task_id=local_task_id, + task_result=local_task_result, + task_creator=request.user, + associated_manifest=new_local.manifest, + filename=file.name + ) + + obj.refresh_from_db() + super().save_model(request, obj, form, change) + + def response_add(self, request, obj, post_url_continue=None): + # Delete local file + file_path = obj.volume_files.path + if path.isfile(file_path): + remove(file_path) + dir_path = file_path[0:file_path.rindex('/')] + if len(listdir(dir_path)) == 0: + rmdir(dir_path) + obj.delete() + return redirect("/admin/manifests/manifest/?o=-4") + + class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring + model = Bulk + +class TaskWatcherAdmin(admin.ModelAdmin): + """Django admin for ingest.models.IngestTaskWatcher resource.""" + + list_display = ( + "id", + 
"filename", + "task_name", + "task_status", + "task_creator", + "date_created", + "date_done", + ) + fields = ( + "id", + "filename", + "task_name", + "task_status", + "task_creator", + "date_created", + "date_done", + ) + list_filter = ('task_creator', 'task_result__task_name') + search_fields = ('filename',) + date_hierarchy = 'task_result__date_created' + empty_value_display = '(none)' + + def task_status(self, obj): + """ Returns the task result with a link to view its details """ + if obj.task_result: + url = reverse('admin:%s_%s_change' % ( + obj.task_result._meta.app_label, + obj.task_result._meta.model_name + ), args=[obj.task_result.id] ) + return format_html( + "{label}", + url=url, + label=obj.task_result.status + ) + return None + task_status.admin_order_field = 'task_result__status' + + def task_name(self, obj): + """ Returns the task name for this task """ + if obj.task_result: + return obj.task_result.task_name + return None + task_name.admin_order_field = 'task_result__task_name' + + def date_created(self, obj): + """ Returns the creation date for this task """ + if obj.task_result: + return obj.task_result.date_created + return None + date_created.admin_order_field = 'task_result__date_created' + + def date_done(self, obj): + """ Returns the finished date for this task """ + if obj.task_result: + return obj.task_result.date_done + return None + date_done.admin_order_field = 'task_result__date_done' + + def has_add_permission(self, request): + return False - for afile in request.FILES.getlist('photos_multiple'): - Volume.objects.create(bulk_id=obj.id, volume_file=afile) + def has_change_permission(self, request, obj=None): + return False admin.site.register(Local, LocalAdmin) admin.site.register(Remote, RemoteAdmin) admin.site.register(Bulk, BulkAdmin) +admin.site.register(IngestTaskWatcher, TaskWatcherAdmin) diff --git a/apps/ingest/celery.py b/apps/ingest/celery.py index bc7c48ce3..0b506b526 100644 --- a/apps/ingest/celery.py +++ 
b/apps/ingest/celery.py @@ -8,7 +8,7 @@ # set the default Django settings module for the 'celery' program. os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings.local') -app = Celery('apps.ingest') +app = Celery('apps.ingest', result_extended=True) # Using a string here means the worker will not have to # pickle the object when using Windows. diff --git a/apps/ingest/fixtures/metadata.csv b/apps/ingest/fixtures/metadata.csv new file mode 100644 index 000000000..35814bde6 --- /dev/null +++ b/apps/ingest/fixtures/metadata.csv @@ -0,0 +1,2 @@ +Filename,Label,Summary,Author,Published city,Published date,Publisher +no_meta_file,Test Bundle,Test file,Test author,Test City,2021,Pubilsher test \ No newline at end of file diff --git a/apps/ingest/fixtures/single-image.zip b/apps/ingest/fixtures/single-image.zip index b2671c9ee..f7ab31fb4 100644 Binary files a/apps/ingest/fixtures/single-image.zip and b/apps/ingest/fixtures/single-image.zip differ diff --git a/apps/ingest/forms.py b/apps/ingest/forms.py new file mode 100644 index 000000000..564e402ca --- /dev/null +++ b/apps/ingest/forms.py @@ -0,0 +1,11 @@ +from django import forms +from django.forms import ClearableFileInput +from .models import Bulk + +class BulkVolumeUploadForm(forms.ModelForm): + class Meta: + model = Bulk + fields = ['image_server', 'volume_files'] + widgets = { + 'volume_files': ClearableFileInput(attrs={'multiple': True}), + } diff --git a/apps/ingest/mail.py b/apps/ingest/mail.py new file mode 100644 index 000000000..60351458c --- /dev/null +++ b/apps/ingest/mail.py @@ -0,0 +1,68 @@ +from traceback import format_tb +from django.urls.base import reverse +from django.template.loader import get_template +from django.conf import settings +from django.core.mail import send_mail + +def send_email_on_failure(task_watcher=None, exception=None, traceback=None): + """Function to send an email on task success signal from Celery. 
+ + :param task_watcher: The task watcher object + :type task_watcher: app.ingest.models.TaskWatcher + :param exception: Exception instance raised + :type exception: Exception + :param traceback: Stack trace object + :type traceback: traceback + """ + context = {} + if task_watcher is not None: + context['filename'] = task_watcher.filename + if exception is not None: + context['exception'] = exception.__repr__() + if traceback is not None: + context['traceback'] = '\n'.join(format_tb(traceback)) + context['result_url'] = settings.HOSTNAME + reverse( + "admin:%s_%s_change" + % ( + task_watcher.task_result._meta.app_label, + task_watcher.task_result._meta.model_name, + ), + args=[task_watcher.task_result.id], + ) + html_email = get_template('ingest_failure_email.html').render(context) + text_email = get_template('ingest_failure_email.txt').render(context) + if task_watcher is not None and task_watcher.task_creator is not None: + send_mail( + '[Readux] Failed: Ingest ' + task_watcher.filename, + text_email, + settings.READUX_EMAIL_SENDER, + [task_watcher.task_creator.email], + fail_silently=False, + html_message=html_email + ) + +def send_email_on_success(task_watcher=None): + context = {} + if task_watcher is not None: + context['filename'] = task_watcher.filename + if task_watcher is not None and task_watcher.associated_manifest is not None: + context['manifest_url'] = settings.HOSTNAME + reverse( + 'admin:manifests_manifest_change', args=(task_watcher.associated_manifest.id,) + ) + context['manifest_pid'] = task_watcher.associated_manifest.pid + context['volume_url'] = task_watcher.associated_manifest.get_absolute_url() + else: + context['manifests_list_url'] = settings.HOSTNAME + reverse( + 'admin:manifests_manifest_changelist' + ) + html_email = get_template('ingest_success_email.html').render(context) + text_email = get_template('ingest_success_email.txt').render(context) + if task_watcher is not None and task_watcher.task_creator is not None: + send_mail( + 
'[Readux] Ingest complete: ' + task_watcher.filename, + text_email, + settings.READUX_EMAIL_SENDER, + [task_watcher.task_creator.email], + fail_silently=False, + html_message=html_email + ) \ No newline at end of file diff --git a/apps/ingest/migrations/0001_squashed_0005_auto_20201027_1653.py b/apps/ingest/migrations/0001_squashed_0005_auto_20201027_1653.py index 53235ac1f..54fe41309 100644 --- a/apps/ingest/migrations/0001_squashed_0005_auto_20201027_1653.py +++ b/apps/ingest/migrations/0001_squashed_0005_auto_20201027_1653.py @@ -22,8 +22,8 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('bundle', models.FileField(upload_to='')), - ('temp_file_path', models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpz0h5pz94')), - ('image_server', models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, to='canvases.IServer')), + ('temp_file_path', models.FilePathField(path='/tmp/tmpz0h5pz94')), + ('image_server', models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, to='manifests.ImageServer')), ('manifest', models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, to='manifests.Manifest')), ], ), diff --git a/apps/ingest/migrations/0002_auto_20201021_1610.py b/apps/ingest/migrations/0002_auto_20201021_1610.py index c580d3397..5aedf96ac 100644 --- a/apps/ingest/migrations/0002_auto_20201021_1610.py +++ b/apps/ingest/migrations/0002_auto_20201021_1610.py @@ -11,11 +11,6 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AddField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=None, path=apps.ingest.models.make_temp_file), - ), migrations.AlterField( model_name='local', name='bundle', diff --git a/apps/ingest/migrations/0002_auto_20210106_1658.py b/apps/ingest/migrations/0002_auto_20210106_1658.py index 730bc8df0..d7de0b6d6 100644 
--- a/apps/ingest/migrations/0002_auto_20210106_1658.py +++ b/apps/ingest/migrations/0002_auto_20210106_1658.py @@ -12,11 +12,6 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpeqhs0abf'), - ), migrations.CreateModel( name='Remote', fields=[ diff --git a/apps/ingest/migrations/0002_auto_20210107_2159.py b/apps/ingest/migrations/0002_auto_20210107_2159.py index c4690182a..7c91a5cb7 100644 --- a/apps/ingest/migrations/0002_auto_20210107_2159.py +++ b/apps/ingest/migrations/0002_auto_20210107_2159.py @@ -11,9 +11,5 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/var/folders/6q/m6_cn6w96g158vldpfhnn6k40000gn/T/tmpe_pzad5b'), - ), + ] diff --git a/apps/ingest/migrations/0003_auto_20201027_1452.py b/apps/ingest/migrations/0003_auto_20201027_1452.py index a4145cbaa..e6fa623ab 100644 --- a/apps/ingest/migrations/0003_auto_20201027_1452.py +++ b/apps/ingest/migrations/0003_auto_20201027_1452.py @@ -17,10 +17,5 @@ class Migration(migrations.Migration): model_name='local', name='image_server', field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='canvases.IServer'), - ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpps0kssmg'), - ), + ) ] diff --git a/apps/ingest/migrations/0003_auto_20210108_1619.py b/apps/ingest/migrations/0003_auto_20210108_1619.py index ddc76163a..526bc6232 100644 --- a/apps/ingest/migrations/0003_auto_20210108_1619.py +++ b/apps/ingest/migrations/0003_auto_20210108_1619.py @@ -11,9 +11,5 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - model_name='local', 
- name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmphgs7_kqb'), - ), + ] diff --git a/apps/ingest/migrations/0004_auto_20201027_1605.py b/apps/ingest/migrations/0004_auto_20201027_1605.py index f89f89ff1..1291e996b 100644 --- a/apps/ingest/migrations/0004_auto_20201027_1605.py +++ b/apps/ingest/migrations/0004_auto_20201027_1605.py @@ -11,9 +11,5 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmplg99z8r0'), - ), + ] diff --git a/apps/ingest/migrations/0004_auto_20210108_1922.py b/apps/ingest/migrations/0004_auto_20210108_1922.py index 17e3f80e8..32fa05958 100644 --- a/apps/ingest/migrations/0004_auto_20210108_1922.py +++ b/apps/ingest/migrations/0004_auto_20210108_1922.py @@ -11,9 +11,5 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmptgz6a_hb'), - ), + ] diff --git a/apps/ingest/migrations/0005_auto_20201027_1653.py b/apps/ingest/migrations/0005_auto_20201027_1653.py index 581c7f631..dbeb94a81 100644 --- a/apps/ingest/migrations/0005_auto_20201027_1653.py +++ b/apps/ingest/migrations/0005_auto_20201027_1653.py @@ -21,11 +21,6 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='local', name='image_server', - field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, to='canvases.IServer'), - ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpz0h5pz94'), - ), + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, to='manifests.ImageServer'), + ) ] diff --git 
a/apps/ingest/migrations/0005_auto_20210119_1936.py b/apps/ingest/migrations/0005_auto_20210119_1936.py index b0e805c66..da61d2c4c 100644 --- a/apps/ingest/migrations/0005_auto_20210119_1936.py +++ b/apps/ingest/migrations/0005_auto_20210119_1936.py @@ -18,9 +18,4 @@ class Migration(migrations.Migration): name='image_server', field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, to='manifests.ImageServer'), ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmp6pbyozq4'), - ), ] diff --git a/apps/ingest/migrations/0006_auto_20210122_1646.py b/apps/ingest/migrations/0006_auto_20210122_1646.py index fac6067ca..64d37a74e 100644 --- a/apps/ingest/migrations/0006_auto_20210122_1646.py +++ b/apps/ingest/migrations/0006_auto_20210122_1646.py @@ -11,9 +11,5 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpi8gubb4i'), - ), + ] diff --git a/apps/ingest/migrations/0008_auto_20210309_1840.py b/apps/ingest/migrations/0008_auto_20210309_1840.py index b9f3ae9f2..79c60a295 100644 --- a/apps/ingest/migrations/0008_auto_20210309_1840.py +++ b/apps/ingest/migrations/0008_auto_20210309_1840.py @@ -11,9 +11,5 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpwe52cbdj'), - ), + ] diff --git a/apps/ingest/migrations/0009_auto_20210805_1731.py b/apps/ingest/migrations/0009_auto_20210805_1731.py index 43b0bfe5d..bc58097ba 100644 --- a/apps/ingest/migrations/0009_auto_20210805_1731.py +++ b/apps/ingest/migrations/0009_auto_20210805_1731.py @@ -34,9 +34,4 @@ class Migration(migrations.Migration): name='remote', 
options={'verbose_name_plural': 'Remote'}, ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpz_6fa3al'), - ), ] diff --git a/apps/ingest/migrations/0010_auto_20210805_1743.py b/apps/ingest/migrations/0010_auto_20210805_1743.py index 007c45938..f974954ac 100644 --- a/apps/ingest/migrations/0010_auto_20210805_1743.py +++ b/apps/ingest/migrations/0010_auto_20210805_1743.py @@ -14,10 +14,5 @@ class Migration(migrations.Migration): migrations.RenameModel( old_name='BulkUploads', new_name='Volume', - ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmptvgpofrt'), - ), + ) ] diff --git a/apps/ingest/migrations/0011_auto_20210805_1748.py b/apps/ingest/migrations/0011_auto_20210805_1748.py index abab13049..38aee87a3 100644 --- a/apps/ingest/migrations/0011_auto_20210805_1748.py +++ b/apps/ingest/migrations/0011_auto_20210805_1748.py @@ -17,10 +17,5 @@ class Migration(migrations.Migration): name='bulk', field=models.ForeignKey(default='6b48744e-30a6-4de3-8ff5-5a527cf48e4e', on_delete=django.db.models.deletion.DO_NOTHING, to='ingest.Bulk'), preserve_default=False, - ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpouz1ixtm'), - ), + ) ] diff --git a/apps/ingest/migrations/0015_auto_20210811_1605.py b/apps/ingest/migrations/0015_auto_20210811_1605.py index a642eeaeb..6edcf2461 100644 --- a/apps/ingest/migrations/0015_auto_20210811_1605.py +++ b/apps/ingest/migrations/0015_auto_20210811_1605.py @@ -17,11 +17,6 @@ class Migration(migrations.Migration): name='bundle', field=models.FileField(storage=apps.ingest.storages.TmpStorage(), upload_to=''), ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - 
field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpcmngb1zt'), - ), migrations.AlterField( model_name='volume', name='volume_file', diff --git a/apps/ingest/migrations/0016_auto_20210812_1339.py b/apps/ingest/migrations/0016_auto_20210812_1339.py index a1b4cd641..4dc09b46f 100644 --- a/apps/ingest/migrations/0016_auto_20210812_1339.py +++ b/apps/ingest/migrations/0016_auto_20210812_1339.py @@ -15,10 +15,5 @@ class Migration(migrations.Migration): model_name='local', name='bundle_local', field=models.BooleanField(default=False), - ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmpr3y3av34'), - ), + ) ] diff --git a/apps/ingest/migrations/0017_auto_20210812_1358.py b/apps/ingest/migrations/0017_auto_20210812_1358.py index c74ee1c97..66437331a 100644 --- a/apps/ingest/migrations/0017_auto_20210812_1358.py +++ b/apps/ingest/migrations/0017_auto_20210812_1358.py @@ -19,10 +19,5 @@ class Migration(migrations.Migration): model_name='local', name='local_bundle_path', field=models.CharField(blank=True, max_length=100, null=True), - ), - migrations.AlterField( - model_name='local', - name='temp_file_path', - field=models.FilePathField(default=apps.ingest.models.make_temp_file, path='/tmp/tmp6nlm3j64'), - ), + ) ] diff --git a/apps/ingest/migrations/0020_auto_20210915_1327.py b/apps/ingest/migrations/0020_auto_20210915_1327.py new file mode 100644 index 000000000..c22e494ba --- /dev/null +++ b/apps/ingest/migrations/0020_auto_20210915_1327.py @@ -0,0 +1,29 @@ +# Generated by Django 2.2.23 on 2021-09-15 13:27 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0020_auto_20210915_1327'), + ('ingest', '0019_auto_20210819_1310'), + ] + + operations = [ + migrations.AlterModelOptions( + name='bulk', + options={'verbose_name_plural': 
'Bulk'}, + ), + migrations.AddField( + model_name='bulk', + name='image_server', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, to='manifests.ImageServer'), + ), + migrations.AlterField( + model_name='volume', + name='bulk', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='ingest.Bulk'), + ), + ] diff --git a/apps/ingest/migrations/0021_auto_20210915_1433.py b/apps/ingest/migrations/0021_auto_20210915_1433.py new file mode 100644 index 000000000..619066660 --- /dev/null +++ b/apps/ingest/migrations/0021_auto_20210915_1433.py @@ -0,0 +1,24 @@ +# Generated by Django 2.2.23 on 2021-09-15 14:33 + +import apps.ingest.models +import apps.ingest.storages +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingest', '0020_auto_20210915_1327'), + ] + + operations = [ + migrations.AddField( + model_name='bulk', + name='volume_files', + field=models.FileField(default='test', storage=apps.ingest.storages.IngestStorage(), upload_to=apps.ingest.models.bulk_path), + preserve_default=False, + ), + migrations.DeleteModel( + name='Volume', + ), + ] diff --git a/apps/ingest/migrations/0022_auto_20210915_1651.py b/apps/ingest/migrations/0022_auto_20210915_1651.py new file mode 100644 index 000000000..54b998785 --- /dev/null +++ b/apps/ingest/migrations/0022_auto_20210915_1651.py @@ -0,0 +1,25 @@ +# Generated by Django 2.2.23 on 2021-09-15 16:51 + +import apps.ingest.storages +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingest', '0021_auto_20210915_1433'), + ] + + operations = [ + migrations.AddField( + model_name='local', + name='bulk', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, related_name='local_uploads', to='ingest.Bulk'), + ), + migrations.AlterField( + model_name='bulk', + name='volume_files', + 
field=models.FileField(storage=apps.ingest.storages.IngestStorage(), upload_to=''), + ), + ] diff --git a/apps/ingest/migrations/0023_auto_20210916_1803.py b/apps/ingest/migrations/0023_auto_20210916_1803.py new file mode 100644 index 000000000..5d25b056a --- /dev/null +++ b/apps/ingest/migrations/0023_auto_20210916_1803.py @@ -0,0 +1,24 @@ +# Generated by Django 2.2.23 on 2021-09-16 18:03 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingest', '0022_auto_20210915_1651'), + ] + + operations = [ + migrations.AlterField( + model_name='bulk', + name='volume_files', + field=models.FileField(upload_to=''), + ), + migrations.AlterField( + model_name='local', + name='bulk', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='local_uploads', to='ingest.Bulk'), + ), + ] diff --git a/apps/ingest/migrations/0024_auto_20210916_1809.py b/apps/ingest/migrations/0024_auto_20210916_1809.py new file mode 100644 index 000000000..c187f29ea --- /dev/null +++ b/apps/ingest/migrations/0024_auto_20210916_1809.py @@ -0,0 +1,19 @@ +# Generated by Django 2.2.23 on 2021-09-16 18:09 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingest', '0023_auto_20210916_1803'), + ] + + operations = [ + migrations.AlterField( + model_name='local', + name='bulk', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='local_uploads', to='ingest.Bulk'), + ), + ] diff --git a/apps/ingest/migrations/0025_auto_20210920_2130.py b/apps/ingest/migrations/0025_auto_20210920_2130.py new file mode 100644 index 000000000..81d791c25 --- /dev/null +++ b/apps/ingest/migrations/0025_auto_20210920_2130.py @@ -0,0 +1,25 @@ +# Generated by Django 2.2.23 on 2021-09-20 21:30 + +import apps.ingest.models +import 
django.contrib.postgres.fields.jsonb +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingest', '0024_auto_20210916_1809'), + ] + + operations = [ + migrations.AddField( + model_name='local', + name='metadata', + field=django.contrib.postgres.fields.jsonb.JSONField(blank=True, default=dict), + ), + migrations.AlterField( + model_name='bulk', + name='volume_files', + field=models.FileField(upload_to=apps.ingest.models.bulk_path), + ), + ] diff --git a/apps/ingest/migrations/0026_remote_metadata.py b/apps/ingest/migrations/0026_remote_metadata.py new file mode 100644 index 000000000..59225d180 --- /dev/null +++ b/apps/ingest/migrations/0026_remote_metadata.py @@ -0,0 +1,19 @@ +# Generated by Django 2.2.23 on 2021-09-20 22:08 + +import django.contrib.postgres.fields.jsonb +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingest', '0025_auto_20210920_2130'), + ] + + operations = [ + migrations.AddField( + model_name='remote', + name='metadata', + field=django.contrib.postgres.fields.jsonb.JSONField(blank=True, default=dict), + ), + ] diff --git a/apps/ingest/migrations/0027_remove_local_local_bundle_path.py b/apps/ingest/migrations/0027_remove_local_local_bundle_path.py new file mode 100644 index 000000000..5e8ac7adf --- /dev/null +++ b/apps/ingest/migrations/0027_remove_local_local_bundle_path.py @@ -0,0 +1,17 @@ +# Generated by Django 2.2.23 on 2021-09-22 19:57 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingest', '0026_remote_metadata'), + ] + + operations = [ + migrations.RemoveField( + model_name='local', + name='local_bundle_path', + ), + ] diff --git a/apps/ingest/migrations/0028_ingesttaskwatcher.py b/apps/ingest/migrations/0028_ingesttaskwatcher.py new file mode 100644 index 000000000..ff69456d5 --- /dev/null +++ b/apps/ingest/migrations/0028_ingesttaskwatcher.py @@ -0,0 +1,29 @@ +# 
Generated by Django 2.2.23 on 2021-09-28 18:39 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ('ingest', '0027_remove_local_local_bundle_path'), + ] + + operations = [ + migrations.CreateModel( + name='IngestTaskWatcher', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('filename', models.CharField(max_length=255, null=True)), + ('task_id', models.CharField(max_length=255, null=True)), + ('task_creator', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, related_name='created_tasks', to=settings.AUTH_USER_MODEL)), + ('task_result', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='django_celery_results.TaskResult')), + ], + options={ + 'verbose_name_plural': 'IngestTaskWatcher', + }, + ), + ] diff --git a/apps/ingest/migrations/0029_auto_20210928_1851.py b/apps/ingest/migrations/0029_auto_20210928_1851.py new file mode 100644 index 000000000..2044fb009 --- /dev/null +++ b/apps/ingest/migrations/0029_auto_20210928_1851.py @@ -0,0 +1,20 @@ +# Generated by Django 2.2.23 on 2021-09-28 18:51 + +from django.db import migrations +import django.db.models.manager + + +class Migration(migrations.Migration): + + dependencies = [ + ('ingest', '0028_ingesttaskwatcher'), + ] + + operations = [ + migrations.AlterModelManagers( + name='ingesttaskwatcher', + managers=[ + ('manager', django.db.models.manager.Manager()), + ], + ), + ] diff --git a/apps/ingest/migrations/0030_auto_20211007_2031.py b/apps/ingest/migrations/0030_auto_20211007_2031.py new file mode 100644 index 000000000..ce9d6da7a --- /dev/null +++ b/apps/ingest/migrations/0030_auto_20211007_2031.py @@ -0,0 +1,25 @@ +# Generated by Django 2.2.24 on 2021-10-07 20:31 + +from django.conf import settings +from django.db 
import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ('ingest', '0029_auto_20210928_1851'), + ] + + operations = [ + migrations.AlterModelOptions( + name='ingesttaskwatcher', + options={'verbose_name_plural': 'Ingest Statuses'}, + ), + migrations.AddField( + model_name='local', + name='creator', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_locals', to=settings.AUTH_USER_MODEL), + ), + ] diff --git a/apps/ingest/migrations/0031_ingesttaskwatcher_associated_manifest.py b/apps/ingest/migrations/0031_ingesttaskwatcher_associated_manifest.py new file mode 100644 index 000000000..c6cbd4e36 --- /dev/null +++ b/apps/ingest/migrations/0031_ingesttaskwatcher_associated_manifest.py @@ -0,0 +1,20 @@ +# Generated by Django 2.2.24 on 2021-10-12 16:12 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('manifests', '0029_auto_20211012_1612'), + ('ingest', '0030_auto_20211007_2031'), + ] + + operations = [ + migrations.AddField( + model_name='ingesttaskwatcher', + name='associated_manifest', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='manifests.Manifest'), + ), + ] diff --git a/apps/ingest/models.py b/apps/ingest/models.py index 3179fd79b..6c10017db 100644 --- a/apps/ingest/models.py +++ b/apps/ingest/models.py @@ -1,56 +1,90 @@ """ Model classes for ingesting volumes. 
""" -import imghdr import os import uuid import logging -import httpretty -from boto3 import client, resource from io import BytesIO -from urllib.parse import urlparse, unquote from mimetypes import guess_type -from shutil import rmtree -from tempfile import gettempdir, mkdtemp -from zipfile import ZipFile +import httpretty +from stream_unzip import stream_unzip, TruncatedDataError +from boto3 import client from tablib import Dataset from django.db import models from django.conf import settings +from django.contrib.postgres.fields import JSONField +from django_celery_results.models import TaskResult from apps.iiif.canvases.models import Canvas from apps.iiif.canvases.tasks import add_ocr_task -from apps.iiif.canvases.services import add_ocr_annotations, get_ocr from apps.iiif.manifests.models import Manifest, ImageServer -import apps.ingest.services as services +from apps.ingest import services from apps.utils.fetch import fetch_url from .storages import IngestStorage LOGGER = logging.getLogger(__name__) -def make_temp_file(): - """Creates a temporary directory. 
+def bulk_path(instance, filename): + return os.path.join('bulk', str(instance.id), filename ) - :return: Absolute path to the temporary directory - :rtype: str - """ - temp_file = mkdtemp() - return temp_file +class IngestTaskWatcherManager(models.Manager): + """ Manager class for associating user and ingest data with a task result """ + def create_watcher(self, filename, task_id, task_result, task_creator, associated_manifest=None): + """ + Creates an instance of IngestTaskWatcher with provided params + """ + watcher = self.create( + filename=filename, + task_id=task_id, + task_result=task_result, + task_creator=task_creator, + associated_manifest=associated_manifest + ) + return watcher + + +class IngestTaskWatcher(models.Model): + """ Model class for associating user and ingest data with a task result """ + filename = models.CharField(max_length=255, null=True) + task_id = models.CharField(max_length=255, null=True) + task_result = models.ForeignKey(TaskResult, on_delete=models.CASCADE, null=True) + task_creator = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.CASCADE, + null=True, + related_name='created_tasks' + ) + associated_manifest = models.ForeignKey(Manifest, on_delete=models.SET_NULL, null=True) + manager = IngestTaskWatcherManager() + + class Meta: + verbose_name_plural = 'Ingest Statuses' + +class IngestAbstractModel(models.Model): + metadata = JSONField(default=dict, blank=True) + manifest = models.ForeignKey(Manifest, on_delete=models.DO_NOTHING, null=True) + + class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring + abstract = True -def bulk_path(instance, filename): - return os.path.join('bulk', str(instance.bulk.id), filename ) class Bulk(models.Model): + """ Model class for bulk ingesting volumes from local files. 
""" id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) + image_server = models.ForeignKey(ImageServer, on_delete=models.DO_NOTHING, null=True) + volume_files = models.FileField(blank=False, upload_to=bulk_path) -class Volume(models.Model): - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - bulk = models.ForeignKey(Bulk, on_delete=models.DO_NOTHING, null=False) - volume_file = models.FileField(storage=IngestStorage(), upload_to=bulk_path) + class Meta: + verbose_name_plural = 'Bulk' -class Local(models.Model): +class Local(IngestAbstractModel): """ Model class for ingesting a volume from local files. """ - # temp_file_path = models.FilePathField(path=make_temp_file(), default=make_temp_file) + bulk = models.ForeignKey(Bulk, related_name='local_uploads', on_delete=models.SET_NULL, null=True) bundle = models.FileField(blank=False, storage=IngestStorage()) image_server = models.ForeignKey(ImageServer, on_delete=models.DO_NOTHING, null=True) - manifest = models.ForeignKey(Manifest, on_delete=models.DO_NOTHING, null=True) - local_bundle_path = models.CharField(max_length=500, null=True, blank=True) + creator = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.SET_NULL, + null=True, + related_name='created_locals' + ) class Meta: verbose_name_plural = 'Local' @@ -61,135 +95,103 @@ def s3_client(self): return client('s3') return None - @property - def bucket(self): - s3 = resource('s3') - return s3.Bucket(self.image_server.storage_path) - - @property - def tmp_bucket(self): - return resource('s3').Bucket('readux-ingest') - - - @property - def zip_ref(self): - """Create a reference to the uploaded zip file. 
- - :return: zipfile.ZipFile object of uploaded - :rtype: zipfile.ZipFile - https://medium.com/@johnpaulhayes/how-extract-a-huge-zip-file-in-an-amazon-s3-bucket-by-using-aws-lambda-and-python-e32c6cf58f06 + def open_metadata(self): """ - if self.local_bundle_path and os.path.exists(self.local_bundle_path): - return ZipFile(self.local_bundle_path) - if self.local_bundle_path: - return self.__fallback_download() - try: - buffer = BytesIO(self.bundle.file.obj.get()['Body'].read()) - return ZipFile(buffer) - except OverflowError: - # TODO: Figure out how to test this. - return self.__fallback_download() - - @property - def metadata(self): - """ - Extract metadata from file. - :return: If metadata file exists, returns the values. If no file, returns None. - :rtype: dict or None + Set metadata property from extracted metadata from file. """ - metadata = None - for file in self.zip_ref.infolist(): - if 'metadata' in file.filename.casefold(): - - if file.is_dir(): - continue - if metadata is not None: - continue - if self.__is_junk(file.filename): - continue - if 'ocr' in file.filename.casefold(): - continue - if 'image' in file.filename.casefold(): - continue - - if 'csv' in guess_type(file.filename)[0] or 'tab-separated' in guess_type(file.filename)[0]: - data = self.zip_ref.read(file.filename) - metadata = Dataset().load(data.decode('utf-8-sig')) - else: - metadata = Dataset().load(self.zip_ref.read(file.filename)) - - if metadata is not None: - metadata = services.clean_metadata(metadata.dict[0]) - - return metadata + try: + for zipped_file, _, unzipped_chunks in stream_unzip(self.__zipped_chunks()): + _, file_name, file_type = self.__file_info(zipped_file) + tmp_file = bytes() + for chunk in unzipped_chunks: + if file_type and not self.__is_junk(file_name) and 'metadata' in file_name: + tmp_file += chunk + if len(tmp_file) > 0 and file_type and not self.__is_junk(file_name): + if 'csv' in file_type or 'tab-separated' in file_type: + metadata = 
Dataset().load(tmp_file.decode('utf-8-sig')) + elif 'officedocument' in file_type: + metadata = Dataset().load(BytesIO(tmp_file)) + if metadata is not None: + self.metadata = services.clean_metadata(metadata.dict[0]) + return + except TruncatedDataError: + # TODO: Why does `apps.ingest.tests.test_admin.IngestAdminTest.test_local_admin_save` raise this? + pass - def extract_images_s3(self): + def volume_to_s3(self): """ - Extract image files directly to S3 + Unzip and upload image and OCR files in the bundle, without loading the entire ZIP file + into memory or any of its uncompressed files. """ - if self.s3_client is None: - return + for zipped_file, _, unzipped_chunks in stream_unzip(self.__zipped_chunks()): + file_path, file_name, file_type = self.__file_info(zipped_file) + has_type_or_is_hocr = file_name.endswith('.hocr') or file_type + tmp_file = bytes() + for chunk in unzipped_chunks: + if has_type_or_is_hocr and not self.__is_junk(file_name): + tmp_file += chunk + if (file_type or file_name.endswith('.hocr')) and not self.__is_junk(file_name): + file_name = file_name.replace('_', '-') + if file_type and 'image' in file_type and 'images' in file_path: + self.image_server.bucket.upload_fileobj( + BytesIO(tmp_file), + f'{self.manifest.pid}/{file_name}' + ) + if file_type: + is_ocr_file_type = ( + 'text' in file_type + or 'xml' in file_type + or 'json' in file_type + or 'html' in file_type + ) + if 'ocr' in file_path and ( + file_name.endswith('.hocr') or is_ocr_file_type + ): + self.image_server.bucket.upload_fileobj( + BytesIO(tmp_file), + f'{self.manifest.pid}/_*ocr*_/{file_name}' + ) - for filename in self.zip_ref.namelist(): - if self.__is_junk(filename): - continue - type = guess_type(filename)[0] - if type is not None and 'image' in type: - # TODO: check if file already exists in S3. - # If it does, compare the hash and the S3 etag. - # Don't upload if files are the same. 
- self.s3_client.upload_fileobj( - self.zip_ref.open(filename), - Bucket=self.image_server.storage_path, - Key='{p}/{f}'.format(p=self.manifest.pid, f=filename.split("/")[-1].replace('_', '-')) - ) + @property + def file_list(self): + """Returns a list of files in the zip. Used for testing. - def extract_ocr_s3(self): + :return: List of files in zip. + :rtype: list """ - Locate and extract OCR files directly to S3 - """ - if self.s3_client is None: - return + files = [] + for zipped_file, file_size, unzipped_chunks in stream_unzip(self.__zipped_chunks()): + file_path, file_name, file_type = self.__file_info(zipped_file) + files.append(file_path) + # Not looping through the chunks throws an UnexpectedSignatureError + for chunk in unzipped_chunks: + pass - for file in self.zip_ref.infolist(): - if 'ocr' not in file.filename.casefold(): - continue - if 'metadata.' in file.filename: - # The metadata file could slip through. - # It's unlikely and will not hurt anything. - continue - if file.is_dir(): - continue - if self.__is_junk(file.filename): - continue - type = guess_type(file.filename)[0] - if type is not None and 'text' in type: - self.s3_client.upload_fileobj( - self.zip_ref.open(file.filename), - Bucket=self.image_server.storage_path, - Key='{p}/_*ocr*_/{f}'.format(p=self.manifest.pid, f=file.filename.split("/")[-1].replace('_', '-')) - ) + return files def create_canvases(self): """ Create Canvas objects for each image file. """ - self.extract_images_s3() - self.extract_ocr_s3() + self.volume_to_s3() image_files = [ - file.key for file in self.bucket.objects.filter(Prefix=self.manifest.pid) if '_*ocr*_' not in file.key + file.key for file in self.image_server.bucket.objects.filter(Prefix=f'{self.manifest.pid}/') if '_*ocr*_' not in file.key and file.key.split('/')[0] == self.manifest.pid ] if len(image_files) == 0: # TODO: Throw an error here? 
pass + ocr_files = [ - file.key for file in self.bucket.objects.filter(Prefix=self.manifest.pid) if '_*ocr*_' in file.key + file.key for file in self.image_server.bucket.objects.filter(Prefix=f'{self.manifest.pid}/') if '_*ocr*_' in file.key and file.key.split('/')[0] == self.manifest.pid ] for index, key in enumerate(sorted(image_files)): - image_file = key.split('/')[-1] + image_file = key.split('/')[-1].replace('_', '-') + + if not image_file: + continue LOGGER.debug(f'Creating canvas from {image_file}') @@ -199,7 +201,7 @@ def create_canvases(self): try: ocr_key = [key for key in ocr_files if image_name in key][0] - ocr_file_path = f'https://readux.s3.amazonaws.com/{ocr_key}' + ocr_file_path = ocr_key except IndexError: # Every image may not have a matching OCR file ocr_file_path = None @@ -215,46 +217,45 @@ def create_canvases(self): if created and canvas.ocr_file_path is not None: if os.environ['DJANGO_ENV'] == 'test': - ocr = get_ocr(canvas) - if ocr is not None: - add_ocr_annotations(canvas, ocr) + add_ocr_task(canvas.id) else: - add_ocr_task.delay(canvas.id) + ocr_task_id = add_ocr_task.delay(canvas.id) + ocr_task_result = TaskResult(task_id=ocr_task_id) + ocr_task_result.save() + IngestTaskWatcher.manager.create_watcher( + task_id=ocr_task_id, + task_result=ocr_task_result, + task_creator=self.creator, + filename=canvas.ocr_file_path + ) + if self.manifest.canvas_set.count() == len(image_files): - self.clean_up() + self.delete() else: # TODO: Log or though an error/waring? pass - def clean_up(self): - """ Method to clean up all the files. 
""" - if self.local_bundle_path and os.path.exists(self.local_bundle_path): - os.remove(self.local_bundle_path) - - self.delete() - @staticmethod def __is_junk(path): file = path.split('/')[-1] return file.startswith('.') or file.startswith('~') or file.startswith('__') - def __fallback_download(self): - self.local_bundle_path = os.path.join( - gettempdir(), - self.bundle.file.obj.key.split('/')[-1] - ) - - if os.path.isfile(self.local_bundle_path) is False: - self.bundle.file.obj.download_file(self.local_bundle_path) - self.save() + @staticmethod + def __file_info(path): + path = path.decode('UTF-8') + return [ + path, + path.split('/')[-1], + guess_type(path)[0] + ] - return ZipFile(self.local_bundle_path) + def __zipped_chunks(self): + yield from self.bundle.file.obj.get()['Body'].iter_chunks(chunk_size=10240) -class Remote(models.Model): +class Remote(IngestAbstractModel): """ Model class for ingesting a volume from remote manifest. """ remote_url = models.CharField(max_length=255) - manifest = models.ForeignKey(Manifest, on_delete=models.DO_NOTHING, null=True) class Meta: verbose_name_plural = 'Remote' @@ -286,13 +287,10 @@ def remote_manifest(self): ) return fetch_url(self.remote_url) - @property - def metadata(self): + def open_metadata(self): """ Take a remote IIIF manifest and create a derivative version. """ if self.remote_manifest['@context'] == 'http://iiif.io/api/presentation/2/context.json': - return services.parse_iiif_v2_manifest(self.remote_manifest) - - return None + self.metadata = services.parse_iiif_v2_manifest(self.remote_manifest) def create_canvases(self): # TODO: What if there are multiple sequences? Is that even allowed in IIIF? diff --git a/apps/ingest/services.py b/apps/ingest/services.py index e8b043d85..cb7052aa8 100644 --- a/apps/ingest/services.py +++ b/apps/ingest/services.py @@ -1,9 +1,11 @@ """ Module of service classes and methods for ingest. 
""" +from mimetypes import guess_type +from time import time from urllib.parse import unquote, urlparse -from uuid import uuid4 from django.apps import apps -from apps.iiif.manifests.models import Manifest +from tablib.core import Dataset from apps.iiif.manifests.models import Manifest, RelatedLink +from apps.utils.noid import encode_noid def clean_metadata(metadata): """Remove keys that do not aligin with Manifest fields. @@ -35,18 +37,21 @@ def create_manifest(ingest): manifest = None # Make a copy of the metadata so we don't extract it over and over. try: + if not bool(ingest.manifest) or ingest.manifest is None: + ingest.open_metadata() + metadata = dict(ingest.metadata) except TypeError: metadata = None - if metadata is not None: - manifest, created = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-')) + if metadata: + if 'pid' in metadata: + manifest, created = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-')) + else: + manifest = Manifest.objects.create() for (key, value) in metadata.items(): setattr(manifest, key, value) - # TODO: I'm not sure this is what we want to do - # if not created: - # manifest.canvas_set.all().delete() else: - manifest = Manifest(pid=str(uuid4())) + manifest = Manifest() manifest.image_server = ingest.image_server manifest.save() @@ -136,3 +141,41 @@ def parse_iiif_v2_canvas(canvas): 'label': label, 'resource': resource } + +def get_metadata_from(files): + """ + Find metadata file in uploaded files. + :return: If metadata file exists, returns the values. If no file, returns None. 
+ :rtype: list or None + """ + metadata = None + for file in files: + if metadata is not None: + continue + if 'zip' in guess_type(file.name)[0]: + continue + if 'metadata' in file.name.casefold(): + stream = file.read() + if 'csv' in guess_type(file.name)[0] or 'tab-separated' in guess_type(file.name)[0]: + metadata = Dataset().load(stream.decode('utf-8-sig')).dict + else: + metadata = Dataset().load(stream).dict + return metadata + +def get_associated_meta(all_metadata, file): + """ + Associate metadata with filename. + :return: If a matching filename is found, returns the row as dict, + with generated pid. Otherwise, returns {}. + :rtype: dict + """ + file_meta = {} + extless_filename = file.name[0:file.name.rindex('.')] + for meta_dict in all_metadata: + for key, val in meta_dict.items(): + if key.casefold() == 'filename': + metadata_found_filename = val + # Match filename column, case-sensitive, against filename + if metadata_found_filename and metadata_found_filename in (extless_filename, file.name): + file_meta = meta_dict + return file_meta diff --git a/apps/ingest/tasks.py b/apps/ingest/tasks.py index 43b1451a7..c046f57ae 100644 --- a/apps/ingest/tasks.py +++ b/apps/ingest/tasks.py @@ -3,9 +3,12 @@ """ Common tasks for ingest. """ import logging from celery import Celery +from celery.signals import task_success, task_failure from django.apps import apps from django.conf import settings +from apps.ingest.models import IngestTaskWatcher from .services import create_manifest +from .mail import send_email_on_failure, send_email_on_success # Use `apps.get_model` to avoid circular import error. Because the parameters used to # create a background task have to be serializable, we can't just pass in the model object. 
@@ -19,7 +22,7 @@ logging.getLogger('s3transfer').setLevel(logging.ERROR) logging.getLogger('factory').setLevel(logging.ERROR) -app = Celery('apps.ingest') +app = Celery('apps.ingest', result_extended=True) app.config_from_object('django.conf:settings') app.autodiscover_tasks(lambda: settings.INSTALLED_APPS) @@ -39,7 +42,6 @@ def create_canvas_form_local_task(ingest_id): local_ingest.refresh_from_db() local_ingest.create_canvases() - # Sometimes, the IIIF server is not ready to process the image by the time the canvas is saved to # the database. As a double check loop through to make sure the height and width has been saved. for canvas in local_ingest.manifest.canvas_set.all(): @@ -61,3 +63,29 @@ def create_remote_canvases(ingest_id, *args, **kwargs): remote_ingest.refresh_from_db() remote_ingest.create_canvases() + + +@task_failure.connect +def send_email_on_failure_task(sender=None, exception=None, task_id=None, traceback=None, *args, **kwargs): + """Function to send an email on task success signal from Celery. + + :param sender: The task object + :type sender: celery.task + """ + if sender is not None and 'creating_canvases_from_local' in sender.name: + task_watcher = IngestTaskWatcher.manager.get(task_id=task_id) + if task_watcher is not None: + send_email_on_failure(task_watcher, exception, traceback, *args, **kwargs) + +@task_success.connect +def send_email_on_success_task(sender=None, **kwargs): + """Function to send an email on task success signal from Celery. 
+ + :param sender: The task object + :type sender: celery.task + """ + if sender is not None and 'creating_canvases_from_local' in sender.name: + task_id = sender.request.id + task_watcher = IngestTaskWatcher.manager.get(task_id=task_id) + if task_watcher is not None: + send_email_on_success(task_watcher) diff --git a/apps/ingest/templates/admin/ingest/bulk/change_form.html b/apps/ingest/templates/admin/ingest/bulk/change_form.html new file mode 100644 index 000000000..6ef03a4cb --- /dev/null +++ b/apps/ingest/templates/admin/ingest/bulk/change_form.html @@ -0,0 +1,132 @@ +{% extends "admin/change_form.html" %} +{% load i18n admin_urls %} +{% load static %} + +{% block extrastyle %} + {{ block.super }} + +{% endblock %} +{% block submit_buttons_bottom %} +
+ + +
+{% endblock %} +{% block admin_change_form_document_ready %} + {{ block.super }} + +{% endblock %} +{% block content %} + {{ block.super }} +
+
Uploading...
+
0%
+
+ +
+
+

+ You must leave this window open during upload. +

+

+ Once upload completes, you will be sent to the list of manifests. + You may navigate away while the rest of the ingest completes; you + will receive an email to notify you when the ingest has completed. +

+
+
+{% endblock %} \ No newline at end of file diff --git a/apps/ingest/tests/factories.py b/apps/ingest/tests/factories.py index 759edd071..a580a66fe 100644 --- a/apps/ingest/tests/factories.py +++ b/apps/ingest/tests/factories.py @@ -1,11 +1,11 @@ from os.path import join -import boto3 -from moto import mock_s3 +from django_celery_results.models import TaskResult from factory.django import DjangoModelFactory, FileField from factory import Faker, SubFactory from django.conf import settings -from apps.iiif.manifests.tests.factories import ImageServerFactory -from apps.ingest.models import Local, Remote +from apps.users.tests.factories import UserFactory +from apps.iiif.manifests.tests.factories import ImageServerFactory, ManifestFactory +from apps.ingest.models import Bulk, Local, Remote, IngestTaskWatcher class LocalFactory(DjangoModelFactory): class Meta: @@ -14,11 +14,34 @@ class Meta: bundle = FileField(filename='bundle.zip', filepath=join(settings.APPS_DIR, 'ingest/fixtures/bundle.zip')) image_server = SubFactory(ImageServerFactory) manifest = None - local_bundle_path = None class RemoteFactory(DjangoModelFactory): class Meta: model = Remote manifest = None - remote_url = Faker('url') \ No newline at end of file + remote_url = Faker('url') + +class BulkFactory(DjangoModelFactory): + class Meta: + model = Bulk + + volume_files = FileField(filename='bundle.zip', filepath=join(settings.APPS_DIR, 'ingest/fixtures/bundle.zip')) + image_server = SubFactory(ImageServerFactory) + +class TaskResultFactory(DjangoModelFactory): + class Meta: + model = TaskResult + + task_id = '1' + task_name = 'fake_task' + +class IngestTaskWatcherFactory(DjangoModelFactory): + class Meta: + model = IngestTaskWatcher + + task_id = '1' + filename = Faker('file_path') + task_result = SubFactory(TaskResultFactory) + task_creator = SubFactory(UserFactory) + associated_manifest = SubFactory(ManifestFactory) diff --git a/apps/ingest/tests/test_admin.py b/apps/ingest/tests/test_admin.py index 
9719648a5..5c7a16d63 100644 --- a/apps/ingest/tests/test_admin.py +++ b/apps/ingest/tests/test_admin.py @@ -1,16 +1,21 @@ -from os import environ from os.path import join import boto3 -from moto import mock_s3 -from django.test import TestCase +from django.conf import settings from django.contrib.admin.sites import AdminSite -from django.core.files.uploadedfile import SimpleUploadedFile +from django.core import files from django.http import HttpResponseRedirect -from django.conf import settings +from django.test import TestCase +from django.test.client import RequestFactory +from django_celery_results.models import TaskResult +from moto import mock_s3 +from apps.ingest.forms import BulkVolumeUploadForm +from apps.iiif.canvases.models import Canvas +from apps.iiif.manifests.models import Manifest from apps.iiif.manifests.tests.factories import ManifestFactory, ImageServerFactory -from apps.ingest.models import Local, Remote -from apps.ingest.admin import LocalAdmin, RemoteAdmin -from .factories import LocalFactory, RemoteFactory +from apps.ingest.models import Bulk, Local, Remote, IngestTaskWatcher +from apps.ingest.admin import BulkAdmin, LocalAdmin, RemoteAdmin, TaskWatcherAdmin +from apps.users.tests.factories import UserFactory +from .factories import BulkFactory, LocalFactory, RemoteFactory, TaskResultFactory @mock_s3 class IngestAdminTest(TestCase): @@ -24,23 +29,55 @@ def setUp(self): storage_path='readux' ) + self.user = UserFactory.create(is_superuser=True) + + self.task_result = TaskResultFactory() + self.task_watcher = IngestTaskWatcher.manager.create_watcher( + task_id='1', + task_result=self.task_result, + task_creator=self.user, + filename='test_fake.zip' + ) + # Create fake bucket for moto's mock S3 service. 
conn = boto3.resource('s3', region_name='us-east-1') conn.create_bucket(Bucket='readux') conn.create_bucket(Bucket='readux-ingest') def test_local_admin_save(self): - """It should add a manifest to the Local object""" - local = LocalFactory.create(local_bundle_path=join(self.fixture_path, 'bundle.zip')) + """It should add a create a manifest and canvases and delete the Local object""" + local = LocalFactory.build( + image_server=self.image_server + ) + + original_manifest_count = Manifest.objects.count() + original_canvas_count = Canvas.objects.count() + + request_factory = RequestFactory() + + with open(join(self.fixture_path, 'no_meta_file.zip'), 'rb') as f: + content = files.base.ContentFile(f.read()) - assert local.manifest is None + local.bundle = files.File(content.file, 'no_meta_file.zip') + + req = request_factory.post('/admin/ingest/local/add/', data={}) + req.user = self.user local_model_admin = LocalAdmin(model=Local, admin_site=AdminSite()) - local_model_admin.save_model(obj=local, request=None, form=None, change=None) + local_model_admin.save_model(obj=local, request=req, form=None, change=None) + + # Saving should kick off the task to create the canvases and then delete + # the `Local` ingest object when done. 
+ try: + local.refresh_from_db() + assert False + except Local.DoesNotExist: + assert True - local.refresh_from_db() - assert local.manifest is not None - # assert local.manifest.canvas_set.count() == 10 + # A new `Manifest` should have been created along with the canvases + # in the ingest + assert Manifest.objects.count() == original_manifest_count + 1 + assert Canvas.objects.count() == original_canvas_count + 10 def test_local_admin_response_add(self): """It should redirect to new manifest""" @@ -79,3 +116,151 @@ def test_remote_admin_response_add(self): assert isinstance(response, HttpResponseRedirect) assert response.url == f'/admin/manifests/manifest/{remote.manifest.id}/change/' + + def test_bulk_admin_save(self): + """It should add a Local object to this Bulk object""" + bulk = BulkFactory.create() + + assert len(bulk.local_uploads.all()) == 0 + + request_factory = RequestFactory() + req = request_factory.post('/admin/ingest/bulk/add/') + req.user = self.user + + bulk_model_admin = BulkAdmin(model=Bulk, admin_site=AdminSite()) + mock_form = BulkVolumeUploadForm() + req.FILES['volume_files'] = bulk.volume_files + bulk_model_admin.save_model(obj=bulk, request=req, form=mock_form, change=None) + + bulk.refresh_from_db() + assert len(bulk.local_uploads.all()) == 1 + + def test_bulk_admin_save_multiple(self): + """It should add three Local objects to this Bulk object""" + bulk = BulkFactory.create() + + assert len(bulk.local_uploads.all()) == 0 + + # Add 3 files to POST request + data = {} + file_list = [bulk.volume_files] + filepath2 = join(settings.APPS_DIR, 'ingest/fixtures/bundle_with_underscores.zip') + with open(filepath2, 'rb') as f: + content1 = files.base.ContentFile(f.read()) + file2 = files.File(content1.file, 'bundle_with_underscores.zip') + filepath3 = join(settings.APPS_DIR, 'ingest/fixtures/no_meta_file.zip') + with open(filepath3, 'rb') as f: + content2 = files.base.ContentFile(f.read()) + file3 = files.File(content2.file, 'no_meta_file.zip') + 
file_list.append(file2) + file_list.append(file3) + data['volume_files'] = file_list + + request_factory = RequestFactory() + req = request_factory.post('/admin/ingest/bulk/add/', data=data) + req.user = self.user + + bulk_model_admin = BulkAdmin(model=Bulk, admin_site=AdminSite()) + mock_form = BulkVolumeUploadForm() + bulk_model_admin.save_model(obj=bulk, request=req, form=mock_form, change=None) + + bulk.refresh_from_db() + assert len(bulk.local_uploads.all()) == 3 + + def test_bulk_admin_response_add(self): + """It should delete the Bulk object and redirect to manifests list""" + + bulk = BulkFactory.create() + bulk_model_admin = BulkAdmin(model=Bulk, admin_site=AdminSite()) + response = bulk_model_admin.response_add(obj=bulk, request=None) + + with self.assertRaises(Bulk.DoesNotExist): + bulk.refresh_from_db() + assert isinstance(response, HttpResponseRedirect) + assert response.url == '/admin/manifests/manifest/?o=-4' + + def test_bulk_admin_with_external_metadata(self): + """It should add the metadata to the matching Local object""" + bulk = BulkFactory.create(image_server=self.image_server) + + data = {} + data['volume_files'] = [] + + # Mock upload metadata csv with matching pid for zip + filepath1 = join(settings.APPS_DIR, 'ingest/fixtures/metadata.csv') + with open(filepath1, 'rb') as open_file: + content1 = files.base.ContentFile(open_file.read()) + file1 = files.File(content1.file, 'metadata.csv') + data['volume_files'].append(file1) + + # Mock upload a zip with no metadata + filepath2 = join(settings.APPS_DIR, 'ingest/fixtures/no_meta_file.zip') + with open(filepath2, 'rb') as open_file: + content2 = files.base.ContentFile(open_file.read()) + file2 = files.File(content2.file, 'no_meta_file.zip') + data['volume_files'].append(file2) + + request_factory = RequestFactory() + req = request_factory.post('/admin/ingest/bulk/add/', data=data) + req.user = self.user + + bulk_model_admin = BulkAdmin(model=Bulk, admin_site=AdminSite()) + mock_form = 
BulkVolumeUploadForm() + bulk_model_admin.save_model(obj=bulk, request=req, form=mock_form, change=None) + + bulk.refresh_from_db() + + local = bulk.local_uploads.first() + assert local.metadata is not None + assert isinstance(local.metadata, dict) + assert len(local.metadata) != 0 + assert local.metadata['label'] == 'Test Bundle' + + + def test_task_watcher_admin_functions(self): + """It should get the appropriate values from the watcher's associated TaskResult""" + watcher = self.task_watcher + assert isinstance(watcher.task_result, TaskResult) + assert watcher.task_id == watcher.task_result.task_id + assert watcher.task_result.task_name == 'fake_task' + assert watcher.task_result.status == 'PENDING' + + watcher_admin = TaskWatcherAdmin(model=IngestTaskWatcher, admin_site=AdminSite()) + assert watcher_admin.task_name(watcher) == 'fake_task' + assert 'PENDING' in watcher_admin.task_status(watcher) + + + # def test_local_admin_save_update_manifest(self): + # """It should add a manifest to the Local object""" + # local = LocalFactory.create() + + # assert local.manifest is None + + # request_factory = RequestFactory() + + # with open(join(self.fixture_path, 'no_meta_file.zip'), 'rb') as f: + # content = files.base.ContentFile(f.read()) + + # local.bundle = files.File(content.file, 'no_meta_file.zip') + + # req = request_factory.post('/admin/ingest/local/add/', data={}) + + # local_model_admin = LocalAdmin(model=Local, admin_site=AdminSite()) + # local_model_admin.save_model(obj=local, request=req, form=None, change=None) + + # local.refresh_from_db() + + # assert local.manifest is not None + # assert not local.manifest.label + + # manifest = local.manifest + # new_label = Faker._get_faker().name() + # manifest.label = new_label + # manifest.save() + + # assert manifest.label == new_label + + # create_canvas_form_local_task(local.id) + # manifest.refresh_from_db() + + # assert manifest.label == new_label diff --git a/apps/ingest/tests/test_local.py 
b/apps/ingest/tests/test_local.py index 111c3a964..16dc318f1 100644 --- a/apps/ingest/tests/test_local.py +++ b/apps/ingest/tests/test_local.py @@ -15,6 +15,7 @@ from apps.iiif.manifests.tests.factories import ManifestFactory, ImageServerFactory from ..models import Local from ..services import create_manifest +from ..storages import IngestStorage pytestmark = pytest.mark.django_db(transaction=True) # pylint: disable = invalid-name @@ -25,7 +26,7 @@ def setUp(self): """ Set instance variables. """ self.fixture_path = join(settings.APPS_DIR, 'ingest/fixtures/') self.image_server = ImageServerFactory( - server_base='http://images.readux.ecds.emory', + server_base='http://readux.s3.amazonaws.com', storage_service='s3', storage_path='readux' ) @@ -34,10 +35,11 @@ def setUp(self): conn.create_bucket(Bucket=self.image_server.storage_path) conn.create_bucket(Bucket='readux-ingest') - def mock_local(self, bundle, with_manifest=False): + def mock_local(self, bundle, with_manifest=False, metadata={}): # Note, I tried to use the factory here, but could not get it to override the file for bundle. 
local = Local( - image_server = self.image_server + image_server = self.image_server, + metadata = metadata ) local.bundle = SimpleUploadedFile( name=bundle, @@ -58,29 +60,29 @@ def test_bundle_upload(self): for bundle in ['bundle.zip', 'nested_volume.zip', 'csv_meta.zip']: local = self.mock_local(bundle) - assert bundle in [f.key for f in local.tmp_bucket.objects.all()] + assert bundle in [f.key for f in IngestStorage().bucket.objects.all()] def test_image_upload_to_s3(self): - local = self.mock_local('bundle.zip', True) + local = self.mock_local('bundle.zip', with_manifest=True) - local.extract_images_s3() + local.volume_to_s3() - image_files = [f.key for f in local.bucket.objects.filter(Prefix=local.manifest.pid)] + image_files = [f.key for f in local.image_server.bucket.objects.filter(Prefix=local.manifest.pid)] assert f'{local.manifest.pid}/00000008.jpg' in image_files def test_ocr_upload_to_s3(self): - local = self.mock_local('nested_volume.zip', True) + local = self.mock_local('nested_volume.zip', with_manifest=True) - local.extract_ocr_s3() + local.volume_to_s3() - image_files = [f.key for f in local.bucket.objects.filter(Prefix=local.manifest.pid)] + ocr_files = [f.key for f in local.image_server.bucket.objects.filter(Prefix=local.manifest.pid)] - assert f'{local.manifest.pid}/_*ocr*_/00000008.tsv' in image_files + assert f'{local.manifest.pid}/_*ocr*_/00000008.tsv' in ocr_files def test_metadata_from_excel(self): """ It should create a manifest with metadat supplied in an Excel file. """ - local = self.mock_local('bundle.zip', True) + local = self.mock_local('bundle.zip', with_manifest=True) assert 'pid' in local.metadata.keys() @@ -89,7 +91,7 @@ def test_metadata_from_excel(self): def test_metadata_from_csv(self): """ It should create a manifest with metadata supplied in a CSV file. 
""" - local = self.mock_local('csv_meta.zip', True) + local = self.mock_local('csv_meta.zip', with_manifest=True) assert 'pid' in local.metadata.keys() @@ -98,7 +100,7 @@ def test_metadata_from_csv(self): def test_metadata_from_tsv(self): """ It should create a manifest with metadata supplied in a CSV file. """ - local = self.mock_local('tsv.zip', True) + local = self.mock_local('tsv.zip', with_manifest=True) assert 'pid' in local.metadata.keys() @@ -107,18 +109,18 @@ def test_metadata_from_tsv(self): def test_no_metadata_file(self): """ It should create a Manifest even when no metadata file is supplied. """ - local = self.mock_local('no_meta_file.zip', True) + local = self.mock_local('no_meta_file.zip', with_manifest=True) - assert UUID(local.manifest.pid).version == 4 + assert isinstance(local.manifest.pid, str) + assert len(local.manifest.pid) == 8 def test_single_image(self): - """ - """ - local = self.mock_local('single-image.zip', True) + """ It should work when only one image is present. """ + local = self.mock_local('single-image.zip', with_manifest=True) - local.extract_images_s3() + local.volume_to_s3() - image_files = [f.key for f in local.bucket.objects.filter(Prefix=local.manifest.pid)] + image_files = [f.key for f in local.image_server.bucket.objects.filter(Prefix=local.manifest.pid)] assert f'{local.manifest.pid}/0011.jpg' in image_files @@ -126,15 +128,15 @@ def test_removing_junk(self): """ Any hidden files should not be uploaded. 
""" - local = self.mock_local('bundle_with_junk.zip', True) + local = self.mock_local('bundle_with_junk.zip', with_manifest=True) - local.extract_images_s3() - local.extract_ocr_s3() + local.volume_to_s3() + local.volume_to_s3() - ingest_files = [f.key for f in local.bucket.objects.filter(Prefix=local.manifest.pid)] + ingest_files = [f.key for f in local.image_server.bucket.objects.filter(Prefix=local.manifest.pid)] - assert 'ocr/.junk.tsv' in [f.filename for f in local.zip_ref.infolist()] - assert 'images/.00000010.jpg' in [f.filename for f in local.zip_ref.infolist()] + assert 'ocr/.junk.tsv' in local.file_list + assert 'images/.00000010.jpg' in local.file_list assert f'{local.manifest.pid}/00000009.jpg' in ingest_files assert f'{local.manifest.pid}/.00000010.jpg' not in ingest_files assert f'{local.manifest.pid}/_*ocr*_/00000003.tsv' in ingest_files @@ -144,17 +146,16 @@ def test_removing_underscores(self): """ Any hidden files should be removed. """ - local = self.mock_local('bundle_with_underscores.zip', True) + local = self.mock_local('bundle_with_underscores.zip', with_manifest=True) - local.extract_images_s3() - local.extract_ocr_s3() + local.volume_to_s3() - ingest_files = [f.key for f in local.bucket.objects.filter(Prefix=local.manifest.pid)] + ingest_files = [f.key for f in local.image_server.bucket.objects.filter(Prefix=local.manifest.pid)] - underscore_files = [f.filename for f in local.zip_ref.infolist() if '_' in f.filename] + underscore_files = [f for f in local.file_list if '_' in f] assert len(underscore_files) == 10 - assert len([f.filename for f in local.zip_ref.infolist() if '-' in f.filename]) == 0 - for underscore in [f.filename for f in local.zip_ref.infolist() if '_' in f.filename]: + assert len([f for f in local.file_list if '-' in f]) == 0 + for underscore in [f for f in local.file_list if '_' in f]: assert underscore not in ingest_files def test_when_metadata_in_filename(self): @@ -162,13 +163,13 @@ def 
test_when_metadata_in_filename(self): Make sure it doesn't get get confused when the word "metadata" is in every path. """ - local = self.mock_local('metadata.zip', True) + local = self.mock_local('metadata.zip', with_manifest=True) - local.extract_images_s3() - local.extract_ocr_s3() + local.volume_to_s3() + local.volume_to_s3() - files_in_zip = [f.filename for f in local.zip_ref.infolist()] - ingest_files = [f.key for f in local.bucket.objects.filter(Prefix=local.manifest.pid)] + files_in_zip = local.file_list + ingest_files = [f.key for f in local.image_server.bucket.objects.filter(Prefix=local.manifest.pid)] assert 'metadata/images/' in files_in_zip assert all('metadata' in f for f in files_in_zip) @@ -190,10 +191,10 @@ def test_when_underscore_in_pid(self): pid='p_i_d' ) - local.extract_images_s3() - local.extract_ocr_s3() + local.volume_to_s3() + local.volume_to_s3() - ingest_files = [f.key for f in local.bucket.objects.filter(Prefix=local.manifest.pid)] + ingest_files = [f.key for f in local.image_server.bucket.objects.filter(Prefix=local.manifest.pid)] assert all('p-i-d' in f for f in ingest_files) @@ -202,7 +203,7 @@ def test_creating_canvases(self): Make sure it doesn't get get confused when the word "metadata" is in every path. 
""" - local = self.mock_local('bundle.zip', True) + local = self.mock_local('bundle.zip', with_manifest=True) local.create_canvases() pid = local.manifest.pid @@ -220,24 +221,34 @@ def test_creating_canvases(self): assert Canvas.objects.get(pid=f'{pid}_00000010.jpg').position == 10 - def test_it_downloads_zip_when_local_bundle_path_is_not_none(self): - local = self.mock_local('metadata.zip', True) - local.local_bundle_path = 'swoop' - files_in_zip = [f.filename for f in local.zip_ref.infolist()] - assert 'metadata/images/' in files_in_zip - assert exists(join(gettempdir(), 'metadata.zip')) - assert local.local_bundle_path == join(gettempdir(), 'metadata.zip') - - def test_it_cleans_up(self): - local = self.mock_local('single-image.zip', True) - local.local_bundle_path = 'swoop' - local.zip_ref - assert exists(join(gettempdir(), 'single-image.zip')) - local.create_canvases() - manifest = Manifest.objects.get(pk=local.manifest.id) - assert manifest.canvas_set.count() == 1 - assert exists(join(gettempdir(), 'single-image.zip')) is False - try: - Local.objects.get(pk=local.id) - except Local.DoesNotExist: - pass + # def test_it_downloads_zip_when_local_bundle_path_is_not_none(self): + # local = self.mock_local('metadata.zip', with_manifest=True) + # local.local_bundle_path = 'swoop' + # files_in_zip = local.file_list + # assert 'metadata/images/' in files_in_zip + # assert exists(join(gettempdir(), 'metadata.zip')) + # assert local.local_bundle_path == join(gettempdir(), 'metadata.zip') + + # def test_it_cleans_up(self): + # local = self.mock_local('single-image.zip', with_manifest=True) + # local.local_bundle_path = 'swoop' + # local.zip_ref + # assert exists(join(gettempdir(), 'single-image.zip')) + # local.create_canvases() + # manifest = Manifest.objects.get(pk=local.manifest.id) + # assert manifest.canvas_set.count() == 1 + # assert exists(join(gettempdir(), 'single-image.zip')) is False + # try: + # Local.objects.get(pk=local.id) + # except Local.DoesNotExist: + 
# pass + + def test_it_creates_mainfest_with_metadata_property(self): + metadata = { + 'pid': '808', + 'title': 'Goodie Mob' + } + local = self.mock_local('no_meta_file.zip', metadata=metadata) + local.manifest = create_manifest(local) + assert local.manifest.pid == '808' + assert local.manifest.title == 'Goodie Mob' diff --git a/apps/ingest/tests/test_mail.py b/apps/ingest/tests/test_mail.py new file mode 100644 index 000000000..dd37f7c8c --- /dev/null +++ b/apps/ingest/tests/test_mail.py @@ -0,0 +1,33 @@ +from html import escape +from traceback import format_tb +from django.core import mail +from django.test import TestCase +from apps.utils.fake_traceback import FakeException, FakeTraceback +from ..mail import send_email_on_failure, send_email_on_success +from .factories import IngestTaskWatcherFactory + +class MailTest(TestCase): + def test_send_failure_email(self): + task_watcher = IngestTaskWatcherFactory.create() + fake_tb = FakeTraceback() + fake_exc = FakeException('error').with_traceback(fake_tb) + + send_email_on_failure(task_watcher, fake_exc, fake_tb) + + # Test that one message has been sent. + self.assertEqual(len(mail.outbox), 1) + + # Verify that the subject of the first message is correct. + self.assertEqual(mail.outbox[0].subject, f'[Readux] Failed: Ingest {task_watcher.filename}') + assert escape(format_tb(fake_tb)[0]) in mail.outbox[0].body + + def test_send_success_email(self): + task_watcher = IngestTaskWatcherFactory.create() + + send_email_on_success(task_watcher) + + # Test that one message has been sent. + self.assertEqual(len(mail.outbox), 1) + + # Verify that the subject of the first message is correct. 
+ self.assertEqual(mail.outbox[0].subject, f'[Readux] Ingest complete: {task_watcher.filename}') diff --git a/apps/ingest/tests/test_remote.py b/apps/ingest/tests/test_remote.py index ebd9babe7..ccc9a2aa9 100644 --- a/apps/ingest/tests/test_remote.py +++ b/apps/ingest/tests/test_remote.py @@ -17,6 +17,7 @@ def test_metadata_from_remote_manifest(self): manifest=ManifestFactory.create(), remote_url='https://iiif.archivelab.org/iiif/09359080.4757.emory.edu/manifest.json' # pylint: disable=line-too-long ) + remote.open_metadata() assert remote.metadata['pid'] == '09359080.4757.emory.edu' assert remote.metadata['label'] == 'Address by Hon. Frederick Douglass' assert remote.metadata['summary'] == 'Respect Frederick Douglass.' diff --git a/apps/ingest/tests/test_services.py b/apps/ingest/tests/test_services.py index f93a00020..331d8e614 100644 --- a/apps/ingest/tests/test_services.py +++ b/apps/ingest/tests/test_services.py @@ -1,14 +1,17 @@ """ Tests for ingest.services """ import os import json +import boto3 +from moto import mock_s3 from django.test import TestCase from django.core.files.uploadedfile import SimpleUploadedFile from django.conf import settings +from factory.django import FileField from apps.iiif.canvases.tests.factories import CanvasFactory from apps.iiif.manifests.models import Manifest from apps.iiif.manifests.tests.factories import ManifestFactory, ImageServerFactory import apps.ingest.services as services -from .factories import RemoteFactory +from .factories import LocalFactory, RemoteFactory class ServicesTest(TestCase): """ Tests for ingest.services """ @@ -80,3 +83,26 @@ def test_parse_v2_manifest_with_label_as_list(self): data = json.loads(open(os.path.join(settings.APPS_DIR, 'ingest/fixtures/manifest-label-as-array.json')).read()) metadata = services.parse_iiif_v2_manifest(data) self.assertEqual(metadata['label'], 'Address by American Hero Frederick Douglass') + + @mock_s3 + def test_when_pid_not_in_metadata(self): + image_server = 
ImageServerFactory.create() + conn = boto3.resource('s3', region_name='us-east-1') + conn.create_bucket(Bucket=image_server.storage_path) + conn.create_bucket(Bucket='readux-ingest') + for _ in range(1, 5): + ManifestFactory.create() + local = LocalFactory.create( + image_server=image_server, + bundle=FileField( + filename='no_meta_file.zip', + filepath=os.path.join(settings.APPS_DIR, 'ingest/fixtures/no_meta_file.zip') + ) + ) + local.metadata['label'] = 'Southernplayalisticadillacmuzik' + local.manifest = None + assert 'pid' not in local.metadata + assert dict(local.metadata) is not None + local.manifest = services.create_manifest(local) + assert local.manifest.label == 'Southernplayalisticadillacmuzik' + assert local.manifest.pid is not None diff --git a/apps/readux/__init__.py b/apps/readux/__init__.py index 55fa725bd..04188a16d 100644 --- a/apps/readux/__init__.py +++ b/apps/readux/__init__.py @@ -1 +1 @@ -__version__ = '2.1.1' +__version__ = '2.2.0' diff --git a/apps/readux/views.py b/apps/readux/views.py index 8e32ff31d..118af9f4b 100644 --- a/apps/readux/views.py +++ b/apps/readux/views.py @@ -226,6 +226,7 @@ def get_context_data(self, **kwargs): # context['all'] = True context['page'] = canvas context['volume'] = manifest + context['pagelink'] = manifest.image_server context['collectionlink'] = Page.objects.type(CollectionsPage).first() context['volumelink'] = Page.objects.type(VolumesPage).first() context['user_annotation_page_count'] = UserAnnotation.objects.filter( diff --git a/apps/static/js/vue-readux.js b/apps/static/js/vue-readux.js index fa91c8095..ef656e8b9 100644 --- a/apps/static/js/vue-readux.js +++ b/apps/static/js/vue-readux.js @@ -199,7 +199,7 @@ Vue.component("v-info-content-url-multiple", { // url copy component made for when the url is modified externally (outside Vue.js) Vue.component("v-info-content-url-external", { - props: ["label", "url"], + props: ["label", "url", "volume"], data: function () { return { localUrl: this.url, @@ 
-237,10 +237,114 @@ Vue.component("v-info-content-url-external", { var protocol = window.location.protocol; var host = window.location.host; var canvas = event.detail.canvas; - var volume = event.detail.volume; + var volume = vm.volume; var url = - protocol + "//" + host + "/volume/" + volume + "/page/" + canvas; + protocol + "//" + host + volume + "/page/" + canvas; vm.localUrl = url; + vm.vol = volume; + } + }); + }, +}); + +// url copy component made for when the url is modified externally (outside Vue.js) - trying image link +Vue.component("v-info-content-url-image-link", { + props: ["label", "pagelink"], + data: function () { + return { + localUrls: this.url, + pageresource: this.pageresource, + }; + }, + template: ` +
+ + +
+ `, + methods: { + onCopy() { + alert(`You have copied: ${this.localUrls}`); + }, + onError() { + alert(`Something went wrong with copy.`); + }, + }, + mounted() { + var vm = this; + window.addEventListener("canvasswitch", function (event) { + if (event.detail) { + var protocol = window.location.protocol; + var host = window.location.host; + var canvas = event.detail.canvas; + var volume = event.detail.volume; + var localpagelink = vm.pagelink; + axios.get(`iiif/resource/${event.detail.canvas}`) + .then(response => { + console.log(response.data.resource); + console.log(response.data.text); + vm.pageresource = response.data.resource; + vm.pagetext = response.data.text; + }).catch(error => {console.log(error);}) + var url = + localpagelink + "/" + canvas + "/full/full/0/default.jpg"; + vm.localUrls = url; + vm.can = canvas; + } + }); + }, +}); + +// adapted from (url copy component made for when the url is modified externally (outside Vue.js)) - now page text modal +Vue.component("v-info-content-url-page-text", { + props: [], + data: function () { + return { + pagetext: this.pagetext, + }; + }, + template: ` +
+
+ +

Text

+

{{pagetext}}

+
+
+ `, + methods: { + }, + mounted() { + var vm = this; + window.addEventListener("canvasswitch", function (event) { + if (event.detail) { + var protocol = window.location.protocol; + var host = window.location.host; + var canvas = event.detail.canvas; + var volume = event.detail.volume; + var localpagelink = vm.pagelink; + axios.get(`iiif/resource/${event.detail.canvas}`) + .then(response => { + console.log(response.data.resource); + console.log(response.data.text); + vm.pageresource = response.data.resource; + vm.pagetext = response.data.text; + }).catch(error => {console.log(error);}) + var url = + localpagelink + "/" + canvas + "/full/full/0/default.jpg"; + vm.localUrls = url; + vm.can = canvas; } }); }, @@ -270,7 +374,7 @@ var readux = new Vue({ toggleMoreInfo: function(){ this.showMoreInfo = !this.showMoreInfo } - + // ascaddURL: function(element) { // $(element).attr('href', function () { // if (window.location.search.length == 0) { @@ -316,4 +420,3 @@ var readux = new Vue({ } } }); - diff --git a/apps/templates/admin/ingest/bulk/change_form.html b/apps/templates/admin/ingest/bulk/change_form.html deleted file mode 100644 index f0b810255..000000000 --- a/apps/templates/admin/ingest/bulk/change_form.html +++ /dev/null @@ -1,10 +0,0 @@ -{% extends "admin/change_form.html" %} -{% block inline_field_sets %} -{% for inline_admin_formset in inline_admin_formsets %} - {% include inline_admin_formset.opts.template %} -{% endfor %} -
-

Upload Volumes

- -
-{% endblock %} \ No newline at end of file diff --git a/apps/templates/base.html b/apps/templates/base.html index 01c60bcfb..8a89687e9 100644 --- a/apps/templates/base.html +++ b/apps/templates/base.html @@ -1,6 +1,8 @@ {% load static i18n %} {% load menu_tags %} {% load sass_tags %} +{% load wagtailcore_tags %} +{% wagtail_site as current_site %} @@ -53,7 +55,7 @@ {% endif %} - + @@ -148,9 +150,9 @@

\ No newline at end of file diff --git a/apps/templates/ingest_failure_email.txt b/apps/templates/ingest_failure_email.txt new file mode 100644 index 000000000..fd5205af1 --- /dev/null +++ b/apps/templates/ingest_failure_email.txt @@ -0,0 +1,9 @@ +Readux +------- + +Ingest failed: {{ filename }} + +Details: +- Error: {{ exception }} +- Result URL: {{ result_url }} +- Traceback: {{ traceback }} \ No newline at end of file diff --git a/apps/templates/ingest_success_email.html b/apps/templates/ingest_success_email.html new file mode 100644 index 000000000..e8c761c8b --- /dev/null +++ b/apps/templates/ingest_success_email.html @@ -0,0 +1,18 @@ +

+ Readux +

+ +

Ingest complete: {{ filename }}

+
+ Details: + +
\ No newline at end of file diff --git a/apps/templates/ingest_success_email.txt b/apps/templates/ingest_success_email.txt new file mode 100644 index 000000000..dbf434900 --- /dev/null +++ b/apps/templates/ingest_success_email.txt @@ -0,0 +1,13 @@ +Readux +------- + +Ingest complete: {{ filename }} + +Details: +{% if manifest_url %} +- Manifest PID: {{ manifest_pid }} +- Edit manifest: {{ manifest_url }} +- Link to volume: {{ volume_url }} +{% else %} +- List of manifests: {{ manifests_list_url }} +{% endif %} \ No newline at end of file diff --git a/apps/templates/page.html b/apps/templates/page.html index 793e22629..a89ca2583 100644 --- a/apps/templates/page.html +++ b/apps/templates/page.html @@ -324,6 +324,21 @@

target="_blank">Send to Voyant + + + + + + + + + @@ -346,11 +361,15 @@

url="https://{{ request.META.HTTP_HOST }}{% url 'volume' volume.pid %}/page/all"> + + + + {% if "/page/all" in request.get_full_path in request.get_full_path %} - + {% else %} - + {% endif %} diff --git a/apps/users/forms.py b/apps/users/forms.py index 11c0bb450..4721cc1f8 100644 --- a/apps/users/forms.py +++ b/apps/users/forms.py @@ -1,6 +1,6 @@ from django.contrib.auth import get_user_model, forms from django.core.exceptions import ValidationError -from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import gettext as _ from allauth.socialaccount.forms import SignupForm from django.forms import CharField, BooleanField, TextInput @@ -55,7 +55,7 @@ class ReaduxSocialSignupForm(SignupForm): class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring model = User fields = ['name'] - + name = CharField(max_length=30, label='User Name to associate with Annotations') agree = BooleanField(label='Check this box to confirm that you agree to the full Readux Terms of Service found on the Terms of Service page. By creating an account on Readux, you acknowledge that: Any information I personally create and enter (from here "Data") will be stored and may be accessible to site administrators. My Data will not be publicly accessible unless I elect to make my Data public. The host of this site and the makers of Readux are not responsible for ensuring the stability or privacy of my Data. 
I agree to the full Readux Terms of Service.') diff --git a/apps/users/models.py b/apps/users/models.py index dec8e66b4..153006a6c 100644 --- a/apps/users/models.py +++ b/apps/users/models.py @@ -1,7 +1,7 @@ from django.contrib.auth.models import AbstractUser from django.db.models import CharField, BooleanField from django.urls import reverse -from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import gettext as _ class User(AbstractUser): diff --git a/apps/users/tests/factories.py b/apps/users/tests/factories.py index 5c7bbe4a7..54b9884ad 100644 --- a/apps/users/tests/factories.py +++ b/apps/users/tests/factories.py @@ -9,6 +9,9 @@ class UserFactory(DjangoModelFactory): username = Faker("user_name") email = Faker("email") name = Faker("name") + is_superuser = False + is_staff = False + is_active = True @post_generation def password(self, create: bool, extracted: Sequence[Any], **kwargs): diff --git a/apps/utils/fake_traceback.py b/apps/utils/fake_traceback.py new file mode 100644 index 000000000..6e2a30f23 --- /dev/null +++ b/apps/utils/fake_traceback.py @@ -0,0 +1,54 @@ +""" +Utility to create fake traceback for testing. + +This is probably overkill and likely not the best way. 
+ +Taken from Stack Overflow: https://stackoverflow.com/a/19258720/1792144 + +""" + +from random import randint + +class FakeCode(object): + def __init__(self, co_filename, co_name): + self.co_filename = co_filename + self.co_name = co_name + + +class FakeFrame(object): + def __init__(self, f_code, f_globals): + self.f_code = f_code + self.f_globals = f_globals + + +class FakeTraceback(object): + def __init__(self, frames=[FakeFrame(FakeCode("made_up_filename.py", "non_existent_function"), {})], line_nums=[randint(1, 100)]): + if len(frames) != len(line_nums): + raise ValueError("Ya messed up!") + self._frames = frames + self._line_nums = line_nums + self.tb_frame = frames[0] + self.tb_lineno = line_nums[0] + + @property + def tb_next(self): + if len(self._frames) > 1: + return FakeTraceback(self._frames[1:], self._line_nums[1:]) + + +class FakeException(Exception): + def __init__(self, *args, **kwargs): + self._tb = None + super().__init__(*args, **kwargs) + + @property + def __traceback__(self): + return self._tb + + @__traceback__.setter + def __traceback__(self, value): + self._tb = value + + def with_traceback(self, value): + self._tb = value + return self diff --git a/apps/utils/noid.py b/apps/utils/noid.py new file mode 100644 index 000000000..253723f9c --- /dev/null +++ b/apps/utils/noid.py @@ -0,0 +1,89 @@ +''' +The `NOID `_ +(Nice Opaque Identifier) minting logic here uses an unbounded NOID minter +with "extended digits" (:attr:`ALPHABET`) with a check character at the end. 
+ +For comparison, you can install the `Ruby `_ +or `Perl `_ NOID +implementations and create a new minter comparable to the one implemented +here with this command:: + + noid dbcreate .zeeeek + +Similarly, noids implemented through this application can be validated +using:: + + noid validate- pzc8v + +''' + +# Historical note: +# This noid minting logic was originally implemented for Emory Library's PID +# Manager https://github.com/emory-libraries/pidman/blob/1.0.3/pidman/pid/noid.py. +# That implementations was based on the Noid.pm Perl module by John Kunze. +# They began using the original Perl code for generating +# NOIDs, but in re-examining how they used the utility they realized they didn't +# really use most of its functionality at all. By the time they realized this +# they were calling into the Perl code from within a Postgres database +# underneath this Django application. they decided to remove a few layers of +# dependencies by re-implementing the tiny bit of NOID-generation logic we +# actually used in Python for direct access from Django. + +from random import randint +from time import time + +#: NOID alphabet; specifies the characters to be used for minting noids +ALPHABET = '0123456789bcdfghjkmnpqrstvwxz' +ALPHASIZE = len(ALPHABET) + + +def random_num(): + '''Generate a random number based on current time and a random number.''' + a, b = str(time()).split('.') + return int(a) - int(b) - randint(0, 1000) + +def _digits(num): + '''Represent num in base ALPHASIZE. 
Return an array of digits, most + significant first.''' + if not num: + return [] + arr = [] + while num: + digit = num % ALPHASIZE + num = num // ALPHASIZE + arr.append(digit) + arr.reverse() + return arr + + +def _checksum(digits): + '''Custom per-digit checksum algorithm originally implemented in Noid.pm + and duplicated here for compatibility''' + sum = 0 + pos = 1 + for digit in digits: + sum += pos * digit + pos += 1 + return sum % ALPHASIZE + + +def encode_noid(num=None): + if num is None: + num = random_num() + '''Encode an integer as a NOID string, including final checksum + character.''' + digits = _digits(num) + digits.append(_checksum(digits)) + return ''.join([ALPHABET[digit] for digit in digits]) + + +def decode_noid(noid): + '''Decode the integer represented by a NOID string, ignoring the final + checksum character.''' + noid = noid[:-1] # strip checksum character + power = len(noid) - 1 + num = 0 + for char in noid: + num += ALPHABET.index(char) * (ALPHASIZE ** power) + power -= 1 + return num diff --git a/apps/utils/tests.py b/apps/utils/tests.py index 9d2e91c12..1fc1f9ade 100644 --- a/apps/utils/tests.py +++ b/apps/utils/tests.py @@ -1,5 +1,8 @@ +from apps.utils.noid import encode_noid +from time import time from django.test import TestCase from .fetch import fetch_url +from .noid import _digits, decode_noid, encode_noid import httpretty import json @@ -39,3 +42,12 @@ def test_response_bad_content(self): fetch_url('http://cnn.com', verbosity=3) assert 'bad content' in cm.output[0] assert 'WARNING' in cm.output[0] + + def test_digits_with_empty_sting(self): + assert _digits('') == [] + + def test_noid_decode(self): + now = int(time()) + noid = encode_noid(now) + assert noid != now + assert decode_noid(noid) == now diff --git a/config/settings/base.py b/config/settings/base.py index 34a6c8274..ce493a5a5 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -179,7 +179,6 @@ 'django.contrib.auth.middleware.AuthenticationMiddleware', 
'django.contrib.messages.middleware.MessageMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware', - 'wagtail.core.middleware.SiteMiddleware', 'wagtail.contrib.redirects.middleware.RedirectMiddleware', ] diff --git a/config/urls.py b/config/urls.py index b88a54621..862d889cd 100644 --- a/config/urls.py +++ b/config/urls.py @@ -37,12 +37,12 @@ re_path(r'^cms/', include(wagtailadmin_urls)), re_path(r'^documents/', include(wagtaildocs_urls)), re_path(r'^pages/', include(wagtail_urls)), - url(r'^', include('apps.iiif.canvases.urls')), - url(r'^', include('apps.iiif.manifests.urls')), - url(r'^', include('apps.iiif.annotations.urls')), - url(r'^', include('apps.iiif.kollections.urls')), - path('accounts/', include('allauth.urls')),# url(r'^', include('readux.collection.urls')), - # url(r'^', include('readux.volumes.urls')), + re_path(r'^', include('apps.iiif.canvases.urls')), + re_path(r'^', include('apps.iiif.manifests.urls')), + re_path(r'^', include('apps.iiif.annotations.urls')), + re_path(r'^', include('apps.iiif.kollections.urls')), + path('accounts/', include('allauth.urls')),# re_path(r'^', include('readux.collection.urls')), + # re_path(r'^', include('readux.volumes.urls')), # path("", TemplateView.as_view(template_name="pages/home.html"), name="home"), # path( # 'about/', @@ -58,7 +58,7 @@ ), # path("accounts/", include("allauth.urls")), # Your stuff: custom urls includes go here - url(r'^', include('apps.readux.urls')), + re_path(r'^', include('apps.readux.urls')), re_path(r'', include(wagtail_urls)), path(r'^summernote/', include('django_summernote.urls')), diff --git a/package-lock.json b/package-lock.json index 319fe71d0..4699a81fa 100644 --- a/package-lock.json +++ b/package-lock.json @@ -347,21 +347,12 @@ "dev": true }, "axios": { - "version": "0.19.0", - "resolved": "https://registry.npmjs.org/axios/-/axios-0.19.0.tgz", - "integrity": "sha512-1uvKqKQta3KBxIz14F2v06AEHZ/dIoeKfbTRkK1E5oqjDnuEerLmYTgJB5AiQZHJcljpg1TuRzdjDR06qNk0DQ==", + 
"version": "0.23.0", + "resolved": "https://registry.npmjs.org/axios/-/axios-0.23.0.tgz", + "integrity": "sha512-NmvAE4i0YAv5cKq8zlDoPd1VLKAqX5oLuZKs8xkJa4qi6RGn0uhCYFjWtHHC9EM/MwOwYWOs53W+V0aqEXq1sg==", "dev": true, "requires": { - "follow-redirects": "1.5.10", - "is-buffer": "^2.0.2" - }, - "dependencies": { - "is-buffer": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz", - "integrity": "sha512-U15Q7MXTuZlrbymiz95PJpZxu8IlipAp4dtS3wOdgPXx3mqBnslrWU14kxfHB+Py/+2PVKSr37dMAgM2A4uArw==", - "dev": true - } + "follow-redirects": "^1.14.4" } }, "backo2": { @@ -3224,13 +3215,10 @@ "dev": true }, "follow-redirects": { - "version": "1.5.10", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz", - "integrity": "sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==", - "dev": true, - "requires": { - "debug": "=3.1.0" - } + "version": "1.14.4", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.14.4.tgz", + "integrity": "sha512-zwGkiSXC1MUJG/qmeIFH2HBJx9u0V46QGUe3YR1fXG8bXQxq7fLj0RjLZQ5nubr9qNJUZrH+xUcwXEoXNpfS+g==", + "dev": true }, "for-in": { "version": "1.0.2", @@ -5742,6 +5730,16 @@ "yargs": "6.6.0" }, "dependencies": { + "axios": { + "version": "0.19.0", + "resolved": "https://registry.npmjs.org/axios/-/axios-0.19.0.tgz", + "integrity": "sha512-1uvKqKQta3KBxIz14F2v06AEHZ/dIoeKfbTRkK1E5oqjDnuEerLmYTgJB5AiQZHJcljpg1TuRzdjDR06qNk0DQ==", + "dev": true, + "requires": { + "follow-redirects": "1.5.10", + "is-buffer": "^2.0.2" + } + }, "debug": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/debug/-/debug-4.1.1.tgz", @@ -5751,6 +5749,38 @@ "ms": "^2.1.1" } }, + "follow-redirects": { + "version": "1.5.10", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz", + "integrity": "sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==", + 
"dev": true, + "requires": { + "debug": "=3.1.0" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + } + } + }, + "is-buffer": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.5.tgz", + "integrity": "sha512-i2R6zNFDwgEHJyQUtJEk0XFi1i0dPFn/oqjK3/vPCcDeJvW5NQ83V8QbicfF1SupOaB0h8ntgBC2YiE7dfyctQ==", + "dev": true + }, "ms": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", diff --git a/package.json b/package.json index 2145d297e..bd539931d 100644 --- a/package.json +++ b/package.json @@ -3,6 +3,7 @@ "version": "2.0.0", "dependencies": {}, "devDependencies": { + "axios": "^0.23.0", "browser-sync": "^2.14.0", "del": "^2.2.2", "gulp": "^3.9.1", diff --git a/readux/__init__.py b/readux/__init__.py index 7cf2e77b6..3739befa8 100644 --- a/readux/__init__.py +++ b/readux/__init__.py @@ -1,4 +1,4 @@ -__version_info__ = (2, 0, 0, None) +__version_info__ = (2, 2, 0, None) # Dot-connect all but the last. Last is dash-connected if not None. 
__version__ = '.'.join([str(i) for i in __version_info__[:-1]]) diff --git a/requirements/base.txt b/requirements/base.txt index d73cb05c6..9ed5e507c 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,24 +1,26 @@ -pytz==2019.3 # https://github.com/stub42/pytz -python-slugify==4.0.0 # https://github.com/un33k/python-slugify +pytz==2021.3 # https://github.com/stub42/pytz +python-slugify==5.0.2 # https://github.com/un33k/python-slugify Pillow -argon2-cffi==18.3.0 # https://github.com/hynek/argon2_cffi +argon2-cffi==21.1.0 # https://github.com/hynek/argon2_cffi redis==3.5.3 # https://github.com/antirez/redis bs4 pyyaml progress python-memcached==1.59 pyld==1.0.5 +hocr-spec==0.2.0 # Django # ------------------------------------------------------------------------------ -# Django AllAuth 0.45 is NOT compatible with Django :( -# When we get to update, check this re: summernote https://github.com/summernote/django-summernote/issues/449 -Django==2.2.23 # https://www.djangoproject.com/ +# TODO: To upgrade to Django 3, we have to redo the full text search on the manifest model. 
+# TODO: When we get to update, check this re: summernote https://github.com/summernote/django-summernote/issues/449 +Django==2.2.24 # https://www.djangoproject.com/ #django-environ==0.4.5 # https://github.com/joke2k/django-environ git+https://github.com/joke2k/django-environ@develop -django-model-utils==3.1.2 # https://github.com/jazzband/django-model-utils +django-model-utils==4.2.0 # https://github.com/jazzband/django-model-utils +django-dirtyfields>=1.7.0 # https://github.com/romgar/django-dirtyfields django-allauth>=0.45.0 # https://github.com/pennersr/django-allauth -django-crispy-forms==1.8.1 # https://github.com/django-crispy-forms/django-crispy-forms +django-crispy-forms==1.13.0 # https://github.com/django-crispy-forms/django-crispy-forms django-redis==5.0.0 # https://github.com/niwinz/django-redis # beta version required for Django 2.x django-modeltranslation>=0.17.2 # https://github.com/deschler/django-modeltranslation @@ -39,24 +41,25 @@ django-sass-processor pylint-django==2.4.4 # Wagtail -wagtail==2.8.1 +wagtail==2.13.4 # wagtailmenus==2.13 -git+git://github.com/rkhleics/wagtailmenus.git@ddbc73797cfa9e6c94c1b0850b4ad0bc02b09667 -wagtail-condensedinlinepanel==0.5.3 +git+git://github.com/jayvarner/wagtailmenus.git@master git+git://github.com/jcmundy/wagtail-autocomplete.git@master#egg=wagtail-autocomplete #wagtail-autocomplete==0.3.1 -wagtail-cache==1.0.1 +wagtail-cache==1.0.2 # Import/Export -django-import-export==2.0.2 -gitpython==3.0.5 -gitdb2==2.0.6 +django-import-export==2.6.1 +gitpython==3.1.24 django-background-tasks==1.2.5 # S3 Uploads -boto3==1.18.20 -django-storages==1.11.1 # https://github.com/jschneier/django-storages +boto3==1.19.0 +django-storages==1.12.2 # https://github.com/jschneier/django-storages + +# Python function to stream unzip all the files in a ZIP archive, without loading the entire ZIP file into memory or any of its uncompressed files. 
+stream-unzip>=0.0.58 # jekyll theme for zipfile used in export functionality -e git+https://github.com/emory-libraries-ecds/digitaledition-jekylltheme.git@readux_2#egg=digitaledition-jekylltheme diff --git a/requirements/local.txt b/requirements/local.txt index ea5d896a4..99a771133 100644 --- a/requirements/local.txt +++ b/requirements/local.txt @@ -1,38 +1,39 @@ -r ./base.txt -Werkzeug==0.16.1 # https://github.com/pallets/werkzeug -ipdb==0.11 # https://github.com/gotcha/ipdb -Sphinx==1.8.1 # https://github.com/sphinx-doc/sphinx +Werkzeug==2.0.2 # https://github.com/pallets/werkzeug +ipdb==0.13.9 # https://github.com/gotcha/ipdb +Sphinx==4.2.0 # https://github.com/sphinx-doc/sphinx +# TODO: Upgrade after moving to Django 3 psycopg2-binary==2.8.4 # https://github.com/psycopg/psycopg2 # Testing # ------------------------------------------------------------------------------ -mypy==0.770 # https://github.com/python/mypy -pytest==5.4.2 # https://github.com/pytest-dev/pytest +mypy==0.910 # https://github.com/python/mypy +pytest==6.2.5 # https://github.com/pytest-dev/pytest pytest-sugar==0.9.4 # https://github.com/Frozenball/pytest-sugar -pytest-cov==2.12.1 -requests==2.22.0 +pytest-cov==3.0.0 +requests==2.26.0 iiif-prezi==0.3.0 # https://github.com/iiif-prezi/iiif-prezi -httpretty==1.0.2 # https://pypi.org/project/httpretty/ +httpretty==1.1.4 # https://pypi.org/project/httpretty/ #mock==4.0.2 -cssutils==1.0.2 # https://pypi.org/project/cssutils/ -pytest-django==4.3.0 # https://github.com/pytest-dev/pytest-django -moto==2.2.2 # https://github.com/spulec/moto +cssutils==2.3.0 # https://pypi.org/project/cssutils/ +pytest-django==4.4.0 # https://github.com/pytest-dev/pytest-django +moto==2.2.10 # https://github.com/spulec/moto # Code quality # ------------------------------------------------------------------------------ -flake8==3.7.9 # https://github.com/PyCQA/flake8 +flake8==4.0.1 # https://github.com/PyCQA/flake8 coverage==5.2.1 # https://github.com/nedbat/coveragepy 
coveralls # Django # ------------------------------------------------------------------------------ factory-boy==3.2.0 # https://github.com/FactoryBoy/factory_boy -faker==8.10.0 +faker==9.5.1 -django-debug-toolbar==2.2 # https://github.com/jazzband/django-debug-toolbar -django-extensions==2.2.9 # https://github.com/django-extensions/django-extensions -django-coverage-plugin==1.8.0 # https://github.com/nedbat/django_coverage_plugin +django-debug-toolbar==3.2.2 # https://github.com/jazzband/django-debug-toolbar +django-extensions==3.1.3 # https://github.com/django-extensions/django-extensions +django-coverage-plugin==2.0.1 # https://github.com/nedbat/django_coverage_plugin pyopenssl # for running dev server under https # Deployment diff --git a/requirements/production.txt b/requirements/production.txt index babc89820..3f4fb7deb 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -2,11 +2,10 @@ -r ./base.txt -gunicorn==20.0.4 # https://github.com/benoitc/gunicorn -psycopg2==2.8.4 --no-binary psycopg2 # https://github.com/psycopg/psycopg2 -Collectfast==2.1.0 # https://github.com/antonagestam/collectfast +gunicorn==20.1.0 # https://github.com/benoitc/gunicorn +Collectfast==2.2.0 # https://github.com/antonagestam/collectfast raven==6.10.0 # https://github.com/getsentry/raven-python # Django # ------------------------------------------------------------------------------ -django-anymail[mailgun]==4.3 # https://github.com/anymail/django-anymail \ No newline at end of file +django-anymail[mailgun]==8.4 # https://github.com/anymail/django-anymail \ No newline at end of file diff --git a/utility/ingest_scripts/canvas.py b/utility/ingest_scripts/canvas.py index 48724f081..270424c3d 100644 --- a/utility/ingest_scripts/canvas.py +++ b/utility/ingest_scripts/canvas.py @@ -7,7 +7,6 @@ def make_csv(args): print(','.join(['manifest', - 'IIIF_IMAGE_SERVER_BASE', 'id', 'label', 'pid', diff --git a/xml_schema/alto-1-4.xsd b/xml_schema/alto-1-4.xsd new file 
mode 100644 index 000000000..0d57f8a8d --- /dev/null +++ b/xml_schema/alto-1-4.xsd @@ -0,0 +1,713 @@ + + + + + + + + + + + + + + + + + + + + + + + ALTO (analyzed layout and text object) stores layout information and + OCR recognized text of pages of any kind of printed documents like books, journals and newspapers. + ALTO is a standardized XML format to store layout and content information. + It is designed to be used as an extension schema to METS (Metadata Encoding and Transmission Standard), + where METS provides metadata and structural information while ALTO contains content and physical information. + + + + + + + Describes general settings of the alto file like measurement units and metadata + + + + + + All measurement values inside the alto file except fontsize are related to this unit. The default is 1/10 of mm + + + + + + + + + + + + + + + + + + + + + + + + + Styles define properties of layout elements. A style defined in a parent element is used as default style for all related children elements. + + + + + + A text style defines font properties of text. + + + + + + + + + A paragraph style defines formatting properties of text blocks. + + + + + + Indicates the alignement of the paragraph. Could be left, right, center or justify. + + + + + + + + + + + + + Left indent of the paragraph in relation to the column. + + + + + Right indent of the paragraph in relation to the column. + + + + + Line spacing between two lines of the paragraph. Measurement calculated from baseline to baseline. + + + + + Indent of the first line of the paragraph if this is different from the other lines. A negative value indicates an indent to the left, a positive value indicates an indent to the right. + + + + + + + + + + The root layout element. + + + + + + One page of a book or journal. + + + + + + The area between the top line of print and the upper edge of the leaf. It may contain page number or running title. + + + + + The area between the printspace and the left border of a page. 
May contain margin notes. + + + + + The area between the printspace and the right border of a page. May contain margin notes. + + + + + The area between the bottom line of letterpress or writing and the bottom edge of the leaf. It may contain a page number, a signature number or a catch word. + + + + + Rectangle covering the printed area of a page. Page number and running title are not part of the print space. + + + + + + + Any user-defined class like title page. + + + + + + + + The number of the page within the document. + + + + + The page number that is printed on the page. + + + + + Gives brief information about original page quality + + + + + + + + + + + + + + + + Gives more details about the original page quality, since QUALITY attribute gives only brief and restrictive information + + + + + Position of the page. Could be lefthanded, righthanded, cover, foldout or single if it has no special position. + + + + + + + + + + + + + + A link to the processing description that has been used for this page. + + + + + Estimated percentage of OCR Accuracy in range from 0 to 100 + + + + + Page Confidence: Confidence level of the ocr for this page. A value between 0 (unsure) and 1 (sure). + + + + + + + + + + + + + + + + + + + + Group of available block types + + + + + A block of text. + + + + + A picture or image. + + + + + A graphic used to separate blocks. Usually a line or rectangle. + + + + + A block that consists of other blocks + + + + + + + Base type for any kind of block on the page. + + + + + + + + + + + + + Tells the rotation of the block e.g. text or illustration. The value is in degree counterclockwise. + + + + + The next block in reading sequence on the page. + + + + + + + A sequence of chars. Strings are separated by white spaces or hyphenation chars. + + + + + Any alternative for the word. + + + + + + + Identifies the purpose of the alternative. + + + + + + + + + + + + + + + + + + + + + + + + Type of the substitution (if any). 
+ + + + + + + + + + + + Content of the substitution. + + + + + Word Confidence: Confidence level of the OCR for this string. A value between 0 (unsure) and 1 (sure). + + + + + + + + + + + Confidence level of each character in that string. A list of numbers, one number between 0 (sure) and 9 (unsure) for each character. + + + + + + A region on a page + + + + + + + + + + + + + + A list of points + + + + + + Describes the bounding shape of a block, if it is not rectangular. + + + + + + + + + + A polygon shape. + + + + + + An ellipse shape. + + + + + + + + + A circle shape. + + + + + + + + Formatting attributes. Note that these attributes are assumed to be inherited from ancestor elements of the document hierarchy. + + + + The font name. + + + + + + + The font size, in points (1/72 of an inch). + + + + + Font color as RGB value + + + + + + + Serif or Sans-Serif + + + + + + + + + fixed or proportional + + + + + + + + + Information to identify the image file from which the OCR text was created. + + + + + + + + + A unique identifier for the image file. This is drawn from MIX. + This identifier must be unique within the local system. To facilitate file sharing or interoperability with other systems, fileIdentifierLocation may be added to designate the system or application where the identifier is unique. + + + + + + A location qualifier, i.e., a namespace. + + + + + + + + Information on how the text was created, including preprocessing, OCR processing, and postprocessing steps. + Where possible, this draws from MIX's change history. + + + + + + + + + + A processing step. + + + + + Date or DateTime the image was processed. + + + + + Identifies the organization-level producer(s) of the processed image. + + + + + An ordinal listing of the image processing steps performed. For example, "image despeckling." + + + + + A description of any setting of the processing application. For example, for a multi-engine OCR application this might include the engines which were used. 
Ideally, this description should be adequate so that someone else using the same application can produce identical results. + + + + + + + + Information about a software application. Where applicable, the preferred method for determining this information is by selecting Help --> About. + + + + + The name of the organization or company that created the application. + + + + + The name of the application. + + + + + The version of the application. + + + + + A description of any important characteristics of the application, especially for non-commercial applications. For example, if a non-commercial application is built using commercial components, e.g., an OCR engine SDK. Those components should be mentioned here. + + + + + + + + + + List of any combination of font styles + + + + + + + + + + + + + + + + + + + + + + A block that consists of other blocks + + + + + + + + + A user defined string to identify the type of composed block (e.g. table, advertisement, ...) + + + + + An ID to link to an image which contains only the composed block. The ID and the file link is defined in the related METS file. + + + + + + + + A picture or image. + + + + + + A user defined string to identify the type of illustration like photo, map, drawing, chart, ... + + + + + A link to an image which contains only the illustration. + + + + + + + + A graphic used to separate blocks. Usually a line or rectangle. + + + + + + + + A block of text. + + + + + + + A single line of text. + + + + + + + + A white space. + + + + + + + + + + + + A hyphenation char. Can appear only at the end of a line. + + + + + + + + + + + + + + + + + + + Correction Status. Indicates whether manual correction has been done or not. 
+ + + + + + + + + + diff --git a/xml_schema/alto-2-1.xsd b/xml_schema/alto-2-1.xsd new file mode 100644 index 000000000..f9b92945f --- /dev/null +++ b/xml_schema/alto-2-1.xsd @@ -0,0 +1,852 @@ + + + + + + + + + + + + + + + + + + + + + + + + ALTO (analyzed layout and text object) stores layout information and + OCR recognized text of pages of any kind of printed documents like books, journals and newspapers. + ALTO is a standardized XML format to store layout and content information. + It is designed to be used as an extension schema to METS (Metadata Encoding and Transmission Standard), + where METS provides metadata and structural information while ALTO contains content and physical information. + + + + + + + Describes general settings of the alto file like measurement units and metadata + + + + + + + All measurement values inside the alto file are related to + this unit, except the font size. + Coordinates as being used in HPOS and VPOS are absolute coordinates referring to the upper-left corner of a page. + The upper left corner of the page is defined as coordinate (0/0). + + values meaning: + mm10: 1/10th of millimeter + inch1200: 1/1200th of inch + pixel: 1 pixel + + The values for pixel will be related to the resolution of the image based + on which the layout is described. Incase the original image is not known + the scaling factor can be calculated based on total width and height of + the image and the according information of the PAGE element. + + + + + + + + + + + + + + + + + + + + + + + + + + Styles define properties of layout elements. A style defined in a parent element is used as default style for all related children elements. + + + + + + A text style defines font properties of text. + + + + + + + + + A paragraph style defines formatting properties of text blocks. + + + + + + Indicates the alignement of the paragraph. Could be left, right, center or justify. + + + + + + + + + + + + + Left indent of the paragraph in relation to the column. 
+ + + + + Right indent of the paragraph in relation to the column. + + + + + Line spacing between two lines of the paragraph. Measurement calculated from baseline to baseline. + + + + + Indent of the first line of the paragraph if this is different from the other lines. A negative value indicates an indent to the left, a positive value indicates an indent to the right. + + + + + + + + + + + Tag define properties of additional characteristic. The tags are referenced from related content element on Block or String element by attribute TAGREF via the tag ID. + This container element contains the individual elements for LayoutTags, StructureTags, RoleTags, NamedEntityTags and OtherTags + + + + + + The root layout element. + + + + + + One page of a book or journal. + + + + + + The area between the top line of print and the upper edge of the leaf. It may contain page number or running title. + + + + + The area between the printspace and the left border of a page. May contain margin notes. + + + + + The area between the printspace and the right border of a page. May contain margin notes. + + + + + The area between the bottom line of letterpress or writing and the bottom edge of the leaf. It may contain a page number, a signature number or a catch word. + + + + + Rectangle covering the printed area of a page. Page number and running title are not part of the print space. + + + + + + + Any user-defined class like title page. + + + + + + + + The number of the page within the document. + + + + + The page number that is printed on the page. + + + + + Gives brief information about original page quality + + + + + + + + + + + + + + + + Gives more details about the original page quality, since QUALITY attribute gives only brief and restrictive information + + + + + Position of the page. Could be lefthanded, righthanded, cover, foldout or single if it has no special position. + + + + + + + + + + + + + + A link to the processing description that has been used for this page. 
+ + + + + Estimated percentage of OCR Accuracy in range from 0 to 100 + + + + + Page Confidence: Confidence level of the OCR for this page. A value between 0 (unsure) and 1 (sure). + + + + + + + + + + + + + + + + + + + + Group of available block types + + + + + A block of text. + + + + + A picture or image. + + + + + A graphic used to separate blocks. Usually a line or rectangle. + + + + + A block that consists of other blocks + + + + + + + Base type for any kind of block on the page. + + + + + + + + + + + + + + Tells the rotation of the block e.g. text or illustration. The value is in degrees counterclockwise. + + + + + The next block in reading sequence on the page. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + + + A sequence of chars. Strings are separated by white spaces or hyphenation chars. + + + + + Any alternative for the word. + + + + + + + Identifies the purpose of the alternative. + + + + + + + + + + + + + + + + + + + + + + + + + Type of the substitution (if any). + + + + + + + + + + + + Content of the substitution. + + + + + Word Confidence: Confidence level of the OCR for this string. A value between 0 (unsure) and 1 (sure). + + + + + + + + + + + Confidence level of each character in that string. A list of numbers, one number between 0 (sure) and 9 (unsure) for each character. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + Attribute to record language of the string. The language should be recorded at the highest level possible. + + + + + + A region on a page + + + + + + + + + + + + + + A list of points + + + + + + Describes the bounding shape of a block, if it is not rectangular. + + + + + + + + + + A polygon shape. + + + + + + An ellipse shape. 
HPOS and VPOS describe the center of the ellipse. + HLENGTH and VLENGTH are the width and height of the described ellipse. + + + + + + + + + A circle shape. HPOS and VPOS describe the center of the circle. + + + + + + + + Formatting attributes. Note that these attributes are assumed to be inherited from ancestor elements of the document hierarchy. + + + + The font name. + + + + + + + The font size, in points (1/72 of an inch). + + + + + Font color as RGB value + + + + + + + Serif or Sans-Serif + + + + + + + + + fixed or proportional + + + + + + + + + Information to identify the image file from which the OCR text was created. + + + + + + + + + A unique identifier for the image file. This is drawn from MIX. + This identifier must be unique within the local system. To facilitate file sharing or interoperability with other systems, fileIdentifierLocation may be added to designate the system or application where the identifier is unique. + + + + + + A location qualifier, i.e., a namespace. + + + + + + + + Information on how the text was created, including preprocessing, OCR processing, and postprocessing steps. + Where possible, this draws from MIX's change history. + + + + + + + + + + A processing step. + + + + + Date or DateTime the image was processed. + + + + + Identifies the organizationlevel producer(s) of the processed image. + + + + + An ordinal listing of the image processing steps performed. For example, "image despeckling." + + + + + A description of any setting of the processing application. For example, for a multi-engine OCR application this might include the engines which were used. Ideally, this description should be adequate so that someone else using the same application can produce identical results. + + + + + + + + Information about a software application. Where applicable, the preferred method for determining this information is by selecting Help --> About. + + + + + The name of the organization or company that created the application. 
+ + + + + The name of the application. + + + + + The version of the application. + + + + + A description of any important characteristics of the application, especially for non-commercial applications. For example, if a non-commercial application is built using commercial components, e.g., an OCR engine SDK. Those components should be mentioned here. + + + + + + + + + + List of any combination of font styles + + + + + + + + + + + + + + + + + + + + + + A block that consists of other blocks + + + + + + + + + A user defined string to identify the type of composed block (e.g. table, advertisement, ...) + + + + + An ID to link to an image which contains only the composed block. The ID and the file link is defined in the related METS file. + + + + + + + + A picture or image. + + + + + + A user defined string to identify the type of illustration like photo, map, drawing, chart, ... + + + + + A link to an image which contains only the illustration. + + + + + + + + A graphic used to separate blocks. Usually a line or rectangle. + + + + + + + + A block of text. + + + + + + + A single line of text. + + + + + + + + A white space. + + + + + + + + + + + + + A hyphenation char. Can appear only at the end of a line. + + + + + + + + + + + + + + + + + + + + + Attribute to record language of the textline. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + + + + Attribute deprecated. LANG should be used instead. + + + + + Attribute to record language of the textblock. 
+ + + + + + + + + There are following variation of tag types available: + LayoutTag – criteria about arrangement or graphical appearance + StructureTag – criteria about grouping or formation + RoleTag – criteria about function or mission + NamedEntityTag – criteria about assignment of terms to their relationship / meaning (NER) + OtherTag – criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those. + + + + + + + + + + + + + + + + + + The xml data wrapper element XmlData is used to contain XML encoded metadata. + The content of an XmlData element can be in any namespace or in no namespace. + As permitted by the XML Schema Standard, the processContents attribute value for the + metadata in an XmlData is set to “lax”. Therefore, if the source schema and its location are + identified by means of an XML schemaLocation attribute, then an XML processor will validate + the elements for which it can find declarations. If a source schema is not identified, or cannot be + found at the specified schemaLocation, then an XML validator will check for well-formedness, + but otherwise skip over the elements appearing in the XmlData element. + + + + + + + + + + + + + Type can be used to classify and group the information within each tag + element type. + + + + + + Content / information value of the tag. + + + + + Description text for tag information for clarification. + + + + + Any URI for authority or description relevant information. + + + + + diff --git a/xml_schema/alto-3-1.xsd b/xml_schema/alto-3-1.xsd new file mode 100644 index 000000000..7686fbf8a --- /dev/null +++ b/xml_schema/alto-3-1.xsd @@ -0,0 +1,934 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ALTO (analyzed layout and text object) stores layout information and + OCR recognized text of pages of any kind of printed documents like books, journals and newspapers. 
+ ALTO is a standardized XML format to store layout and content information. + It is designed to be used as an extension schema to METS (Metadata Encoding and Transmission Standard), + where METS provides metadata and structural information while ALTO contains content and physical information. + + + + + + + + Describes general settings of the alto file like measurement units and metadata + + + + + Styles define properties of layout elements. A style defined in a parent element is used as default style for all related children elements. + + + + + + Tags define properties of additional characteristics. The tags are referenced from the related content element (Block or String) by the attribute TAGREF via the tag ID. + This container element contains the individual elements for LayoutTags, StructureTags, RoleTags, NamedEntityTags and OtherTags + + + + + + The root layout element. + + + + + + Schema version of the ALTO file. + + + + + + + + + + + + + + + + + + + + + + + + + + + + The following variations of tag types are available: + LayoutTag – criteria about arrangement or graphical appearance + StructureTag – criteria about grouping or formation + RoleTag – criteria about function or mission + NamedEntityTag – criteria about assignment of terms to their relationship / meaning (NER) + OtherTag – criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those. + + + + + + + + + + + + + + + Gives brief information about original page quality + + + + + + + + + + + + + + Gives more details about the original page quality, since QUALITY attribute gives only brief and restrictive information + + + + + + Position of the page. Could be lefthanded, righthanded, cover, foldout or single if it has no special position. + + + + + + + + + + + + Page Confidence: Confidence level of the OCR for this page. A value between 0 (unsure) and 1 (sure). + + + + + + + + + One page of a book or journal. 
+ + + + + The area between the top line of print and the upper edge of the leaf. It may contain page number or running title. + + + + + The area between the printspace and the left border of a page. May contain margin notes. + + + + + The area between the printspace and the right border of a page. May contain margin notes. + + + + + The area between the bottom line of letterpress or writing and the bottom edge of the leaf. It may contain a page number, a signature number or a catch word. + + + + + Rectangle covering the printed area of a page. Page number and running title are not part of the print space. + + + + + + + Any user-defined class like title page. + + + + + + + + The number of the page within the document. + + + + + The page number that is printed on the page. + + + + + + + + A link to the processing description that has been used for this page. + + + + + Estimated percentage of OCR Accuracy in range from 0 to 100 + + + + + + + + + + + + + A text style defines font properties of text. + + + + + + + A paragraph style defines formatting properties of text blocks. + + + + + Indicates the alignment of the paragraph. Could be left, right, center or justify. + + + + + + + + + + + + + Left indent of the paragraph in relation to the column. + + + + + Right indent of the paragraph in relation to the column. + + + + + Line spacing between two lines of the paragraph. Measurement calculated from baseline to baseline. + + + + + Indent of the first line of the paragraph if this is different from the other lines. A negative value indicates an indent to the left, a positive value indicates an indent to the right. + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of available block types + + + + + A block of text. + + + + + A picture or image. + + + + + A graphic used to separate blocks. Usually a line or rectangle. + + + + + A block that consists of other blocks + + + + + + + Base type for any kind of block on the page. 
+ + + + + + + + + + + + + + Tells the rotation of e.g. text or illustration within the block. The value is in degree counterclockwise. + + + + + The next block in reading sequence on the page. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + + + A white space. + + + + + + + + + + Type of the substitution (if any). + + + + + + + + + + + + + + + Word Confidence: Confidence level of the ocr for this string. A value between 0 (unsure) and 1 (sure). + + + + + + + + + Any alternative for the word. + + + + + + Identifies the purpose of the alternative. + + + + + + + + A sequence of chars. Strings are separated by white spaces or hyphenation chars. + + + + + + + + + + + + + + + + + + Content of the substitution. + + + + + + Confidence level of each character in that string. A list of numbers, one number between 0 (sure) and 9 (unsure) for each character. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + Attribute to record language of the string. The language should be recorded at the highest level possible. + + + + + + A region on a page + + + + + + + + + + + + + + + A list of points + + + + + + Describes the bounding shape of a block, if it is not rectangular. + + + + + + + + + + A polygon shape. + + + + + + An ellipse shape. HPOS and VPOS describe the center of the ellipse. + HLENGTH and VLENGTH are the width and height of the described ellipse. + The attribute ROTATION tells the rotation of the e.g. text or + illustration within the block. The value is in degrees counterclockwise. + + + + + + + + + + A circle shape. HPOS and VPOS describe the center of the circle. + + + + + + + + Formatting attributes. 
Note that these attributes are assumed to be inherited from ancestor elements of the document hierarchy. + + + + The font name. + + + + + + + The font size, in points (1/72 of an inch). + + + + + Font color as RGB value + + + + + + + Serif or Sans-Serif + + + + + + + + + fixed or proportional + + + + + + + + + + + All measurement values inside the alto file are related to + this unit, except the font size. + Coordinates as used in HPOS and VPOS are absolute coordinates referring to the upper-left corner of a page. + The upper left corner of the page is defined as coordinate (0/0). + + values meaning: + mm10: 1/10th of millimeter + inch1200: 1/1200th of inch + pixel: 1 pixel + + The values for pixel will be related to the resolution of the image based + on which the layout is described. In case the original image is not known, + the scaling factor can be calculated based on total width and height of + the image and the corresponding information of the PAGE element. + + + + + + + + + + + Information to identify the image file from which the OCR text was created. + + + + + + + + + + + + + + + + + + + A unique identifier for the image file. This is drawn from MIX. + This identifier must be unique within the local system. + To facilitate file sharing or interoperability with other systems, fileIdentifierLocation may be added to designate the system or application where the identifier is unique. + + + + + + A location qualifier, i.e., a namespace. + + + + + + + + + + + + + + A unique identifier for the document. + This identifier must be unique within the local system. + To facilitate file sharing or interoperability with other systems, documentIdentifierLocation may be added to designate the system or application where the identifier is unique. + + + + + + A location qualifier, i.e., a namespace. + + + + + + + + Information on how the text was created, including preprocessing, OCR processing, and postprocessing steps. 
+ Where possible, this draws from MIX's change history. + + + + + + + + + + A processing step. + + + + + Date or DateTime the image was processed. + + + + + Identifies the organizationlevel producer(s) of the processed image. + + + + + An ordinal listing of the image processing steps performed. For example, "image despeckling." + + + + + A description of any setting of the processing application. For example, for a multi-engine OCR application this might include the engines which were used. Ideally, this description should be adequate so that someone else using the same application can produce identical results. + + + + + + + + Information about a software application. Where applicable, the preferred method for determining this information is by selecting Help --> About. + + + + + The name of the organization or company that created the application. + + + + + The name of the application. + + + + + The version of the application. + + + + + A description of any important characteristics of the application, especially for non-commercial applications. For example, if a non-commercial application is built using commercial components, e.g., an OCR engine SDK. Those components should be mentioned here. + + + + + + + + + + List of any combination of font styles + + + + + + + + + + + + + + + + + + + + + + A block that consists of other blocks + + + + + + + + + A user defined string to identify the type of composed block (e.g. table, advertisement, ...) + + + + + An ID to link to an image which contains only the composed block. The ID and the file link is defined in the related METS file. + + + + + + + + A picture or image. + + + + + + A user defined string to identify the type of illustration like photo, map, drawing, chart, ... + + + + + A link to an image which contains only the illustration. + + + + + + + + A graphic used to separate blocks. Usually a line or rectangle. + + + + + + + + A block of text. + + + + + + + A single line of text. 
+ + + + + + + + + + + A hyphenation char. Can appear only at the end of a line. + + + + + + + + + + + + + + + + + + + + + Attribute to record language of the textline. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + + + + Attribute deprecated. LANG should be used instead. + + + + + Attribute to record language of the textblock. + + + + + + + + + + + The xml data wrapper element XmlData is used to contain XML encoded metadata. + The content of an XmlData element can be in any namespace or in no namespace. + As permitted by the XML Schema Standard, the processContents attribute value for the + metadata in an XmlData is set to “lax”. Therefore, if the source schema and its location are + identified by means of an XML schemaLocation attribute, then an XML processor will validate + the elements for which it can find declarations. If a source schema is not identified, or cannot be + found at the specified schemaLocation, then an XML validator will check for well-formedness, + but otherwise skip over the elements appearing in the XmlData element. + + + + + + + + + + + + + Type can be used to classify and group the information within each tag + element type. + + + + + + Content / information value of the tag. + + + + + Description text for tag information for clarification. + + + + + Any URI for authority or description relevant information. + + + + + diff --git a/xml_schema/alto-4-2.xsd b/xml_schema/alto-4-2.xsd new file mode 100644 index 000000000..cfae77620 --- /dev/null +++ b/xml_schema/alto-4-2.xsd @@ -0,0 +1,1105 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ALTO (analyzed layout and text object) stores layout information and + OCR recognized text of pages of any kind of printed documents like books, journals and newspapers. 
+ ALTO is a standardized XML format to store layout and content information. + It is designed to be used as an extension schema to METS (Metadata Encoding and Transmission Standard), + where METS provides metadata and structural information while ALTO contains content and physical information. + + + + + + + + Describes general settings of the alto file like measurement units and metadata + + + + + Styles define properties of layout elements. A style defined in a parent element is used as default style for all related children elements. + + + + + + Tag define properties of additional characteristic. The tags are referenced from related content element on Block or String element by attribute TAGREF via the tag ID. + This container element contains the individual elements for LayoutTags, StructureTags, RoleTags, NamedEntityTags and OtherTags + + + + + + The root layout element. + + + + + + Schema version of the ALTO file. + + + + + + + + + + Element deprecated. 'Processing' should be used instead. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + There are following variation of tag types available: + LayoutTag – criteria about arrangement or graphical appearance + StructureTag – criteria about grouping or formation + RoleTag – criteria about function or mission + NamedEntityTag – criteria about assignment of terms to their relationship / meaning (NER) + OtherTag – criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those. + + + + + + + + + + + + + + + Gives brief information about original page quality + + + + + + + + + + + + + + Gives more details about the original page quality, since QUALITY attribute gives only brief and restrictive information + + + + + + Position of the page. Could be lefthanded, righthanded, cover, foldout or single if it has no special position. + + + + + + + + + + + + Page Confidence: Confidence level of the ocr for this page. 
A value between 0 (unsure) and 1 (sure). + + + + + + + + + One page of a book or journal. + + + + + The area between the top line of print and the upper edge of the leaf. It may contain page number or running title. + + + + + The area between the printspace and the left border of a page. May contain margin notes. + + + + + The area between the printspace and the right border of a page. May contain margin notes. + + + + + The area between the bottom line of letterpress or writing and the bottom edge of the leaf. It may contain a page number, a signature number or a catch word. + + + + + Rectangle covering the printed area of a page. Page number and running title are not part of the print space. + + + + + + + Any user-defined class like title page. + + + + + + + + + The number of the page within the document. + + + + + The page number that is printed on the page. + + + + + + + + A link to the processing description that has been used for this page. + + + + + Estimated percentage of OCR Accuracy in range from 0 to 100 + + + + + + + + + + + + + A text style defines font properties of text. + + + + + + + A paragraph style defines formatting properties of text blocks. + + + + + Indicates the alignement of the paragraph. Could be left, right, center or justify. + + + + + + + + + + + + + Left indent of the paragraph in relation to the column. + + + + + Right indent of the paragraph in relation to the column. + + + + + Line spacing between two lines of the paragraph. Measurement calculated from baseline to baseline. + + + + + Indent of the first line of the paragraph if this is different from the other lines. A negative value indicates an indent to the left, a positive value indicates an indent to the right. + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of available block types + + + + + A block of text. + + + + + A picture or image. + + + + + A graphic used to separate blocks. Usually a line or rectangle. 
+ + + + + A block that consists of other blocks + + + + + + + Base type for any kind of block on the page. + + + + + + + + + + + + + + + Tells the rotation of e.g. text or illustration within the block. The value is in degree counterclockwise. + + + + + The next block in reading sequence on the page. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + + + A white space. + + + + + + + + + + Type of the substitution (if any). + + + + + + + + + + + + + + + Word Confidence: Confidence level of the ocr for this string. A value between 0 (unsure) and 1 (sure). + + + + + + + + + + Any alternative for the word. + Alternative can outline a variant of writing by new typing / spelling rules, typically manually done or by dictionary replacements. + The above sample is an old composed character "Æ" of ancient time, which is replaced now by "Ä". + As variant are meant alternatives of the real printed content which are options outlined by the text recognition process. + Similar sample: "Straße" vs. "Strasse". Such alternatives are not coming from text recognition. + + + + + + + Identifies the purpose of the alternative. + + + + + + + + A sequence of chars. Strings are separated by white spaces or hyphenation chars. + + + + + + + + + + + + + + + + + + + + Content of the substitution. + + + + + + Confidence level of each character in that string. A list of numbers, one number between 0 (sure) and 9 (unsure) for each character. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + Attribute to record language of the string. The language should be recorded at the highest level possible. 
+ + + + + + A region on a page + + + + + + + + + + + + + + + + + + A list of points + + + + + + Describes the bounding shape of a block, if it is not rectangular. + + + + + + + + + + A polygon shape. + + + + + + An ellipse shape. HPOS and VPOS describe the center of the ellipse. + HLENGTH and VLENGTH are the width and height of the described ellipse. + The attribute ROTATION tells the rotation of the e.g. text or + illustration within the block. The value is in degrees counterclockwise. + + + + + + + + + + A circle shape. HPOS and VPOS describe the center of the circle. + + + + + + + + Formatting attributes. Note that these attributes are assumed to be inherited from ancestor elements of the document hierarchy. + + + + The font name. + + + + + + + The font size, in points (1/72 of an inch). + + + + + Font color as RGB value + + + + + + + Serif or Sans-Serif + + + + + + + + + fixed or proportional + + + + + + + + + + + All measurement values inside the alto file are related to + this unit, except the font size. + Coordinates as being used in HPOS and VPOS are absolute coordinates referring to the upper-left corner of a page. + The upper left corner of the page is defined as coordinate (0/0). + + values meaning: + mm10: 1/10th of millimeter + inch1200: 1/1200th of inch + pixel: 1 pixel + + The values for pixel will be related to the resolution of the image based + on which the layout is described. Incase the original image is not known + the scaling factor can be calculated based on total width and height of + the image and the according information of the PAGE element. + + + + + + + + + + + Information to identify the image file from which the OCR text was created. + + + + + + + + + + + + + + + + + + + A unique identifier for the image file. This is drawn from MIX. + This identifier must be unique within the local system. 
+ To facilitate file sharing or interoperability with other systems, fileIdentifierLocation may be added to designate the system or application where the identifier is unique. + + + + + + A location qualifier, i.e., a namespace. + + + + + + + + + + + + + + A unique identifier for the document. + This identifier must be unique within the local system. + To facilitate file sharing or interoperability with other systems, documentIdentifierLocation may be added to designate the system or application where the identifier is unique. + + + + + + A location qualifier, i.e., a namespace. + + + + + + + + Deprecated. processingType should be used instead. + Information on how the text was created, including preprocessing, OCR processing, and postprocessing steps. Where possible, this draws from MIX's change history. + + + + + + + + + + Description of the processing step. + + + + + Classification of the category of operation, how the file was created, including generation, modification, preprocessing, postprocessing or any other steps. + + + + + Date or DateTime the image was processed. + + + + + Identifies the organization-level producer(s) of the processed image. + + + + + An ordinal listing of the image processing steps performed. For example, "image despeckling." + + + + + A description of any setting of the processing application. For example, for a multi-engine OCR application this might include the engines which were used. Ideally, this description should be adequate so that someone else using the same application can produce identical results. + + + + + + + + + + + + + + + + + + + + + Information about a software application. Where applicable, the preferred method for determining this information is by selecting Help -- About. + + + + + The name of the organization or company that created the application. + + + + + The name of the application. + + + + + The version of the application.
+ + + + + A description of any important characteristics of the application, especially for non-commercial applications. For example, if a non-commercial application is built using commercial components, e.g., an OCR engine SDK. Those components should be mentioned here. + + + + + + + + + + List of any combination of font styles + + + + + + + + + + + + + + + + + + + + + + + A block that consists of other blocks + + + + + + + + + A user defined string to identify the type of composed block (e.g. table, advertisement, ...) + + + + + An ID to link to an image which contains only the composed block. The ID and the file link is defined in the related METS file. + + + + + + + + A picture or image. + + + + + + A user defined string to identify the type of illustration like photo, map, drawing, chart, ... + + + + + A link to an image which contains only the illustration. + + + + + + + + A graphic used to separate blocks. Usually a line or rectangle. + + + + + + + + A block of text. + + + + + + + A single line of text. + + + + + + + + + + + + + A hyphenation char. Can appear only at the end of a line. + + + + + + + + + + + + + + + + + + + + + Pixel coordinates based on the left-hand top corner of an image which define a polyline on which a line of text rests. + + + + + Attribute to record language of the textline. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + + + + Attribute deprecated. LANG should be used instead. + + + + + Attribute to record language of the textblock. + + + + + + + + + + + The xml data wrapper element XmlData is used to contain XML encoded metadata. + The content of an XmlData element can be in any namespace or in no namespace. + As permitted by the XML Schema Standard, the processContents attribute value for the + metadata in an XmlData is set to “lax”. 
Therefore, if the source schema and its location are + identified by means of an XML schemaLocation attribute, then an XML processor will validate + the elements for which it can find declarations. If a source schema is not identified, or cannot be + found at the specified schemaLocation, then an XML validator will check for well-formedness, + but otherwise skip over the elements appearing in the XmlData element. + + + + + + + + + + + + + Type can be used to classify and group the information within each tag element type. + + + + + Content / information value of the tag. + + + + + Description text for tag information for clarification. + + + + + Any URI for authority or description relevant information. + + + + + + + Modern OCR software stores information on glyph level. A glyph is essentially a character or ligature. + Accordingly the value for the glyph element will be defined as follows: + Pre-composed representation = base + combining character(s) (decomposed representation) + See http://www.fileformat.info/info/unicode/char/0101/index.htm + "U+0101" = (U+0061) + (U+0304) + "combining characters" ("base characters" in combination with non-spacing marks or characters which are combined to one) are represented as one "glyph", e.g. áàâ. + + Each glyph has its own coordinate information and must be separately addressable as a distinct object. + Correction and verification processes can be carried out for individual characters. + + Post-OCR analysis of the text as well as adaptive OCR algorithm must be able to record information on glyph level. + In order to reproduce the decision of the OCR software, optional characters must be recorded. These are called variants. + The OCR software evaluates each variant and picks the one with the highest confidence score as the glyph. + The confidence score expresses how confident the OCR software is that a single glyph had been recognized correctly. + + The glyph elements are in order of the word. 
Each glyph needs to be recorded to build up the whole word sequence. + + The glyph’s CONTENT attribute is no replacement for the string’s CONTENT attribute. + Due to post-processing steps such as correction the values of both attributes may be inconsistent. + + + + + + + + + + + CONTENT contains the precomposed representation (combining character) of the character from the parent String element. + The sequence position of the Glyph element matches the position of the character in the String. + + + + + + + + + + + + + This GC attribute records a float value between 0.0 and 1.0 that expresses the level of confidence for the variant where 1 is certain. + This attribute is optional. If it is not available, the default value for the variant is “0”. + The GC attribute semantic is the same as the WC attribute on the String element and VC on Variant element. + + + + + + + + + + + + + + + + + + Alternative (combined) character for the glyph, outlined by OCR engine or similar recognition processes. + In case the variant are two (combining) characters, two characters are outlined in one Variant element. + E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn". + Details for different use-cases see on the samples on GitHub. + + + + + + Each Variant represents an option for the glyph that the OCR software detected as possible alternatives. + In case the variant are two (combining) characters, two characters are outlined in one Variant element. + E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn". + Details for different use-cases see on the samples on GitHub. + + + + + + + + + + + + + This VC attribute records a float value between 0.0 and 1.0 that expresses the level of confidence for the variant where 1 is certain. + This attribute is optional. If it is not available, the default value for the variant is “0”. + The VC attribute semantic is the same as the GC attribute on the Glyph element.
+ + + + + + + + + + + \ No newline at end of file diff --git a/xml_schema/tei_all.xsd b/xml_schema/tei_all.xsd new file mode 100644 index 000000000..bcee046e9 --- /dev/null +++ b/xml_schema/tei_all.xsd @@ -0,0 +1,22708 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (anchored) indicates whether the copy text shows the exact place of reference for the note. + + + + + + + (target end) points to the end of the span to which the note is attached, if the note is not embedded in the text at that point. + + + + + + + + + + + + + + + + + + indicates the person, or group of people, to whom the element content is ascribed. + + + + + + + + + + + + + + + + + + + indicates the person, or group of people, to whom a speech act or action is directed. + + + + + + + + + + + + + + + + + + + provides an externally-defined means of identifying the entity (or entities) being named, using a coded value of some kind. + + + + + + + (reference) provides an explicit means of locating a full definition or identity for the entity being named by means of one or more URIs. + + + + + + + + + + + + + + + + + + + + + + gives a minimum estimated value for the approximate measurement. + + + + + + + + + + + + + + + + gives a maximum estimated value for the approximate measurement. + + + + + + + + + + + + + + + + where the measurement summarizes more than one observation or a range, supplies the minimum value observed. + + + + + + + + + + + + + + + + where the measurement summarizes more than one observation or a range, supplies the maximum value observed. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + names the unit used for the measurement +Suggested values include: 1] cm (centimetres); 2] mm (millimetres); 3] in (inches); 4] line; 5] char (characters) + + + + + + + + (centimetres) + + + + + + + + + (millimetres) + + + + + + + + + (inches) + + + + + + + + + lines of text + + + + + + + + + (characters) characters of text + + + + + + + + + + + + + + + + + specifies the length in the units specified + + + + + + + + + + + + + + + + indicates the size of the object concerned using a project-specific vocabulary combining quantity and units in a single string of words. + + + + + + + characterizes the precision of the values specified by the other attributes. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + where the measurement summarizes more than one observation, specifies the applicability of this measurement. +Sample values include: 1] all; 2] most; 3] range + + + + + + + + + + + + + + + + + + + + + + + + + categorizes the cause of the damage, if it can be identified. +Sample values include: 1] rubbing; 2] mildew; 3] smoke + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + assigns an arbitrary number to each stretch of damage regarded as forming part of the same physical phenomenon. + + + + + + + + + + indicates whether or not the element bearing this attribute should be considered to mark the end of an orthographic token in the same way as whitespace. + + + + + + + + + + + + + + + + + + + + + + + + + supplies the value of the date or time in a standard form, e.g. yyyy-mm-dd. + + + + + + + + + + specifies the earliest possible date for the event in standard form, e.g. yyyy-mm-dd. + + + + + + + + + + specifies the latest possible date for the event in standard form, e.g. yyyy-mm-dd. + + + + + + + + + + indicates the starting point of the period in standard form, e.g. yyyy-mm-dd. + + + + + + + + + + indicates the ending point of the period in standard form, e.g. 
yyyy-mm-dd. + + + + + + + + + + + + + + + + + indicates one or more systems or calendars to which the date represented by the content of this element belongs. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + indicates whether or not this element is selected by default when its parent is selected. + + + + + + This element is selected if its parent is selected + + + + + This element can only be selected explicitly, unless it is the only one of its kind, in which case it is selected if its parent is selected. + + + + + + + + + + + + + identifies one or more declarable elements within the header, which are understood to apply to the element bearing this attribute and its content. + + + + + + + + + + + + + + + + + + specifies whether or not its parent element is fragmented in some way, typically by some other overlapping structure: for example a speech which is divided between two or more verse stanzas, a paragraph which is split across a page division, a verse line which is divided between two speakers. + + + + + + (yes) the element is fragmented in some (unspecified) respect + + + + + (no) the element is not fragmented, or no claim is made as to its completeness + + + + + (initial) this is the initial part of a fragmented element + + + + + (medial) this is a medial part of a fragmented element + + + + + (final) this is the final part of a fragmented element + + + + + + + + + + + + + + + + (organization) specifies how the content of the division is organized. + + + + + + no claim is made about the sequence in which the immediate contents of this division are to be processed, or their inter-relationships. + + + + + the immediate contents of this element are regarded as forming a logical unit, to be processed in sequence. + + + + + + + + + + indicates whether this division is a sample of the original source and if so, from which part. + + + + + + division lacks material present at end in source. 
+ + + + + division lacks material at start and end. + + + + + division lacks material at start. + + + + + position of sampled material within original unknown. + + + + + division is not a sample. + + + + + + + + + + + + + describes the status of a document either currently or, when associated with a dated element, at the time indicated. +Sample values include: 1] approved; 2] candidate; 3] cleared; 4] deprecated; 5] draft; 6] embargoed; 7] expired; 8] frozen; 9] galley; 10] proposed; 11] published; 12] recommendation; 13] submitted; 14] unfinished; 15] withdrawn + + + + + + + + + + + + + + + (duration) indicates the length of this element in time. + + + + + + + + + + + (certainty) signifies the degree of certainty associated with the intervention or interpretation. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (responsible party) indicates the agency responsible for the intervention or interpretation, for example an editor or transcriber. + + + + + + + + + + + + + + + + + + + indicates the nature of the evidence supporting the reliability or accuracy of the intervention or interpretation. +Suggested values include: 1] internal; 2] external; 3] conjecture + + + + + + + + + + + + there is internal evidence to support the intervention. + + + + + + + + + there is external evidence to support the intervention. + + + + + + + + + the intervention or interpretation has been made by the editor, cataloguer, or scholar on the basis of their expertise. + + + + + + + + + + + + + + + + + + + + + + indicates whether this is an instant revision or not. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (rendition) indicates how the element in question was rendered or presented in the source text. 
+ + + + + + + + + + + + + + + + + + + + + contains an expression in some formal style definition language which defines the rendering or presentation used for this element in the source text + + + + + + + points to a description of the rendering or presentation used for this element in the source text. + + + + + + + + + + + + + + + + + + specifies the source from which some aspect of this element is drawn. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (number) gives a number (or other label) for an element, which is not necessarily unique within the document. + + + + + + + + + + + + + + + + + + + + + + + + gives a name or other identifier for the scribe believed to be responsible for this hand. + + + + + + + + + + + + + + + + + + + characterizes the particular script or writing style used by this hand, for example secretary, copperplate, Chancery, Italian, etc. + + + + + + + + + + + + + + + + + + + + + + + + + + + describes the tint or type of ink, e.g. brown, or other writing medium, e.g. pencil + + + + + + + + + + + + + + + + + + + + + specifies how widely this hand is used in the manuscript. + + + + + + only this hand is used throughout the manuscript + + + + + this hand is used through most of the manuscript + + + + + this hand is used occasionally in the manuscript + + + + + + + + + + + + + (MIME media type) specifies the applicable multimedia internet mail extension (MIME) media type + + + + + + + + + + + + + + + + + + + + + + + + + + + Where the media are displayed, indicates the display width + + + + + + + + + + + + Where the media are displayed, indicates the display height + + + + + + + + + + + + Where the media are displayed, indicates a scale factor to be applied when generating the desired display size + + + + + + + + + + + + + + + + + + + (uniform resource locator) specifies the URL from which the media concerned may be obtained. 
+ + + + + + + (instances) points to instances of the analysis or interpretation represented by the current element. + + + + + + + + + + + + + + + + + + + + + (unit) indicates the units used for the measurement, usually using the standard symbol for the desired units. +Suggested values include: 1] m (metre); 2] kg (kilogram); 3] s (second); 4] Hz (hertz); 5] Pa (pascal); 6] Ω (ohm); 7] L (litre); 8] t (tonne); 9] ha (hectare); 10] Å (ångström); 11] mL (millilitre); 12] cm (centimetre); 13] dB (decibel); 14] kbit (kilobit); 15] Kibit (kibibit); 16] kB (kilobyte); 17] KiB (kibibyte); 18] MB (megabyte); 19] MiB (mebibyte) + + + + + + + + (metre) SI base unit of length + + + + + + + + + (kilogram) SI base unit of mass + + + + + + + + + (second) SI base unit of time + + + + + + + + + (hertz) SI unit of frequency + + + + + + + + + (pascal) SI unit of pressure or stress + + + + + + + + + (ohm) SI unit of electric resistance + + + + + + + + + (litre) 1 dm³ + + + + + + + + + (tonne) 10³ kg + + + + + + + + + (hectare) 1 hm² + + + + + + + + + (ångström) 10⁻¹⁰ m + + + + + + + + + (millilitre) + + + + + + + + + (centimetre) + + + + + + + + + (decibel) see remarks, below + + + + + + + + + (kilobit) 10³ or 1000 bits + + + + + + + + + (kibibit) 2¹⁰ or 1024 bits + + + + + + + + + (kilobyte) 10³ or 1000 bytes + + + + + + + + + (kibibyte) 2¹⁰ or 1024 bytes + + + + + + + + + (megabyte) 10⁶ or 1 000 000 bytes + + + + + + + + + (mebibyte) 2²⁰ or 1 048 576 bytes + + + + + + + + + + + + + + + + + + + + (quantity) specifies the number of the specified units that comprise the measurement + + + + + + + + + + + + + + + + (commodity) indicates the substance that is being measured + + + + + + + + + + + + + + + + + + + + + + + + + + may be used to specify further information about the entity referenced by this name in the form of a set of whitespace-separated values, for example the occupation of a person, or the status of a place. 
+ + + + + + + + + + + + + + + + + + + + + (reference to the canonical name) provides a means of locating the canonical form (nym) of the names associated with the object named by the element bearing it. + + + + + + + + + + + + + + + + + + names the notation used for the content of the element. + + + + + + + + + + + + + + + specifies where this item is placed. +Suggested values include: 1] top; 2] bottom; 3] margin; 4] opposite; 5] overleaf; 6] above; 7] right; 8] below; 9] left; 10] end; 11] inline; 12] inspace + + + + + + + + + + + + at the top of the page + + + + + + + + + at the foot of the page + + + + + + + + + in the margin (left, right, or both) + + + + + + + + + on the opposite, i.e. facing, page + + + + + + + + + on the other side of the leaf + + + + + + + + + above the line + + + + + + + + + to the right, e.g. to the right of a vertical line of text, or to the right of a figure + + + + + + + + + below the line + + + + + + + + + to the left, e.g. to the left of a vertical line of text, or to the left of a figure + + + + + + + + + at the end of e.g. chapter or volume. + + + + + + + + + within the body of the text. + + + + + + + + + in a predefined space, for example left by an earlier scribe. + + + + + + + + + + + + + + + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. + + + + + + + + + + + + (subtype) provides a sub-categorization of the element, if needed + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + specifies the destination of the reference by supplying one or more URI References + + + + + + + + + + + + + + + (evaluate) specifies the intended meaning when the target of a pointer is itself a pointer. + + + + + + if the element pointed to is itself a pointer, then the target of that pointer will be taken, and so on, until an element is found which is not a pointer. 
+ + + + + if the element pointed to is itself a pointer, then its target (whether a pointer or not) is taken as the target of this pointer. + + + + + no further evaluation of targets is carried out beyond that needed to find the element specified in the pointer's target. + + + + + + + + + + + + + + + + optionally specifies the identifiers of the elements within which all elements indicated by the contents of this element lie. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + points at one or more sets of zero or more elements each. + + + + + + + + + + + + + + + + + + + + + + + + (function) characterizes the function of the segment. + + + + + + + + + + + + + + + supplies the sort key for this element in an index, list or group which contains it. + + + + + + + + + + + + + + + + (edition) supplies a sigil or other arbitrary identifier for the source edition in which the associated feature (for example, a page, column, or line break) occurs at this point in the text. + + + + + + + + + + + + + + + + + + + + + (edition reference) provides a pointer to the source edition in which the associated feature (for example, a page, column, or line break) occurs at this point in the text. + + + + + + + + + + + + + + + + + + indicates the end of a span initiated by the element bearing this attribute. + + + + + + + + + + + identifies the language used to describe the rendition. + + + + + + Cascading Stylesheet Language + + + + + Extensible Stylesheet Language Formatting Objects + + + + + Informal free text description + + + + + A user-defined rendition description language + + + + + + + + + + + + + + + + + + + + + + + + indicates the location within a temporal alignment at which this element begins. + + + + + + + indicates the location within a temporal alignment at which this element ends. 
+ + + + + + + + + + + + + + indicates the effect of the intervention, for example in the case of a deletion, strikeouts which include too much or too little text, or in the case of an addition, an insertion which duplicates some of the text already present. +Sample values include: 1] duplicate; 2] duplicate-partial; 3] excessStart; 4] excessEnd; 5] shortStart; 6] shortEnd; 7] partial; 8] unremarkable + + + + + + + + + + + + documents the presumed cause for the intervention. + + + + + + + + + + + + (sequence) assigns a sequence number related to the order in which the encoded features carrying this attribute are believed to have occurred. + + + + + + + + + + + + identifies the unit of information conveyed by the element, e.g. columns, pages, volume, entry. +Suggested values include: 1] volume (volume); 2] issue; 3] page (page); 4] line; 5] chapter (chapter); 6] part; 7] column; 8] entry + + + + + + + + (volume) the element contains a volume number. + + + + + + + + + the element contains an issue number, or volume and issue numbers. + + + + + + + + + (page) the element contains a page number or page range. + + + + + + + + + the element contains a line number or line range. + + + + + + + + + (chapter) the element contains a chapter indication (number and/or title) + + + + + + + + + the element identifies a part of a book or collection. + + + + + + + + + the element identifies a column. + + + + + + + + + the element identifies an entry number or label in a list of entries. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + indicates whether the pronunciation or orthography applies to all or part of a word. +Suggested values include: 1] full (full form); 2] pref (prefix); 3] suff (suffix); 4] inf (infix); 5] part (partial) + + + + + + + + (full form) + + + + + + + + + (prefix) + + + + + + + + + (suffix) + + + + + + + + + (infix) + + + + + + + + + (partial) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + indicates whether the name component is given in full, as an abbreviation or simply as an initial. + + + + + + (yes) the name component is spelled out in full. + + + + + (abbreviated) the name component is given in an abbreviated form. + + + + + (initial letter) the name component is indicated only by one initial. + + + + + + + + + + (sort) specifies the sort order of the name component in relation to others within the name. + + + + + + + + + + + + + + + (duration) indicates the length of this element in time. + + + + + + + + + + + + + + + provides a conventional name for the kind of section changing at this milestone. +Suggested values include: 1] page; 2] column; 3] line; 4] book; 5] poem; 6] canto; 7] speaker; 8] stanza; 9] act; 10] scene; 11] section; 12] absent; 13] unnumbered + + + + + + + + + + + + + column breaks. + + + + + + + + + + + + + + any units termed book, liber, etc. + + + + + + + + + individual poems in a collection. + + + + + + + + + cantos or other major sections of a poem. + + + + + + + + + changes of speaker or narrator. + + + + + + + + + stanzas within a poem, book, or canto. + + + + + + + + + acts within a play. + + + + + + + + + scenes within a play or act. + + + + + + + + + sections of any kind. + + + + + + + + + passages not present in the reference edition. 
+ + + + + + + + + passages present in the text, but not to be included as part of the reference. + + + + + + + + + + + + + + + + (paragraph) marks paragraphs in prose. [3.1. Paragraphs 7.2.5. Speech Contents] + + + + + + + + + + + + + + + (foreign) identifies a word or phrase as belonging to some language other than that of the surrounding text. [3.3.2.1. Foreign Words or Expressions] + + + + + + + + + + + + (emphasized) marks words or phrases which are stressed or emphasized for linguistic or rhetorical effect. [3.3.2.2. Emphatic Words and Phrases 3.3.2. Emphasis, Foreign Words, and Unusual Language] + + + + + + + + + + + + (highlighted) marks a word or phrase as graphically distinct from the surrounding text, for reasons concerning which no claim is made. [3.3.2.2. Emphatic Words and Phrases 3.3.2. Emphasis, Foreign Words, and Unusual Language] + + + + + + + + + + + + + identifies any word or phrase which is regarded as linguistically distinct, for example as archaic, technical, dialectal, non-preferred, etc., or as forming part of a sublanguage. [3.3.2.3. Other Linguistically Distinct Material] + + + + + + + + + specifies the sublanguage or register to which the word or phrase is being assigned + + + + + + + + + + specifies how the phrase is distinct diachronically + + + + + specifies how the phrase is distinct diatopically + + + + + specifies how the phrase is distinct diastratically + + + + + + + + + (speech or thought) indicates passages thought or spoken aloud, whether explicitly indicated in the source or not, whether directly or indirectly reported, whether by real people or fictional characters. [3.3.3. Quotation] + + + + + + + + + may be used to indicate whether the quoted matter is regarded as having been vocalized or signed. + + + + + + + + + + + + + + + + + + + + + + + may be used to indicate whether the quoted matter is regarded as direct or indirect speech. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + (quotation) contains a phrase or passage attributed by the narrator or author to some agency external to the text. [3.3.3. Quotation 4.3.1. Grouped Texts] + + + + + + + + + + + + + + + (quoted) contains material which is distinguished from the surrounding text using quotation marks or a similar method, for any one of a variety of reasons including, but not limited to: direct speech or thought, technical terms or jargon, authorial distance, quotations from elsewhere, and passages that are mentioned but not used. [3.3.3. Quotation] + + + + + + + + + (type) may be used to indicate whether the offset passage is spoken or thought, or to characterize it more finely. +Suggested values include: 1] spoken (spoken); 2] thought (thought); 3] written (written); 4] soCalled (so called); 5] foreign (foreign); 6] distinct (distinct); 7] term; 8] emph (emph); 9] mentioned (mentioned) + + + + + + + + (spoken) representation of speech + + + + + + + + + (thought) representation of thought, e.g. internal monologue + + + + + + + + + (written) quotation from a written source + + + + + + + + + (so called) authorial distance + + + + + + + + + (foreign) + + + + + + + + + (distinct) linguistically distinct + + + + + + + + + technical term + + + + + + + + + (emph) rhetorically emphasized + + + + + + + + + (mentioned) refering to itself, not its normal referent + + + + + + + + + + + + + + + + + + + (cited quotation) contains a quotation from some other document, together with a bibliographic reference to its source. In a dictionary it may contain an example text with at least one occurrence of the word form, used in the sense being described, or a translation of the headword, or an example. [3.3.3. Quotation 4.3.1. Grouped Texts 9.3.5.1. Examples] + + + + + + + + + + + + + + + + + + + + marks words or phrases mentioned, not used. [3.3.3. 
Quotation] + + + + + + + + + + + + (so called) contains a word or phrase for which the author or narrator indicates a disclaiming of responsibility, for example by the use of scare quotes or italics. [3.3.3. Quotation] + + + + + + + + + + + + (description) contains a short description of the purpose, function, or use of its parent element, or when the parent is a documentation element, describes or defines the object being documented. [22.4.1. Description of Components] + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Suggested values include: 1] deprecationInfo (deprecation information) + + + + + + + + (deprecation + information) This element describes why or how its parent element is being deprecated, typically including recommendations for alternate encoding. + + + + + + + + + + + + + + + + + + + (gloss) identifies a phrase or word used to provide a gloss or definition for some other word or phrase. [3.4.1. Terms and Glosses 22.4.1. Description of Components] + + + + + + + + + + + + + + + + + (term) contains a single-word, multi-word, or symbolic designation which is regarded as a technical term. [3.4.1. Terms and Glosses] + + + + + + + + + + + + + + + + + + (ruby container) contains a passage of base text along with its associated ruby gloss(es). [3.4.2. Ruby Annotations] + + + + + + + + + + + + + (ruby base) contains the base text annotated by a ruby gloss. [3.4.2. Ruby Annotations] + + + + + + + + + + + + + (ruby text) contains a ruby text, an annotation closely associated with a passage of the main text. [3.4.2. Ruby Annotations] + + + + + + + + + + + supplies a pointer to the base being glossed by this ruby text. + + + + + points to the starting point of the span of text being glossed by this ruby text. + + + + + points to the ending point of the span of text being glossed. + + + + + + + + + (Latin for thus or so) contains text reproduced although apparently incorrect or inaccurate. 
[3.5.1. Apparent Errors] + + + + + + + + + + + + (correction) contains the correct form of a passage apparently erroneous in the copy text. [3.5.1. Apparent Errors] + + + + + + + + + + + + + + (choice) groups a number of alternative encodings for the same point in a text. [3.5. Simple Editorial Changes] + + + + + + + + + + + + (regularization) contains a reading which has been regularized or normalized in some sense. [3.5.2. Regularization and +Normalization 12. Critical Apparatus] + + + + + + + + + + + + + + (original form) contains a reading which is marked as following the original, rather than being normalized or corrected. [3.5.2. Regularization and +Normalization 12. Critical Apparatus] + + + + + + + + + + + + (gap) indicates a point where material has been omitted in a transcription, whether for editorial reasons described in the TEI header, as part of sampling practice, or because the material is illegible, invisible, or inaudible. [3.5.3. Additions, Deletions, and Omissions] + + + + + + + + + + + + + (reason) gives the reason for omission +Suggested values include: 1] cancelled (cancelled); 2] deleted (deleted); 3] editorial (editorial); 4] illegible (illegible); 5] inaudible (inaudible); 6] irrelevant (irrelevant); 7] sampling (sampling) + + + + + + + + + + + + (cancelled) + + + + + + + + + (deleted) + + + + + + + + + (editorial) for features omitted from transcription due to editorial policy + + + + + + + + + (illegible) + + + + + + + + + (inaudible) + + + + + + + + + (irrelevant) + + + + + + + + + (sampling) + + + + + + + + + + + + + + + + + + + + (agent) in the case of text omitted because of damage, categorizes the cause of the damage, if it can be identified. +Sample values include: 1] rubbing (rubbing); 2] mildew (mildew); 3] smoke (smoke) + + + + + + + + + + + + (deliberately marked omission) indicates a purposeful marking in the source document signalling that content has been omitted, and may also supply or describe the omitted content. [3.5.3. 
Additions, Deletions, and Omissions] + + + + + + + + + + + + + + + (addition) contains letters, words, or phrases inserted in the source text by an author, scribe, or a previous annotator or corrector. [3.5.3. Additions, Deletions, and Omissions] + + + + + + + + + + + + + + + + (deletion) contains a letter, word, or passage deleted, marked as deleted, or otherwise indicated as superfluous or spurious in the copy text by an author, scribe, or a previous annotator or corrector. [3.5.3. Additions, Deletions, and Omissions] + + + + + + + + + + + + + + + (unclear) contains a word, phrase, or passage which cannot be transcribed with certainty because it is illegible or inaudible in the source. [11.3.3.1. Damage, Illegibility, and Supplied Text 3.5.3. Additions, Deletions, and Omissions] + + + + + + + + + + indicates why the material is hard to transcribe. +Suggested values include: 1] illegible (illegible); 2] inaudible (inaudible); 3] faded (faded); 4] background_noise (background_noise); 5] eccentric_ductus (eccentric_ductus) + + + + + + + + + + + + (illegible) + + + + + + + + + (inaudible) + + + + + + + + + (faded) + + + + + + + + + (background_noise) + + + + + + + + + (eccentric_ductus) indicates illegibility due to an unusual, awkward, or incompetent execution of a glyph or glyphs + + + + + + + + + + + + + + + + + + + + Where the difficulty in transcription arises from damage, categorizes the cause of the damage, if it can be identified. +Sample values include: 1] rubbing; 2] mildew; 3] smoke + + + + + + + + + + + + + + (name, proper noun) contains a proper noun or noun phrase. [3.6.1. Referring Strings] + + + + + + + + + + + + + + + + (referencing string) contains a general purpose name or referring string. [13.2.1. Personal Names 3.6.1. Referring Strings] + + + + + + + + + + + + + + (electronic mail address) contains an email address identifying a location to which email messages can be delivered. [3.6.2. 
Addresses] + + + + + + + + + + + + (address) contains a postal address, for example of a publisher, an organization, or an individual. [3.6.2. Addresses 2.2.4. Publication, Distribution, Licensing, etc. 3.12.2.4. Imprint, Size of a Document, and Reprint Information] + + + + + + + + + + + + + + + (address line) contains one line of a postal address. [3.6.2. Addresses 2.2.4. Publication, Distribution, Licensing, etc. 3.12.2.4. Imprint, Size of a Document, and Reprint Information] + + + + + + + + + + + + contains a full street address including any name or number identifying a building as well as the name of the street or route on which it is located. [3.6.2. Addresses] + + + + + + + + + + + + (postal code) contains a numerical or alphanumeric code used as part of a postal address to simplify sorting or delivery of mail. [3.6.2. Addresses] + + + + + + + + (postal box or post office box) contains a number or other identifier for some postal delivery point other than a street address. [3.6.2. Addresses] + + + + + + + + (number) contains a number, written in any form. [3.6.3. Numbers and +Measures] + + + + + + + + + + indicates the type of numeric value. +Suggested values include: 1] cardinal; 2] ordinal; 3] fraction; 4] percentage + + + + + + + + absolute number, e.g. 21, 21.5 + + + + + + + + + ordinal number, e.g. 21st + + + + + + + + + fraction, e.g. one half or three-quarters + + + + + + + + + a percentage + + + + + + + + + + + + + + + supplies the value of the number in standard form. + + + + + + + + + + + + + + + + + + (measure) contains a word or phrase referring to some quantity of an object or commodity, usually comprising a number, a unit, and a commodity name. [3.6.3. Numbers and +Measures] + + + + + + + + + + specifies the type of measurement in any convenient typology. + + + + + + + + + + + + + + (measure group) contains a group of dimensional specifications which relate to the same object, for example the height and width of a manuscript page. [10.3.4. 
Dimensions] + + + + + + + + + + + + + + contains a symbol, a word or a phrase referring to a unit of measurement in any kind of formal or informal system. [3.6.3. Numbers and +Measures] + + + + + + + + + + + + + + (date) contains a date in any format. [3.6.4. Dates and Times 2.2.4. Publication, Distribution, Licensing, etc. 2.6. The Revision Description 3.12.2.4. Imprint, Size of a Document, and Reprint Information 15.2.3. The Setting Description 13.4. Dates] + + + + + + + + + + + + + + + + + + + (time) contains a phrase defining a time of day in any format. [3.6.4. Dates and Times] + + + + + + + + + + + + + + + + + + + (abbreviation) contains an abbreviation of any sort. [3.6.5. Abbreviations and Their Expansions] + + + + + + + + + (type) allows the encoder to classify the abbreviation according to some convenient typology. +Sample values include: 1] suspension (suspension); 2] contraction (contraction); 3] brevigraph; 4] superscription (superscription); 5] acronym (acronym); 6] title (title); 7] organization (organization); 8] geographic (geographic) + + + + + + + + + + + + + + (expansion) contains the expansion of an abbreviation. [3.6.5. Abbreviations and Their Expansions] + + + + + + + + + + + + + (pointer) defines a pointer to another location. [3.7. Simple Links and Cross-References 16.1. Links] + + + + + + + + + + + + + (reference) defines a reference to another location, possibly modified by additional text or comment. [3.7. Simple Links and Cross-References 16.1. Links] + + + + + + + + + + + + + + + + + (list) contains any sequence of items organized as a list. [3.8. Lists] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (type) describes the nature of the items in the list. 
+Suggested values include: 1] gloss (gloss); 2] index (index); 3] instructions (instructions); 4] litany (litany); 5] syllogism (syllogism) + + + + + + + + + + + + + (index) each list item is an entry in an index such as the alphabetical topical index at the back of a print volume. + + + + + + + + + (instructions) each list item is a step in a sequence of instructions, as in a recipe. + + + + + + + + + (litany) each list item is one of a sequence of petitions, supplications or invocations, typically in a religious ritual. + + + + + + + + + (syllogism) each list item is part of an argument consisting of two or more propositions and a final conclusion derived from them. + + + + + + + + + + + + + + + + + (item) contains one component of a list. [3.8. Lists 2.6. The Revision Description] + + + + + + + + + + + + + (label) contains any label or heading used to identify part of a text, typically but not exclusively in a list or glossary. [3.8. Lists] + + + + + + + + + + + + + + + (heading) contains any type of heading, for example the title of a section, or the heading of a list, glossary, manuscript description, etc. [4.2.1. Headings and Trailers] + + + + + + + + + + + + + + + + + + + (heading for list labels) contains the heading for the label or term column in a glossary list or similar structured list. [3.8. Lists] + + + + + + + + + + + + (heading for list items) contains the heading for the item or gloss column in a glossary list or similar structured list. [3.8. Lists] + + + + + + + + + + + + (note) contains a note or annotation. [3.9.1. Notes and Simple Annotation 2.2.6. The Notes Statement 3.12.2.8. Notes and Statement of Language 9.3.5.4. Notes within Entries] + + + + + + + + + + + + + + + + + contains a group of notes [3.9.1.1. Encoding Grouped Notes] + + + + + + + + + + + + + + + + + + + + (index entry) marks a location to be indexed for whatever purpose. [3.9.2. 
Index Entries] + + + + + + + + + + + a single word which follows the rules defining a legal XML name (see ), supplying a name to specify which index (of several) the index entry belongs to. + + + + + + + indicates the location of any form of external media such as an audio or video clip etc. [3.10. Graphics and Other Non-textual Components] + + + + + + + + + + + + + + (MIME media type) specifies the applicable multimedia internet mail extension (MIME) media type + + + + + + + + + + + + + + + + + + + + + (graphic) indicates the location of a graphic or illustration, either forming part of a text, or providing an image of it. [3.10. Graphics and Other Non-textual Components 11.1. Digital Facsimiles] + + + + + + + + + + + + provides encoded binary data representing an inline graphic, audio, video or other object. [3.10. Graphics and Other Non-textual Components] + + + + + + + + + + + + + + + + + + + + + + + + + + + (milestone) marks a boundary point separating any kind of section of a text, typically but not necessarily indicating a point at which some part of a standard reference system changes, where the change is not represented by a structural element. [3.11.3. Milestone +Elements] + + + + + + + + + + + + + (gathering beginning) marks the beginning of a new gathering or quire in a transcribed codex. [3.11.3. Milestone +Elements] + + + + + + + + + + + + (page beginning) marks the beginning of a new page in a paginated document. [3.11.3. Milestone +Elements] + + + + + + + + + + + + (line beginning) marks the beginning of a new (typographic) line in some edition or version of a text. [3.11.3. Milestone +Elements 7.2.5. Speech Contents] + + + + + + + + + + + + (column beginning) marks the beginning of a new column of a text on a multi-column page. [3.11.3. Milestone +Elements] + + + + + + + + + + + + (analytic level) contains bibliographic elements describing an item (e.g. 
an article or poem) published within a monograph or journal and not as an independent publication. [3.12.2.1. Analytic, Monographic, and Series Levels] + + + + + + + + + + + + + + + + + + + (monographic level) contains bibliographic elements describing an item (e.g. a book or journal) published as an independent item (i.e. as a separate physical object). [3.12.2.1. Analytic, Monographic, and Series Levels] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (series information) contains information about the series in which a book or other bibliographic item has appeared. [3.12.2.1. Analytic, Monographic, and Series Levels] + + + + + + + + + + + + + + + + + + + + (author) in a bibliographic reference, contains the name(s) of an author, personal or corporate, of a work; for example in the same form as that provided by a recognized bibliographic name authority. [3.12.2.2. Titles, Authors, and Editors 2.2.1. The Title Statement] + + + + + + + + + + + + + + contains a secondary statement of responsibility for a bibliographic item, for example the name of an individual, institution or organization, (or of several such) acting as editor, compiler, translator, etc. [3.12.2.2. Titles, Authors, and Editors] + + + + + + + + + + + + + + (statement of responsibility) supplies a statement of responsibility for the intellectual content of a text, edition, recording, or series, where the specialized elements for authors, editors, etc. do not suffice or do not apply. May also be used to encode information about individuals or organizations which have played a role in the production or distribution of a bibliographic work. [3.12.2.2. Titles, Authors, and Editors 2.2.1. The Title Statement 2.2.2. The Edition Statement 2.2.5. 
The Series Statement] + + + + + + + + + + + + + + + + + + + + + + (responsibility) contains a phrase describing the nature of a person's intellectual responsibility, or an organization's role in the production or distribution of a work. [3.12.2.2. Titles, Authors, and Editors 2.2.1. The Title Statement 2.2.2. The Edition Statement 2.2.5. The Series Statement] + + + + + + + + + + + + + + (title) contains a title for any kind of work. [3.12.2.2. Titles, Authors, and Editors 2.2.1. The Title Statement 2.2.5. The Series Statement] + + + + + + + + + + + classifies the title according to some convenient typology. +Sample values include: 1] main; 2] sub (subordinate); 3] alt (alternate); 4] short; 5] desc (descriptive) + + + + + + + + + + indicates the bibliographic level for a title, that is, whether it identifies an article, book, journal, series, or unpublished material. + + + + + + (analytic) the title applies to an analytic item, such as an article, poem, or other work published as part of a larger item. + + + + + (monographic) the title applies to a monograph such as a book or other item considered to be a distinct publication, including single volumes of multi-volume works + + + + + (journal) the title applies to any serial or periodical publication such as a journal, magazine, or newspaper + + + + + (series) the title applies to a series of otherwise distinct publications such as a collection + + + + + (unpublished) the title applies to any unpublished material (including theses and dissertations unless published by a commercial press) + + + + + + + + + + + + contains the formalized descriptive title for a meeting or conference, for use in a bibliographic description for an item derived from such a meeting, or as a heading or preamble to publications emanating from it. [3.12.2.2. Titles, Authors, and Editors] + + + + + + + + + + + + + + groups information relating to the publication or distribution of a bibliographic item. [3.12.2.4. 
Imprint, Size of a Document, and Reprint Information] + + + + + + + + + + + + + + + + + + + + + + (publisher) provides the name of the organization responsible for the publication or distribution of a bibliographic item. [3.12.2.4. Imprint, Size of a Document, and Reprint Information 2.2.4. Publication, Distribution, Licensing, etc.] + + + + + + + + + + + + + (scope of bibliographic reference) defines the scope of a bibliographic reference, for example as a list of page numbers, or a named subdivision of a larger work. [3.12.2.5. Scopes and Ranges in Bibliographic Citations] + + + + + + + + + + + + + (cited range) defines the range of cited content, often represented by pages or other units [3.12.2.5. Scopes and Ranges in Bibliographic Citations] + + + + + + + + + + + + + + (publication place) contains the name of the place where a bibliographic item was published. [3.12.2.4. Imprint, Size of a Document, and Reprint Information] + + + + + + + + + + + + + (bibliographic citation) contains a loosely-structured bibliographic citation of which the sub-components may or may not be explicitly tagged. [3.12.1. Methods of Encoding Bibliographic References and Lists of References 2.2.7. The Source Description 15.3.2. Declarable Elements] + + + + + + + + + + + + + + + + + + + + + + (structured bibliographic citation) contains a structured bibliographic citation, in which only bibliographic sub-elements appear and in a specified order. [3.12.1. Methods of Encoding Bibliographic References and Lists of References 2.2.7. The Source Description 15.3.2. Declarable Elements] + + + + + + + + + + + + + + + + + + + + + + + + + (citation list) contains a list of bibliographic citations of any kind. [3.12.1. Methods of Encoding Bibliographic References and Lists of References 2.2.7. The Source Description 15.3.2. 
Declarable Elements] + + + + + + + + + + + + + + + + + + + + + + + + + + + + contains or references some other bibliographic item which is related to the present one in some specified manner, for example as a constituent or alternative version of it. [3.12.2.7. Related Items] + + + + + + + + + + + points to the related bibliographic element by means of an absolute or relative URI reference + + + + + + + (verse line) contains a single, possibly incomplete, line of verse. [3.13.1. Core Tags for Verse 3.13. Passages of Verse or Drama 7.2.5. Speech Contents] + + + + + + + + + + + + + + + + + (line group) contains one or more verse lines functioning as a formal unit, e.g. a stanza, refrain, verse paragraph, etc. [3.13.1. Core Tags for Verse 3.13. Passages of Verse or Drama 7.2.5. Speech Contents] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (speech) contains an individual speech in a performance text, or a passage presented as such in a prose or verse text. [3.13.2. Core Tags for Drama 3.13. Passages of Verse or Drama 7.2.2. Speeches and Speakers] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contains a specialized form of heading or label, giving the name of one or more speakers in a dramatic text or fragment. [3.13.2. Core Tags for Drama] + + + + + + + + + + + + (stage direction) contains any kind of stage direction within a dramatic text or fragment. [3.13.2. Core Tags for Drama 3.13. Passages of Verse or Drama 7.2.4. Stage Directions] + + + + + + + + + + indicates the kind of stage direction. +Suggested values include: 1] setting; 2] entrance; 3] exit; 4] business; 5] novelistic; 6] delivery; 7] modifier; 8] location; 9] mixed + + + + + + + + + + describes a setting. + + + + + + + + + describes an entrance. + + + + + + + + + describes an exit. + + + + + + + + + describes stage business. + + + + + + + + + is a narrative, motivating stage direction. + + + + + + + + + describes how a character speaks. 
+ + + + + + + + + gives some detail about a character. + + + + + + + + + describes a location. + + + + + + + + + more than one of the above + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (version) specifies the version number of the TEI Guidelines against which this document is valid. + + + + + + + + + + + + (automatically generated text division) indicates the location at which a textual division generated automatically by a text-processing application is to appear. [3.9.2. Index Entries] + + + + + + + + specifies what type of generated text division (e.g. index, table of contents, etc.) is to appear. +Sample values include: 1] index; 2] toc; 3] figlist; 4] tablist + + + + + + + + + + + + (text language) describes the languages and writing systems identified within the bibliographic work being described, rather than its description. [3.12.2.4. Imprint, Size of a Document, and Reprint Information 10.6.6. Languages and Writing Systems] + + + + + + + + (main language) supplies a code which identifies the chief language used in the bibliographic work. + + + + + + + + + + + + + + + + + + (other languages) one or more codes identifying any other languages used in the bibliographic work. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + specifies a regular expression against which the values of other attributes can be matched. + + + + + + + + + (TEI header) supplies descriptive and declarative metadata associated with a digital resource or set of resources. [2.1.1. The TEI Header and Its Components 15.1. Varieties of Composite Text] + + + + + + + + + + + + + (file description) contains a full bibliographic description of an electronic file. [2.2. The File Description 2.1.1. The TEI Header and Its Components] + + + + + + + + + + + + + + + + + + + (title statement) groups information about the title of a work and those responsible for its content. [2.2.1. The Title Statement 2.2. 
The File Description] + + + + + + + + + + + + (sponsor) specifies the name of a sponsoring organization or institution. [2.2.1. The Title Statement] + + + + + + + + + + + + + + (funding body) specifies the name of an individual, institution, or organization responsible for the funding of a project or text. [2.2.1. The Title Statement] + + + + + + + + + + + + + + (principal researcher) supplies the name of the principal researcher responsible for the creation of an electronic text. [2.2.1. The Title Statement] + + + + + + + + + + + + + + (edition statement) groups information relating to one edition of a text. [2.2.2. The Edition Statement 2.2. The File Description] + + + + + + + + + + + + + + + (edition) describes the particularities of one edition of a text. [2.2.2. The Edition Statement] + + + + + + + + + + + + (extent) describes the approximate size of a text stored on some carrier medium or of some other object, digital or non-digital, specified in any convenient units. [2.2.3. Type and Extent of File 2.2. The File Description 3.12.2.4. Imprint, Size of a Document, and Reprint Information 10.7.1. Object Description] + + + + + + + + + + + + (publication statement) groups information concerning the publication or distribution of an electronic or other text. [2.2.4. Publication, Distribution, Licensing, etc. 2.2. The File Description] + + + + + + + + + + + + + + + (distributor) supplies the name of a person or other agency responsible for the distribution of a text. [2.2.4. Publication, Distribution, Licensing, etc.] + + + + + + + + + + + + + (release authority) supplies the name of a person or other agency responsible for making a work available, other than a publisher or distributor. [2.2.4. Publication, Distribution, Licensing, etc.] + + + + + + + + + + + + + (identifier) supplies any form of identifier used to identify some object, such as a bibliographic item, a person, a title, an organization, etc. in a standardized way. [13.3.1. Basic Principles 2.2.4. 
Publication, Distribution, Licensing, etc. 2.2.5. The Series Statement 3.12.2.4. Imprint, Size of a Document, and Reprint Information] + + + + + + + + + + + + + categorizes the identifier, for example as an ISBN, Social Security number, etc. +Suggested values include: 1] ISBN; 2] ISSN; 3] DOI; 4] URI; 5] VIAF; 6] ESTC; 7] OCLC + + + + + + + + + + + + + International Standard Serial Number: an eight-digit number to uniquely identify a serial publication. + + + + + + + + + Digital Object Identifier: a unique string of letters and numbers assigned to an electronic document. + + + + + + + + + Uniform Resource Identifier: a string of characters to uniquely identify a resource which usually contains indication of the means of accessing that resource, the name of its host, and its filepath. + + + + + + + + + A data number in the Virtual Internet Authority File assigned to link different names in catalogs around the world for the same entity. + + + + + + + + + English Short-Title Catalogue number: an identifying number assigned to a document in English printed in the British Isles or North America before 1801. + + + + + + + + + OCLC control number (record number) for the union catalog record in WorldCat, a union catalog for member libraries in the Online Computer Library Center global cooperative. + + + + + + + + + + + + + + + + + (availability) supplies information about the availability of a text, for example any restrictions on its use or distribution, its copyright status, any licence applying to it, etc. [2.2.4. Publication, Distribution, Licensing, etc.] + + + + + + + + + + + (status) supplies a code identifying the current availability of the text. + + + + + + (free) the text is freely available. + + + + + (unknown) the status of the text is unknown. + + + + + (restricted) the text is not freely available. + + + + + + + + + + contains information about a licence or other legal agreement applicable to the text. [2.2.4. Publication, Distribution, Licensing, etc.] 
+ + + + + + + + + + + + + + (series statement) groups information about the series, if any, to which a publication belongs. [2.2.5. The Series Statement 2.2. The File Description] + + + + + + + + + + + + + + + + + + + + + + + (notes statement) collects together any notes providing information about a text additional to that recorded in other parts of the bibliographic description. [2.2.6. The Notes Statement 2.2. The File Description] + + + + + + + + + + + + (source description) describes the source(s) from which an electronic text was derived or generated, typically a bibliographic description in the case of a digitized text, or a phrase such as "born digital" for a text which has no previous existence. [2.2.7. The Source Description] + + + + + + + + + + + + + + + + + (fully-structured bibliographic citation) contains a fully-structured bibliographic citation, in which all components of the TEI file description are present. [3.12.1. Methods of Encoding Bibliographic References and Lists of References 2.2. The File Description 2.2.7. The Source Description 15.3.2. Declarable Elements] + + + + + + + + + + + + + + + + + + + + + + + + + + + + (encoding description) documents the relationship between an electronic text and the source or sources from which it was derived. [2.3. The Encoding Description 2.1.1. The TEI Header and Its Components] + + + + + + + + + + + + (schema reference) describes or points to a related customization or schema file [2.3.10. The Schema Specification] + + + + + + + + + the identifier used for the customization or schema + + + + + + + (project description) describes in detail the aim or purpose for which an electronic file was encoded, together with any other relevant information concerning the process by which it was assembled or collected. [2.3.1. The Project Description 2.3. The Encoding Description 15.3.2. 
Declarable Elements] + + + + + + + + + + (sampling declaration) contains a prose description of the rationale and methods used in sampling texts in the creation of a corpus or collection. [2.3.2. The Sampling Declaration 2.3. The Encoding Description 15.3.2. Declarable Elements] + + + + + + + + + + (editorial practice declaration) provides details of editorial principles and practices applied during the encoding of a text. [2.3.3. The Editorial Practices Declaration 2.3. The Encoding Description 15.3.2. Declarable Elements] + + + + + + + + + + + + + (correction principles) states how and under what circumstances corrections have been made in the text. [2.3.3. The Editorial Practices Declaration 15.3.2. Declarable Elements] + + + + + + + + indicates the degree of correction applied to the text. + + + + + + the text has been thoroughly checked and proofread. + + + + + the text has been checked at least once. + + + + + the text has not been checked. + + + + + the correction status of the text is unknown. + + + + + + + + indicates the method adopted to indicate corrections within the text. + + + + + + corrections have been made silently + + + + + corrections have been represented using markup + + + + + + + + + + (normalization) indicates the extent of normalization or regularization of the original source carried out in converting it to electronic form. [2.3.3. The Editorial Practices Declaration 15.3.2. Declarable Elements] + + + + + + + + indicates the method adopted to indicate normalizations within the text. + + + + + + normalization made silently + + + + + normalization represented using markup + + + + + + + + + + (quotation) specifies editorial practice adopted with respect to quotation marks in the original. [2.3.3. The Editorial Practices Declaration 15.3.2. Declarable Elements] + + + + + + + + (quotation marks) indicates whether or not quotation marks have been retained as content within the text. 
+ + + + + + no quotation marks have been retained + + + + + some quotation marks have been retained + + + + + all quotation marks have been retained + + + + + + + + + + (hyphenation) summarizes the way in which hyphenation in a source text has been treated in an encoded version of it. [2.3.3. The Editorial Practices Declaration 15.3.2. Declarable Elements] + + + + + + + + (end-of-line) indicates whether or not end-of-line hyphenation has been retained in a text. + + + + + + all end-of-line hyphenation has been retained, even though the lineation of the original may not have been. + + + + + end-of-line hyphenation has been retained in some cases. + + + + + all soft end-of-line hyphenation has been removed: any remaining end-of-line hyphenation should be retained. + + + + + all end-of-line hyphenation has been removed: any remaining hyphenation occurred within the line. + + + + + + + + + + (segmentation) describes the principles according to which the text has been segmented, for example into sentences, tone-units, graphemic strata, etc. [2.3.3. The Editorial Practices Declaration 15.3.2. Declarable Elements] + + + + + + + + + + (standard values) specifies the format used when standardized date or number values are supplied. [2.3.3. The Editorial Practices Declaration 15.3.2. Declarable Elements] + + + + + + + + + + (interpretation) describes the scope of any analytic or interpretive information added to the text in addition to the transcription. [2.3.3. The Editorial Practices Declaration] + + + + + + + + + + specifies editorial practice adopted with respect to punctuation marks in the original. [2.3.3. The Editorial Practices Declaration 3.2. Treatment of Punctuation] + + + + + + + + indicates whether or not punctuation marks have been retained as content within the text. 
+ + + + + + no punctuation marks have been retained + + + + + some punctuation marks have been retained + + + + + all punctuation marks have been retained + + + + + + + + indicates the positioning of punctuation marks that are associated with marked up text as being encoded within the element surrounding the text or immediately before or after it. + + + + + + punctuation marks found at the start or end of a marked up text component are included within its surrounding element; + + + + + punctuation marks found at the start or end of a marked up text component appear immediately before or after the surrounding element + + + + + + + + + + (tagging declaration) provides detailed information about the tagging applied to a document. [2.3.4. The Tagging Declaration 2.3. The Encoding Description] + + + + + + + + + + + + + (element usage) documents the usage of a specific element within a specified document. [2.3.4. The Tagging Declaration] + + + + + + + + + specifies the number of occurrences of this element within the text. + + + + + + + + + + (namespace) supplies the formal name of the namespace to which the elements documented by its children belong. [2.3.4. The Tagging Declaration] + + + + + + + + + specifies the full formal name of the namespace concerned. + + + + + + + (rendition) supplies information about the rendition or appearance of one or more elements in the source text. [2.3.4. The Tagging Declaration] + + + + + + + + + where CSS is used, provides a way of defining pseudo-elements, that is, styling rules applicable to specific sub-portions of an element. +Sample values include: 1] first-line; 2] first-letter; 3] before; 4] after + + + + + + + + + + + + + + + (style definition language declaration) specifies the name of the formal language in which style or renditional information is supplied elsewhere in the document. The specific version of the scheme may also be supplied. [2.3.5. 
The Default Style Definition Language Declaration] + + + + + + + + + + + (references declaration) specifies how canonical references are constructed for this text. [2.3.6.3. Milestone Method 2.3. The Encoding Description 2.3.6. The Reference System Declaration] + + + + + + + + + + + + + + + (citation structure) declares a structure and method for citing the current document. [3.11.4. Declaring Reference Systems 16.2.5.4. Citation Structures] + + + + + + + + + + + + (delimiter) supplies a delimiting string preceding the structural component. + + + + + + + + + + + + + + + + + + + + (citation data) specifies how information may be extracted from citation structures. [3.11.4. Declaring Reference Systems 16.2.5.4. Citation Structures] + + + + + + + (property) A URI indicating a property definition. + + + + + + + (canonical reference pattern) specifies an expression and replacement pattern for transforming a canonical reference into a URI. [2.3.6.3. Milestone Method 2.3.6. The Reference System Declaration 2.3.6.2. Search-and-Replace Method] + + + + + + + + + + (prefix definition) defines a prefixing scheme used in teidata.pointer values, showing how abbreviated URIs using the scheme may be expanded into full URIs. [16.2.3. Using Abbreviated Pointers] + + + + + + + + supplies a name which functions as the prefix for an abbreviated pointing scheme such as a private URI scheme. The prefix constitutes the text preceding the first colon. + + + + + + + + + + + + (list of prefix definitions) contains a list of definitions of prefixing schemes used in teidata.pointer values, showing how abbreviated URIs using each scheme may be expanded into full URIs. [16.2.3. Using Abbreviated Pointers] + + + + + + + + + + + + + + + (reference state) specifies one component of a canonical reference defined by the milestone method. [2.3.6.3. Milestone Method 2.3.6. The Reference System Declaration] + + + + + + + + specifies the fixed length of the reference component. 
+ + + + + (delimiter) supplies a delimiting string following the reference component. + + + + + + + (classification declarations) contains one or more taxonomies defining any classificatory codes used elsewhere in the text. [2.3.7. The Classification Declaration 2.3. The Encoding Description] + + + + + + + + + + + (taxonomy) defines a typology either implicitly, by means of a bibliographic citation, or explicitly by a structured taxonomy. [2.3.7. The Classification Declaration] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (category) contains an individual descriptive category, possibly nested within a superordinate category, within a user-defined taxonomy. [2.3.7. The Classification Declaration] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + supplies a commonly used code name for the datum employed. +Suggested values include: 1] WGS84 (World Geodetic System); 2] MGRS (Military Grid Reference System); 3] OSGB36 (ordnance survey great britain); 4] ED50 (European Datum coordinate system) + + + + + + + + (World Geodetic System) a pair of numbers to be interpreted as latitude followed by longitude according to the World Geodetic System. + + + + + + + + + (Military Grid Reference System) the values supplied are geospatial entity object codes, based on + + + + + + + + + (ordnance survey great britain) the value supplied is to be interpreted as a British National Grid Reference. + + + + + + + + + (European Datum coordinate system) the value supplied is to be interpreted as latitude followed by longitude according to the European Datum coordinate system. + + + + + + + + + + + + + + + + + + + (unit declarations) provides information about units of measurement that are not members of the International System of Units. [2.3.9. The Unit Declaration] + + + + + + + + + + + + + (unit definition) contains descriptive information related to a specific unit of measurement. [2.3.9. 
The Unit Declaration] + + + + + + + + + + + + + + + + + defines how to calculate one unit of measure in terms of another. [2.3.9. The Unit Declaration] + + + + + + + + + + + + + (application information) records information about an application which has edited the TEI file. [2.3.11. The Application Information Element] + + + + + + + + + provides information about an application which has acted upon the document. [2.3.11. The Application Information Element] + + + + + + + + + + + + + + + supplies an identifier for the application, independent of its version number or display name. + + + + + supplies a version number for the application, independent of its identifier or display name. + + + + + + + + + + + + (text-profile description) provides a detailed description of non-bibliographic aspects of a text, specifically the languages and sublanguages used, the situation in which it was produced, the participants and their setting. [2.4. The Profile Description 2.1.1. The TEI Header and Its Components] + + + + + + + + + (note on hand) describes a particular style or hand distinguished within a manuscript. [10.7.2. Writing, Decoration, and Other Notations] + + + + + + + + + + + + + contains a summary or formal abstract prefixed to an existing source document by the encoder. [2.4.4. Abstracts] + + + + + + + + + + + + (creation) contains information about the creation of a text. [2.4.1. Creation 2.4. The Profile Description] + + + + + + + + + + + + + (language usage) describes the languages, sublanguages, registers, dialects, etc. represented within a text. [2.4.2. Language Usage 2.4. The Profile Description 15.3.2. Declarable Elements] + + + + + + + + + + + + + (language) characterizes a single language or sublanguage used within a text. [2.4.2. Language Usage] + + + + + + + + + + + + + + + + + + + + + + + specifies the approximate percentage (by volume) of the text which uses this language. 
+ + + + + + + + + (text classification) groups information which describes the nature or topic of a text in terms of a standard classification scheme, thesaurus, etc. [2.4.3. The Text Classification] + + + + + + + + + + + + + + (keywords) contains a list of keywords or phrases identifying the topic or nature of a text. [2.4.3. The Text Classification] + + + + + + + + + + + + + (classification code) contains the classification code used for this text in some standard classification system. [2.4.3. The Text Classification] + + + + + + + + + + + + + (category reference) specifies one or more defined categories within some taxonomy or text typology. [2.4.3. The Text Classification] + + + + + + + + + + (calendar description) contains a description of the calendar system used in any dating expression found in the text. [2.4. The Profile Description 2.4.5. Calendar Description] + + + + + + + + + + + (calendar) describes a calendar or dating system used in a dating formula in the text. [2.4.5. Calendar Description] + + + + + + + + + + (correspondence + description) contains a description of the actions related to one act of correspondence. [2.4.6. Correspondence Description] + + + + + + + + + + + + + + + (correspondence action) contains a structured description of the place, the name of a person/organization and the date related to the sending/receiving of a message or any other action related to the correspondence. [2.4.6. Correspondence Description] + + + + + + + + + + + + describes the nature of the action. +Suggested values include: 1] sent; 2] received; 3] transmitted; 4] redirected; 5] forwarded + + + + + + + + information concerning the sending or dispatch of a message. + + + + + + + + + information concerning the receipt of a message. + + + + + + + + + information concerning the transmission of a message, i.e. between the dispatch and the next receipt, redirect or forwarding. + + + + + + + + + information concerning the redirection of an unread message. 
+ + + + + + + + + information concerning the forwarding of a message. + + + + + + + + + + + + + + + + + (correspondence context) provides references to preceding or following correspondence related to this piece of correspondence. [2.4.6. Correspondence Description] + + + + + + + + + (non-TEI metadata) provides a container element into which metadata in non-TEI formats may be placed. [2.5. Non-TEI Metadata] + + + + + + + + + + + (revision description) summarizes the revision history for a file. [2.6. The Revision Description 2.1.1. The TEI Header and Its Components] + + + + + + + + + + + + + + (change) documents a change or set of changes made during the production of a source document, or during the revision of an electronic file. [2.6. The Revision Description 2.4.1. Creation 11.7. Identifying Changes and Revisions] + + + + + + + + + + + + (target) points to one or more elements that belong to this change. + + + + + + + + + + + + + + + + + describes a particular script distinguished within the description of a manuscript or similar resource. [10.7.2. Writing, Decoration, and Other Notations] + + + + + + + + + + + + + groups a number of change descriptions associated with either the creation of a source text or the revision of an encoded text. [2.6. The Revision Description 11.7. Identifying Changes and Revisions] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + specifies the version number of the TEI Guidelines against which this document is valid. + + + + + + + + + + + + (text) contains a single text of any kind, whether unitary or composite, for example a poem or drama, a collection of essays, a novel, a dictionary, or a corpus sample. [4. Default Text Structure 15.1. Varieties of Composite Text] + + + + + + + + + + + + + + + + + + + + + + + + + + + (text body) contains the whole body of a single unitary text, excluding any front or back matter. [4. 
Default Text Structure] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (group) contains the body of a composite text, grouping together a sequence of distinct texts (or groups of such texts) which are regarded as a unit for some purpose, for example the collected works of an author, a sequence of prose essays, etc. [4. Default Text Structure 4.3.1. Grouped Texts 15.1. Varieties of Composite Text] + + + + + + + + + + + + + + + + + + + + + + + + + + + + (floating text) contains a single text of any kind, whether unitary or composite, which interrupts the text containing it at any point and after which the surrounding text resumes. [4.3.2. Floating Texts] + + + + + + + + + + + + + + + + + + + + + + + + + + (text division) contains a subdivision of the front, body, or back of a text. [4.1. Divisions of the Body] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (level-1 text division) contains a first-level subdivision of the front, body, or back of a text. [4.1.2. Numbered Divisions] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (level-2 text division) contains a second-level subdivision of the front, body, or back of a text. [4.1.2. Numbered Divisions] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (level-3 text division) contains a third-level subdivision of the front, body, or back of a text. [4.1.2. Numbered Divisions] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (level-4 text division) contains a fourth-level subdivision of the front, body, or back of a text. [4.1.2. Numbered Divisions] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (level-5 text division) contains a fifth-level subdivision of the front, body, or back of a text. [4.1.2. 
Numbered Divisions] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (level-6 text division) contains a sixth-level subdivision of the front, body, or back of a text. [4.1.2. Numbered Divisions] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (level-7 text division) contains the smallest possible subdivision of the front, body or back of a text, larger than a paragraph. [4.1.2. Numbered Divisions] + + + + + + + + + + + + + + + + + + + + + + + + + + + contains a closing title or footer appearing at the end of a division of a text. [4.2.4. Content of Textual Divisions 4.2. Elements Common to All Divisions] + + + + + + + + + + + + + + + + + + + (byline) contains the primary statement of responsibility given for a work on its title page or at the head or end of the work. [4.2.2. Openers and Closers 4.5. Front Matter] + + + + + + + + + + + + + + (dateline) contains a brief description of the place, date, time, etc. of production of a letter, newspaper story, or other work, prefixed or suffixed to it as a kind of heading or trailer. [4.2.2. Openers and Closers] + + + + + + + + + + + + + + (argument) contains a formal list or prose description of the topics addressed by a subdivision of a text. [4.2. Elements Common to All Divisions 4.6. Title Pages] + + + + + + + + + + + + + + + + + + (epigraph) contains a quotation, anonymous or attributed, appearing at the start or end of a section or on a title page. [4.2.3. Arguments, Epigraphs, and Postscripts 4.2. Elements Common to All Divisions 4.6. Title Pages] + + + + + + + + + + + + (opener) groups together dateline, byline, salutation, and similar phrases appearing as a preliminary group at the start of a division, especially of a letter. [4.2. 
Elements Common to All Divisions] + + + + + + + + + + + + + + + + + + + + (closer) groups together salutations, datelines, and similar phrases appearing as a final group at the end of a division, especially of a letter. [4.2.2. Openers and Closers 4.2. Elements Common to All Divisions] + + + + + + + + + + + + + + + + + (salutation) contains a salutation or greeting prefixed to a foreword, dedicatory epistle, or other division of a text, or the salutation in the closing of a letter, preface, etc. [4.2.2. Openers and Closers] + + + + + + + + + + + + + (signature) contains the closing salutation, etc., appended to a foreword, dedicatory epistle, or other division of a text. [4.2.2. Openers and Closers] + + + + + + + + + + + + + contains a postscript, e.g. to a letter. [4.2. Elements Common to All Divisions] + + + + + + + + + + + + + + + + + + + + + + + + (title page) contains the title page of a text, appearing within the front or back matter. [4.6. Title Pages] + + + + + + + + + + + + + + + classifies the title page according to any convenient typology. + + + + + + + + + + + + (document title) contains the title of a document, including all its constituents, as given on a title page. [4.6. Title Pages] + + + + + + + + + + + + + + + + (title part) contains a subsection or division of the title of a work, as indicated on a title page. [4.6. Title Pages] + + + + + + + + + (type) specifies the role of this subdivision of the title. 
+Suggested values include: 1] main (main); 2] sub (subordinate); 3] alt (alternate); 4] short (short); 5] desc (descriptive) + + + + + + + + (main) main title of the work + + + + + + + + + (subordinate) subtitle of the work + + + + + + + + + (alternate) alternative title of the work + + + + + + + + + (short) abbreviated form of title + + + + + + + + + (descriptive) descriptive paraphrase of the work + + + + + + + + + + + + + + + + + + + (document author) contains the name of the author of the document, as given on the title page (often but not always contained in a byline). [4.6. Title Pages] + + + + + + + + + + + + + (imprimatur) contains a formal statement authorizing the publication of a work, sometimes required to appear on a title page or its verso. [4.6. Title Pages] + + + + + + + + + + + + (document edition) contains an edition statement as presented on a title page of a document. [4.6. Title Pages] + + + + + + + + + + + + (document imprint) contains the imprint statement (place and date of publication, publisher name), as given (usually) at the foot of a title page. [4.6. Title Pages] + + + + + + + + + + + + + + + + (document date) contains the date of a document, as given on a title page or in a dateline. [4.6. Title Pages] + + + + + + + + (when) gives the value of the date in standard form, i.e. YYYY-MM-DD. + + + + + + + + + + + + (front matter) contains any prefatory matter (headers, abstracts, title page, prefaces, dedications, etc.) found at the start of a document, before the main body. [4.6. Title Pages 4. Default Text Structure] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (back matter) contains any appendixes, etc. following the main part of a text. [4.7. Back Matter 4. Default Text Structure] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + provides the name of the character or glyph property being defined. 
+ + + + + + + provides the value of the character or glyph property being defined. + + + + + + + specifies the version number of the Unicode Standard in which this property name is defined. +Suggested values include: 1] 1.0.1; 2] 1.1; 3] 2.0; 4] 2.1; 5] 3.0; 6] 3.1; 7] 3.2; 8] 4.0; 9] 4.1; 10] 5.0; 11] 5.1; 12] 5.2; 13] 6.0; 14] 6.1; 15] 6.2; 16] 6.3; 17] 7.0; 18] 8.0; 19] 9.0; 20] 10.0; 21] 11.0; 22] 12.0; 23] 12.1; 24] unassigned + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (character or glyph) represents a glyph, or a non-standard character. [5. Characters, Glyphs, and Writing Modes] + + + + + + + points to a description of the character or glyph intended. + + + + + + + (character declarations) provides information about nonstandard characters and glyphs. [5.2. Markup Constructs for Representation of Characters and Glyphs] + + + + + + + + + + + + + + + (character) provides descriptive information about a character. [5.2. Markup Constructs for Representation of Characters and Glyphs] + + + + + + + + + + + + + + + + + + + + (character glyph) provides descriptive information about a character glyph. [5.2. Markup Constructs for Representation of Characters and Glyphs] + + + + + + + + + + + + + + + + + + + + (locally defined property) provides a locally defined character (or glyph) property. [5.2.1. Character Properties] + + + + + + + + + + + + + + + + + + + (unihan property) holds the name and value of a normative or informative Unihan character (or glyph) property as part of its attributes. [5.2.1. 
Character Properties] + + + + + + + specifies the normalized name of a unicode han database (Unihan) property; specifies the value of a named Unihan property + + + + + + + + + + + + (unicode property) provides a Unicode property for a character (or glyph). [5.2.1. Character Properties] + + + + + + + specifies the normalized name of a Unicode property; specifies the value of a named Unicode property. + + + + + + + (value) contains a single value for some property, attribute, or other analysis. [5.2.1. Character Properties] + + + + + + + + + + + + (Unicode property name) contains the name of a registered Unicode normative or informative property. [5.2.1. Character Properties] + + + + + + specifies the version number of the Unicode Standard in which this property name is defined. + + + + + + + + + + + + (locally-defined property name) contains a locally defined name for some property. [5.2.1. Character Properties] + + + + + + + + (character glyph name) contains the name of a glyph, expressed following Unicode conventions for character names. [5.2. Markup Constructs for Representation of Characters and Glyphs] + + + + + + + + (character property) provides a name and value for some property of the parent character or glyph. [5.2. Markup Constructs for Representation of Characters and Glyphs] + + + + + + + + + + + + + + + + (character name) contains the name of a character, expressed following Unicode conventions. [5.2. Markup Constructs for Representation of Characters and Glyphs] + + + + + + + + + + + + + + (metrical structure, conventional) contains a user-specified encoding for the conventional metrical structure of the element. + + + + + + + (metrical structure, realized) contains a user-specified encoding for the actual realization of the conventional metrical structure applicable to the element. + + + + + + + (rhyme scheme) specifies the rhyme scheme applicable to a group of verse lines. 
+ + + + + + + + + + (enjambement) indicates that the end of a verse line is marked by enjambement. +Sample values include: 1] no; 2] yes; 3] weak; 4] strong + + + + + + + + + + + + + + + + + + + + + + indicates whether the notation conveys the abstract metrical form, its actual prosodic realization, or the rhyme scheme, or some combination thereof. + + + + + + + + + + + + + + + + + + + + + + + + + + + + (regular expression pattern) specifies a regular expression defining any value that is legal for this notation. + + + + + + + + + + + + specifies the character or character sequence being documented. + + + + + + + + + + + + + + + + + + + + + + + + marks the point at which a metrical line may be divided. [6.2. Components of the Verse Line] + + + + + + + + marks the rhyming part of a metrical line. [6.5. Rhyme] + + + + + + + + + provides a label (usually a single letter) to identify which part of a rhyme scheme this rhyming string instantiates. + + + + + + + + + + + + + + (setting) contains a description of the setting, time, locale, appearance, etc., of the action of a play, typically found in the front matter of a printed performance text (not a stage direction). [7.1. Front and Back Matter + ] + + + + + + + + + + + + + + + + + + (prologue) contains the prologue to a drama, typically spoken by an actor out of character, possibly in association with a particular performance or venue. [7.1.2. Prologues and Epilogues 7.1. Front and Back Matter + ] + + + + + + + + + + + + + + + + + + + + + + (epilogue) contains the epilogue to a drama, typically spoken by an actor out of character, possibly in association with a particular performance or venue. [7.1.2. Prologues and Epilogues 7.1. Front and Back Matter + ] + + + + + + + + + + + + + + + + + + + + + + (performance) contains a section of front or back matter describing how a dramatic piece is to be performed in general or how it was performed on some specific occasion. [7.1.3. Records of Performances 7.1. 
Front and Back Matter + ] + + + + + + + + + + + + + + + + + + + + + + (cast list) contains a single cast list or dramatis personae. [7.1.4. Cast Lists 7.1. Front and Back Matter + ] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cast list item) contains a single entry within a cast list, describing either a single role or a list of non-speaking roles. [7.1.4. Cast Lists] + + + + + + + + + + + + + characterizes the cast item. + + + + + + the item describes a single role. + + + + + the item describes a list of non-speaking roles. + + + + + + + + + + (role) contains the name of a dramatic role, as given in a cast list. [7.1.4. Cast Lists] + + + + + + + + + + + + (role description) describes a character's role in a drama. [7.1.4. Cast Lists] + + + + + + + + + + + + contains the name of an actor appearing within a cast list. [7.1.4. Cast Lists] + + + + + + + + + + + + + (speech group) contains a group of speeches or songs in a performance text presented in a source as constituting a single unit or number. [7.2.3. Grouped Speeches] + + + + + + + + + + + + + + + + + + (movement) marks the actual movement of one or more characters. [7.2.4. Stage Directions] + + + + + + + + characterizes the movement, for example as an entrance or exit. +Suggested values include: 1] entrance; 2] exit; 3] onStage + + + + + + + + character is entering the stage. + + + + + + + + + character is exiting the stage. + + + + + + + + + character moves on stage + + + + + + + + + + + + + + + specifies the direction of a stage movement. +Sample values include: 1] L (left); 2] R (right); 3] C (center) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (view) describes the visual context of some part of a screen play in terms of what the spectator sees, generally independent of any dialogue. [7.3.1. Technical Information 7.3. 
Other Types of Performance Text] + + + + + + + + + + + + (camera) describes a particular camera angle or viewpoint in a screen play. [7.3.1. Technical Information 7.3. Other Types of Performance Text] + + + + + + + + + + + + + (sound) describes a sound effect or musical sequence specified within a screen play or radio script. [7.3.1. Technical Information 7.3. Other Types of Performance Text] + + + + + + + + + categorizes the sound in some respect, e.g. as music, special effect, etc. + + + + + + + + + + indicates whether the sound overlaps the surrounding speeches or interrupts them. + + + + + + + + + + + + + + + + + + + + + + + + + + + (caption) contains the text of a caption or other text displayed as part of a film script or screenplay. [7.3.1. Technical Information 7.3. Other Types of Performance Text] + + + + + + + + + + + + (technical stage direction) describes a special-purpose stage direction that is not meant for the actors. [7.3.1. Technical Information] + + + + + + + + + categorizes the technical stage direction. +Suggested values include: 1] light; 2] sound; 3] prop; 4] block + + + + + + + + a lighting cue + + + + + + + + + a sound cue + + + + + + + + + a prop cue + + + + + + + + + a blocking instruction + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (script statement) contains a citation giving details of the script used for a spoken text. [8.2. Documenting the Source of Transcribed Speech 2.2.7. The Source Description 15.3.2. Declarable Elements] + + + + + + + + + + + + + (recording statement) describes a set of recordings used as the basis for transcription of a spoken text. [8.2. Documenting the Source of Transcribed Speech 2.2.7. 
The Source Description] + + + + + + + + + + + + (recording event) provides details of an audio or video recording event used as the source of a spoken text, either directly or from a public broadcast. [8.2. Documenting the Source of Transcribed Speech 15.3.2. Declarable Elements] + + + + + + + + + + + + + the kind of recording. + + + + + + audio recording + + + + + audio and video recording + + + + + + + + + + (equipment) provides technical details of the equipment and media used for an audio or video recording used as the source for a spoken text. [8.2. Documenting the Source of Transcribed Speech 15.3.2. Declarable Elements] + + + + + + + + + + (broadcast) describes a broadcast used as the source of a spoken text. [8.2. Documenting the Source of Transcribed Speech 15.3.2. Declarable Elements] + + + + + + + + + + + + + + describes the set of transcription conventions used, particularly for spoken material. [8.2. Documenting the Source of Transcribed Speech] + + + + + + + + + + + supplies an identifier for the encoding convention, independent of any version number. + + + + + supplies a version number for the encoding conventions used, if any. + + + + + + + + + + + + (utterance) contains a stretch of speech usually preceded and followed by silence or by a change of speaker. [8.3.1. Utterances] + + + + + + + + + + + + (transition) indicates the nature of the transition between this utterance and the previous one. + + + + + + this utterance begins without unusual pause or rapidity. + + + + + this utterance begins with a markedly shorter pause than normal. + + + + + this utterance begins before the previous one has finished. + + + + + this utterance begins after a noticeable pause. + + + + + + + + + + + + (pause) marks a pause either between or within utterances. [8.3.2. Pausing] + + + + + + + + + + + (vocal) marks any vocalized but not necessarily lexical phenomenon, for example voiced pauses, non-lexical backchannels, etc. [8.3.3. 
Vocal, Kinesic, Incident] + + + + + + + + + + indicates whether or not the phenomenon is repeated. + + + + + + + + + + + + + + + + + + + + + + + + + (kinesic) marks any communicative phenomenon, not necessarily vocalized, for example a gesture, frown, etc. [8.3.3. Vocal, Kinesic, Incident] + + + + + + + + + + indicates whether or not the phenomenon is repeated. + + + + + + + + + + + + + + + + + + + + + + + + + (incident) marks any phenomenon or occurrence, not necessarily vocalized or communicative, for example incidental noises or other events affecting communication. [8.3.3. Vocal, Kinesic, Incident] + + + + + + + + + + + + (writing) contains a passage of written text revealed to participants in the course of a spoken text. [8.3.4. Writing] + + + + + + + + + + + indicates whether the writing is revealed all at once or gradually. + + + + + + + + + + + + + + + + + + + + + + + + + + + (shift) marks the point at which some paralinguistic feature of a series of utterances by any one speaker changes. [8.3.6. Shifts] + + + + + + + a paralinguistic feature. +Suggested values include: 1] tempo; 2] loud; 3] pitch; 4] tension; 5] rhythm; 6] voice + + + + + + + + speed of utterance. + + + + + + + + + loudness. + + + + + + + + + pitch range. + + + + + + + + + tension or stress pattern. + + + + + + + + + rhythmic qualities. + + + + + + + + + voice quality. + + + + + + + + + + + + + + + specifies the new state of the paralinguistic feature specified. + + + + + + + + + + + + groups together various annotations, e.g. for parallel interpretations of a spoken segment. [8.4.6. Analytic Coding] + + + + + + + + + + + + + + + + + + + + (normalized) provides the normalized/standardized form of information present in the source text in a non-normalized form + + + + + + + (original) gives the original string or is the empty string when the element does not appear in the source text. 
+ + + + + + + + + + + + + + + provides a lemma (base form) for the word, typically uninflected and serving both as an identifier (e.g. in dictionary contexts, as a headword), and as a basis for potential inflections. + + + + + + + provides a pointer to a definition of the lemma for the word, for example in an online lexicon. + + + + + + + (part of speech) indicates the part of speech assigned to a token (i.e. information on whether it is a noun, adjective, or verb), usually according to some official reference vocabulary (e.g. for German: STTS, for English: CLAWS, for Polish: NKJP, etc.). + + + + + + + + + + when present, it provides information on whether the token in question is adjacent to another, and if so, on which side. The definition of this attribute is adapted from ISO MAF (Morpho-syntactic Annotation Framework), ISO 24611:2012. + + + + + + (the token is not adjacent to another) + + + + + (there is no whitespace on the left side of the token) + + + + + (there is no whitespace on the right side of the token) + + + + + (there is no whitespace on either side of the token) + + + + + (the token overlaps with another; other devices (specifying the extent and the area of overlap) are needed to more precisely locate this token in the character stream) + + + + + + + + + + + + + + + + + + + + + + + + (s-unit) contains a sentence-like division of a text. [17.1. Linguistic Segment Categories 8.4.1. Segmentation] + + + + + + + + + + + + + + + (clause) represents a grammatical clause. [17.1. Linguistic Segment Categories] + + + + + + + + + + + + + + + (phrase) represents a grammatical phrase. [17.1. Linguistic Segment Categories] + + + + + + + + + + + + + + + (word) represents a grammatical (not necessarily orthographic) word. [17.1. Linguistic Segment Categories 17.4.2. Lightweight Linguistic Annotation] + + + + + + + + + + + + + + + + + + + + + + + + (morpheme) represents a grammatical morpheme. [17.1. 
Linguistic Segment Categories] + + + + + + + + + + + + + + + + + supplies the morpheme's base form. + + + + + + + + + + + + (character) represents a character. [17.1. Linguistic Segment Categories] + + + + + + + + + + + + + + + (punctuation character) contains a character or string of characters regarded as constituting a single punctuation mark. [17.1.2. Below the Word Level 17.4.2. Lightweight Linguistic Annotation] + + + + + + + + + + + + + + indicates the extent to which this punctuation mark conventionally separates words or phrases + + + + + + the punctuation mark is a word separator + + + + + the punctuation mark is not a word separator + + + + + the punctuation mark may or may not be a word separator + + + + + + + + provides a name for the kind of unit delimited by this punctuation mark. + + + + + + + + + + indicates whether this punctuation mark precedes or follows the unit it delimits. + + + + + + + associates an interpretative annotation directly with a span of text. [17.3. Spans and Interpretations] + + + + + + + + + + + indicates what kind of phenomenon is being noted in the passage. +Sample values include: 1] image; 2] character; 3] theme; 4] allusion + + + + + + + + + + + gives the identifier of the node which is the end-point of the span of text being annotated. + + + + + + + + + (span group) collects together span tags. [17.3. Spans and Interpretations] + + + + + + + + + + + + indicates what kind of phenomenon is being noted in the passage. +Sample values include: 1] image; 2] character; 3] theme; 4] allusion + + + + + + + + + + + + (interpretation) summarizes a specific interpretative annotation which can be linked to a span of text. [17.3. Spans and Interpretations] + + + + + + + + + + + + + indicates what kind of phenomenon is being noted in the passage. 
+Sample values include: 1] image; 2] character; 3] theme; 4] allusion + + + + + + + + + + + + (interpretation group) collects together a set of related interpretations which share responsibility or type. [17.3. Spans and Interpretations] + + + + + + + + + + + + indicates what kind of phenomenon is being noted in the passage. +Sample values include: 1] image; 2] character; 3] theme; 4] allusion + + + + + + + + + + + + + + + + + + + + + + + + indicates type of entry, in dictionaries with multiple types. +Suggested values include: 1] main; 2] hom (homograph); 3] xref (cross reference); 4] affix; 5] abbr (abbreviation); 6] supplemental; 7] foreign + + + + + + + + a main entry (default). + + + + + + + + + (homograph) groups information relating to one homograph within an entry. + + + + + + + + + (cross reference) a reduced entry whose only function is to point to another main entry (e.g. for forms of an irregular verb or for variant spellings: was pointing to be, or esthete to aesthete). + + + + + + + + + an entry for a prefix, infix, or suffix. + + + + + + + + + (abbreviation) an entry for an abbreviation. + + + + + + + + + a supplemental entry (for use in dictionaries which issue supplements to their main work in which they include updated information about entries). + + + + + + + + + an entry for a foreign word in a monolingual dictionary. + + + + + + + + + + + + + + + + + + + + + + + + + + + (expand) gives an expanded form of information presented more concisely in the dictionary + + + + + + + (split) gives the list of split values for a merged form + + + + + + + (value) gives a value which lacks any realization in the printed source text. + + + + + + + + + + (merged into) gives a reference to another element, where the original appears as a merged form. 
+ + + + + + + (optional) indicates whether the element is optional or not + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (super entry) groups a sequence of entries within any kind of lexical resource, such as a dictionary or lexicon which function as a single unit, for example a set of homographs. [9.1. Dictionary Body and Overall Structure] + + + + + + + + + + + + + + + + + (entry) contains a single structured entry in any kind of lexical resource, such as a dictionary or lexicon. [9.1. Dictionary Body and Overall Structure 9.2. The Structure of Dictionary Entries] + + + + + + + + + + + + + + + + + + (unstructured entry) contains a single unstructured entry in any kind of lexical resource, such as a dictionary or lexicon. [9.1. Dictionary Body and Overall Structure 9.2. The Structure of Dictionary Entries] + + + + + + + + + + + + + + + + + + + (homograph) groups information relating to one homograph within an entry. [9.2. The Structure of Dictionary Entries] + + + + + + + + + + + + + + + groups together all information relating to one word sense in a dictionary entry, for example definitions, examples, and translation equivalents. [9.2. The Structure of Dictionary Entries] + + + + + + + + + + + + + + gives the nesting depth of this sense. + + + + + + + (dictionary scrap) encloses a part of a dictionary entry in which other phrase-level dictionary elements are freely combined. [9.1. Dictionary Body and Overall Structure 9.2. The Structure of Dictionary Entries] + + + + + + + + + + + + + + + + (form information group) groups all the information on the written and spoken forms of one headword. [9.3.1. 
Information on Written and Spoken Forms] + + + + + + + + + + + + + + + classifies form as simple, compound, etc. +Suggested values include: 1] simple; 2] lemma; 3] variant; 4] compound; 5] derivative; 6] inflected; 7] phrase + + + + + + + + single free lexical item + + + + + + + + + the headword itself + + + + + + + + + a variant form + + + + + + + + + word formed from simple lexical items + + + + + + + + + word derived from headword + + + + + + + + + word in other than usual dictionary form + + + + + + + + + multiple-word lexical item + + + + + + + + + + + + + + + + + (orthographic form) gives the orthographic form of a dictionary headword. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + gives the type of spelling. + + + + + + + + + + + + + + (pronunciation) contains the pronunciation(s) of the word. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + + + + (hyphenation) contains a hyphenated form of a dictionary headword, or hyphenation information in some other form. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + + (syllabification) contains the syllabification of the headword. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + + (stress) contains the stress pattern for a dictionary headword, if given separately. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + (grammatical information) within an entry in a dictionary or a terminological data file, contains grammatical information relating to a term, word, or form. [9.3.2. Grammatical Information] + + + + + + + + + + + + + + + + + + + + + (gender) identifies the morphological gender of a lexical item, as given in the dictionary. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + (number) indicates grammatical number associated with a form, as given in a dictionary. [9.3.1. Information on Written and Spoken Forms 9.3.2. 
Grammatical Information] + + + + + + + + + + + + + (case) contains grammatical case information given by a dictionary for a given form. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + (person) contains an indication of the grammatical person (1st, 2nd, 3rd, etc.) associated with a given inflected form in a dictionary. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + (tense) indicates the grammatical tense associated with a given inflected form in a dictionary. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + (mood) contains information about the grammatical mood of verbs (e.g. indicative, subjunctive, imperative). [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + + + + (inflectional class) indicates the inflectional class associated with a lexical item. [9.3.1. Information on Written and Spoken Forms] + + + + + + + + + + indicates the type of indicator used to specify the inflection class, when it is necessary to distinguish between the usual abbreviated indications (e.g. inv) and other kinds of indicators, such as special codes referring to conjugation patterns, etc. +Sample values include: 1] abbrev; 2] verbTable + + + + + + + + + + + + + + + + + + + + + + + + + + + + (part of speech) indicates the part of speech assigned to a dictionary headword such as noun, verb, or adjective. [9.3.2. Grammatical Information] + + + + + + + + + + + + + (subcategorization) contains subcategorization information (transitive/intransitive, countable/non-countable, etc.) [9.3.2. Grammatical Information] + + + + + + + + + + + + + (collocate) contains any sequence of words that co-occur with the headword with significant frequency. [9.3.2. Grammatical Information] + + + + + + + + + + + + + + (definition) contains definition text in a dictionary entry. [9.3.3.1. Definitions] + + + + + + + + + + + + + (etymology) encloses the etymological information in a dictionary entry. [9.3.4. 
Etymological Information] + + + + + + + + + + + + + + + + + + + + + + (language name) contains the name of a language mentioned in etymological or other linguistic discussion. [9.3.4. Etymological Information] + + + + + + + + + + + + + (usage) contains usage information in a dictionary entry. [9.3.5.2. Usage Information and Other Labels] + + + + + + + + + + classifies the usage information using any convenient typology. +Sample values include: 1] geo (geographic); 2] time; 3] dom (domain); 4] register (register); 5] style; 6] plev (preference level); 7] lang (language); 8] gram (grammatical); 9] syn (synonym); 10] hyper (hypernym); 11] colloc (collocation); 12] comp (complement); 13] obj (object); 14] subj (subject); 15] verb; 16] hint + + + + + + + + + + + + + + (label) contains a label for a form, example, translation, or other piece of information, e.g. abbreviation for, contraction of, literally, approximately, synonyms:, etc. [9.3.1. Information on Written and Spoken Forms 9.3.3.2. Translation Equivalents 9.3.5.3. Cross-References to Other Entries] + + + + + + + + + + classifies the label using any convenient typology. + + + + + + + + + + + + + + (cross-reference phrase) contains a phrase, sentence, or icon referring the reader to some other location in this or another text. [9.3.5.3. Cross-References to Other Entries] + + + + + + + + + + + + + + + + indicates the type of cross reference, using any convenient typology. +Sample values include: 1] syn (synonym); 2] etym (etymological); 3] cf (compare or consult); 4] illus (illustration) + + + + + + + + + + + + (related entry) contains a dictionary entry for a lexical item related to the headword, such as a compound phrase or derived form, embedded inside a larger entry. [9.3.6. Related Entries] + + + + + + + + + + + + + + + + + (orthographic-form reference) in a dictionary example, indicates a reference to the orthographic form(s) of the headword. [9.4. 
Headword and Pronunciation References] + + + + + + + + + + + + + + indicates the kind of typographic modification made to the headword in the reference. +Sample values include: 1] cap (capital); 2] noHyph (no hyphen) + + + + + + + + + + + + (pronunciation reference) in a dictionary example, indicates a reference to the pronunciation(s) of the headword. [9.4. Headword and Pronunciation References] + + + + + + + + + + + + + + + + + + + indicates whether the passage being quoted is defective, i.e. incomplete through loss or damage. + + + + + + + + + + + + + + + + + + + + + + + + + + + + identifies the text types or classifications applicable to this item by pointing to other elements or resources defining the classification concerned. + + + + + + + + + + + + + + (manuscript description) contains a description of a single identifiable manuscript or other text-bearing object such as early printed books. [10.1. Overview] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (catchwords) describes the system used to ensure correct ordering of the quires or similar making up a codex, incunable, or other object typically by means of annotations at the foot of the page. [10.3.7. Catchwords, Signatures, Secundo Folio] + + + + + + + + + + + + (dimensions) contains a dimensional specification. [10.3.4. Dimensions] + + + + + + + + + + + + indicates which aspect of the object is being measured. +Sample values include: 1] leaves; 2] ruled; 3] pricked; 4] written; 5] miniatures; 6] binding; 7] box + + + + + + + + + + + + contains any single measurement forming part of a dimensional specification of some sort. [10.3.4. Dimensions] + + + + + + + + + + + + + + (height) contains a measurement measured along the axis at a right angle to the bottom of the object. [10.3.4. Dimensions] + + + + + + + + + + + + + + + + + + + + + + + (width) contains a measurement of an object along the axis parallel to its bottom, e.g. perpendicular to the spine of a book or codex. [10.3.4. 
Dimensions] + + + + + + + + + + + + + (heraldry) contains a heraldic formula or phrase, typically found as part of a blazon, coat of arms, etc. [10.3.8. Heraldry] + + + + + + + + + + + + (locus) defines a location within a manuscript, manuscript part, or other object typically as a (possibly discontinuous) sequence of folio references. [10.3.5. References to Locations within a Manuscript] + + + + + + + + + + + + + + (from) specifies the starting point of the location in a normalized form, typically a page number. + + + + + + + + + + (to) specifies the end-point of the location in a normalized form, typically as a page number. + + + + + + + + + + + + (locus group) groups a number of locations which together form a distinct but discontinuous item within a manuscript, manuscript part, or other object. [10.3.5. References to Locations within a Manuscript] + + + + + + + + + + + + (material) contains a word or phrase describing the material of which the object being described is composed. [10.3.2. Material and Object Type] + + + + + + + + + + describes the function or use of the material in relation to the object as a whole. +Sample values include: 1] binding; 2] endband; 3] slipcase; 4] support; 5] tie + + + + + + + + + + identifies one or more elements to which the metamark applies. + + + + + + + + + + + + + + + + + (object type) contains a word or phrase describing the type of object being referred to. [10.3.2. Material and Object Type] + + + + + + + + + + + + + (origin date) contains any form of date, used to identify the date of origin for a manuscript, manuscript part, or other object. [10.3.1. Origination] + + + + + + + + + + + + + + + + + (origin place) contains any form of place name, used to identify the place of origin for a manuscript, manuscript part, or other object. [10.3.1. 
Origination] + + + + + + + + + + + + + + + + (second folio) marks the word or words taken from a fixed point in a codex (typically the beginning of the second leaf) in order to provide a unique identifier for it. [10.3.7. Catchwords, Signatures, Secundo Folio] + + + + + + + + + + + + (signatures) contains discussion of the leaf or quire signatures found within a codex or similar object. [10.3.7. Catchwords, Signatures, Secundo Folio] + + + + + + + + + + + + (stamp) contains a word or phrase describing a stamp or similar device. [10.3.3. Watermarks and Stamps] + + + + + + + + + + + + + + (watermark) contains a word or phrase describing a watermark or similar device. [10.3.3. Watermarks and Stamps] + + + + + + + + + + + + (manuscript identifier) contains the information required to identify the manuscript or similar object being described. [10.4. The Manuscript Identifier] + + + + + + + + + + + + + + + + + + + + + + + + + + + + (institution) contains the name of an organization such as a university or library, with which a manuscript or other object is identified, generally its holding institution. [10.4. The Manuscript Identifier] + + + + + + + + + + + + + (repository) contains the name of a repository within which manuscripts or other objects are stored, possibly forming part of an institution. [10.4. The Manuscript Identifier] + + + + + + + + + + + + + (collection) contains the name of a collection of manuscripts or other objects, not necessarily located within a single repository. [10.4. The Manuscript Identifier] + + + + + + + + + + + + + + (alternative identifier) contains an alternative or former structured identifier used for a manuscript or other object, such as a former catalogue number. [10.4. The Manuscript Identifier] + + + + + + + + + + + + + + + + + + + + + + + + (alternative name) contains any form of unstructured alternative name used for a manuscript or other object, such as an ocellus nominum, or nickname. [10.4. 
The Manuscript Identifier] + + + + + + + + + + + + + + (colophon) contains the colophon of an item: that is, a statement providing information regarding the date, place, agency, or reason for production of the manuscript or other object. [10.6.1. The msItem and msItemStruct Elements] + + + + + + + + + + + + + (explicit) contains the explicit of a item, that is, the closing words of the text proper, exclusive of any rubric or colophon which might follow it. [10.6.1. The msItem and msItemStruct Elements] + + + + + + + + + + + + + + (filiation) contains information concerning the manuscript or other object's filiation, i.e. its relationship to other surviving manuscripts or other objects of the same text or contents, its protographs, antigraphs and apographs. [10.6.1. The msItem and msItemStruct Elements] + + + + + + + + + + + + + (final rubric) contains the string of words that denotes the end of a text division, often with an assertion as to its author and title, usually set off from the text itself by red ink, by a different size or type of script, or by some other such visual device. [10.6.1. The msItem and msItemStruct Elements] + + + + + + + + + + + + + + contains the incipit of a manuscript or similar object item, that is the opening words of the text proper, exclusive of any rubric which might precede it, of sufficient length to identify the work uniquely; such incipits were, in former times, frequently used a means of reference to a work, in place of a title. [10.6.1. The msItem and msItemStruct Elements] + + + + + + + + + + + + + + (manuscript contents) describes the intellectual content of a manuscript, manuscript part, or other object either as a series of paragraphs or as a series of structured manuscript items. [10.6. Intellectual Content] + + + + + + + + + + + + + + + + + + + + + + (manuscript item) describes an individual work or item within the intellectual content of a manuscript, manuscript part, or other object. [10.6.1. 
The msItem and msItemStruct Elements] + + + + + + + + + + + + + + + + + + + + + + + + (structured manuscript item) contains a structured description for an individual work or item within the intellectual content of a manuscript, manuscript part, or other object. [10.6.1. The msItem and msItemStruct Elements] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (rubric) contains the text of any rubric or heading attached to a particular manuscript item, that is, a string of words through which a manuscript or other object signals the beginning of a text division, often with an assertion as to its author and title, which is in some way set off from the text itself, typically in red ink, or by use of different size or type of script, or some other such visual device. [10.6.1. The msItem and msItemStruct Elements] + + + + + + + + + + + + + + contains an overview of the available information concerning some aspect of an item or object (for example, its intellectual content, history, layout, typography etc.) as a complement or alternative to the more detailed information carried by more specific elements. [10.6. Intellectual Content] + + + + + + + + + + + + (physical description) contains a full physical description of a manuscript, manuscript part, or other object optionally subdivided using more specialized elements from the model.physDescPart class. [10.7. Physical Description] + + + + + + + + + + + + + + + + + + + + + (object description) contains a description of the physical components making up the object which is being described. [10.7.1. Object Description] + + + + + + + + + + + + + (form) a short project-specific name identifying the physical form of the carrier, for example as a codex, roll, fragment, partial leaf, cutting etc. + + + + + + + + + + + + (support description) groups elements describing the physical support for the written part of a manuscript or other object. [10.7.1. 
Object Description] + + + + + + + + + + + + + + + + (material) a short project-defined name for the material composing the majority of the support +Suggested values include: 1] paper; 2] parch (parchment); 3] mixed + + + + + + + + + + + + + + + + + (parchment) + + + + + + + + + + + + + + + + + + + + + + + + + + (support) contains a description of the materials etc. which make up the physical support for the written part of a manuscript or other object. [10.7.1. Object Description] + + + + + + + + + + + + (collation) contains a description of how the leaves, bifolia, or similar objects are physically arranged. [10.7.1. Object Description] + + + + + + + + + + + + (foliation) describes the numbering system or systems used to count the leaves or pages in a codex or similar object. [10.7.1.4. Foliation] + + + + + + + + + + + + (condition) contains a description of the physical condition of the manuscript or object. [10.7.1.5. Condition] + + + + + + + + + + + + (layout description) collects the set of layout descriptions applicable to a manuscript or other object. [10.7.2. Writing, Decoration, and Other Notations] + + + + + + + + + + + + + + + (layout) describes how text is laid out on the page or surface of the object, including information about any ruling, pricking, or other evidence of page-preparation techniques. [10.7.2. Writing, Decoration, and Other Notations] + + + + + + + + (columns) specifies the number of columns per page + + + + + + + + + + + + + + + + + + (textual streams) indicates the number of streams per page, each of which contains an independent textual stream + + + + + + + + + + + + + + + + + + (ruled lines) specifies the number of ruled lines per column + + + + + + + + + + + + + + + + + + (written lines) specifies the number of written lines per column + + + + + + + + + + + + + + + + + + + + + + (description of hands) contains a description of all the different hands used in a manuscript or other object. [10.7.2. 
Writing, Decoration, and Other Notations] + + + + + + + + + + + + + (hands) specifies the number of distinct hands identified within the manuscript + + + + + + + (typeface description) contains a description of the typefaces or other aspects of the printing of an incunable or other printed source. [10.7.2.1. Writing] + + + + + + + + + + + + + + + (typographic note) describes a particular font or other significant typographic feature distinguished within the description of a printed resource. [10.7.2. Writing, Decoration, and Other Notations] + + + + + + + + + + + + + contains a description of the scripts used in a manuscript or other object. [10.7.2.1. Writing] + + + + + + + + + + + + + + + (music notation) contains description of type of musical notation. [10.7.2. Writing, Decoration, and Other Notations] + + + + + + + + + + + + + + + + + + + + + + + + (note on decoration) contains a note describing either a decorative component of a manuscript or other object, or a fairly homogenous class of such components. [10.7.3. Bindings, Seals, and Additional Material] + + + + + + + + + + + + + (additions) contains a description of any significant additions found within a manuscript or other object, such as marginalia or other annotations. [10.7.2. Writing, Decoration, and Other Notations] + + + + + + + + + + + + + + + + + + + + + + + + + (binding) contains a description of one binding, i.e. type of covering, boards, etc. applied to a manuscript or other object. [10.7.3.1. Binding Descriptions] + + + + + + + + + + + + (contemporary) specifies whether or not the binding is contemporary with the majority of its contents + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (seal) contains a description of one seal or similar applied to the object described [10.7.3.2. 
Seals] + + + + + + + + + + + + (contemporary) specifies whether or not the seal is contemporary with the item to which it is affixed + + + + + + + + + + + + + + + + + + + + + + + + + (accompanying material) contains details of any significant additional material which may be closely associated with the manuscript or object being described, such as non-contemporaneous documents or fragments bound in with it at some earlier historical period. [10.7.3.3. Accompanying Material] + + + + + + + + + + + + + (history) groups elements describing the full history of a manuscript, manuscript part, or other object. [10.8. History] + + + + + + + + + + + + + + + + + (origin) contains any descriptive or other information concerning the origin of a manuscript, manuscript part, or other object. [10.8. History] + + + + + + + + + + + + + + (provenance) contains any descriptive or other information concerning a single identifiable episode during the history of a manuscript, manuscript part, or other object after its creation but before its acquisition. [10.8. History] + + + + + + + + + + + + + + (acquisition) contains any descriptive or other information concerning the process by which a manuscript or manuscript part or other object entered the holding institution. [10.8. History] + + + + + + + + + + + + + (additional) groups additional information, combining bibliographic information about a manuscript or other object, or surrogate copies of it, with curatorial or administrative information. [10.9. Additional Information] + + + + + + + + + + + + + (administrative information) contains information about the present custody and availability of the manuscript or other object, and also about the record description itself. [10.9.1. Administrative Information] + + + + + + + + + + + + + + (recorded history) provides information about the source and revision status of the parent manuscript or object description itself. [10.9.1. 
Administrative Information] + + + + + + + + + + + + + + + (source) describes the original source for the information contained within a manuscript or object description. [10.9.1.1. Record History] + + + + + + + + + + + + (custodial history) contains a description of a manuscript or other object's custodial history, either as running prose or as a series of dated custodial events. [10.9.1.2. Availability and Custodial History] + + + + + + + + + + + + (custodial event) describes a single event during the custodial history of a manuscript or other object. [10.9.1.2. Availability and Custodial History] + + + + + + + + + + + + + + (surrogates) contains information about any representations of the manuscript or other object being described which may exist in the holding institution or elsewhere. [10.9. Additional Information] + + + + + + + + + + + + (manuscript part) contains information about an originally distinct manuscript or part of a manuscript, which is now part of a composite manuscript. [10.10. Manuscript Parts] + + + + + + + + + + + + + + + + + + + + + + + (manuscript fragment) contains information about a fragment described in relation to a prior context, typically as a description of a virtual reconstruction of a manuscript or other object whose fragments were catalogued separately [10.11. Manuscript Fragments] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (facsimile) points to one or more images, portions of an image, or surfaces which correspond to the current element. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + indicates the element within a transcription of the text containing at least the start of the writing represented by this zone or surface. + + + + + + + gives the x coordinate value for the upper left corner of a rectangular space. + + + + + + + + + + + + + + + + gives the y coordinate value for the upper left corner of a rectangular space. 
+ + + + + + + + + + + + + + + + gives the x coordinate value for the lower right corner of a rectangular space. + + + + + + + + + + + + + + + + gives the y coordinate value for the lower right corner of a rectangular space. + + + + + + + + + + + + + + + + identifies a two dimensional area by means of a series of pairs of numbers, each of which gives the x,y coordinates of a point on a line enclosing the area. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contains a representation of some written source in the form of a set of images rather than as transcribed or encoded text. [11.1. Digital Facsimiles] + + + + + + + + + + + + + + + + + + contains a transcription or other representation of a single source document potentially forming part of a dossier génétique or collection of sources. [11.1. Digital Facsimiles 11.2.2. Embedded Transcription] + + + + + + + + + + + + + + + defines a written surface as a two-dimensional coordinate space, optionally grouping one or more graphic representations of that space, zones of interest within that space, and transcriptions of the writing within them. [11.1. Digital Facsimiles 11.2.2. Embedded Transcription] + + + + + + + + + + + + + + + + + + + + + + + + + + describes the method by which this surface is or was connected to the main surface +Sample values include: 1] glued; 2] pinned; 3] sewn + + + + + + + + + + indicates whether the surface is attached and folded in such a way as to provide two writing surfaces + + + + + + + defines any kind of useful grouping of written surfaces, for example the recto and verso of a single leaf, which the encoder wishes to treat as a single unit. [11.1. Digital Facsimiles] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + identifies a line within the container or bounding box specified by the parent element by means of a series of two or more pairs of numbers, each of which gives the x,y coordinates of a point on the line. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (damage) contains an area of damage to the text witness. [11.3.3.1. Damage, Illegibility, and Supplied Text] + + + + + + + + + + + + + + (damaged span of text) marks the beginning of a longer sequence of text which is damaged in some way but still legible. [11.3.3.1. Damage, Illegibility, and Supplied Text] + + + + + + + + + + + (deleted span of text) marks the beginning of a longer sequence of text deleted, marked as deleted, or otherwise signaled as superfluous or spurious by an author, scribe, annotator, or corrector. [11.3.1.4. Additions and Deletions] + + + + + + + + + + + + (editorial expansion) contains a sequence of letters added by an editor or transcriber when expanding an abbreviation. [11.3.1.2. Abbreviation and Expansion] + + + + + + + + + + + + + + (forme work) contains a running head (e.g. a header, footer), catchword, or similar material appearing on the current page. [11.6. Headers, Footers, and Similar Matter] + + + + + + + + + + + classifies the material encoded according to some useful typology. +Sample values include: 1] header; 2] footer; 3] pageNum (page number); 4] lineNum (line number); 5] sig (signature); 6] catch (catchword) + + + + + + + + + + + + + + + + + + + + + + (handwriting shift) marks the beginning of a sequence of text written in a new hand, or the beginning of a scribal stint. [11.3.2.1. Document Hands] + + + + + + + + + + (abbreviation marker) contains a sequence of letters or signs present in an abbreviation which are omitted or replaced in the expanded form of the abbreviation. [11.3.1.2. Abbreviation and Expansion] + + + + + + + + + + + + + + (restore) indicates restoration of text to an earlier state by cancellation of an editorial or authorial marking or instruction. [11.3.1.6. Cancellation of Deletions and Other Markings] + + + + + + + + + + + + + + + (space) indicates the location of a significant space in the text. [11.4.1. 
Space] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (responsible party) indicates the individual responsible for identifying and measuring the space + + + + + + + + + + + + + (dimension) indicates whether the space is horizontal or vertical. + + + + + + the space is horizontal. + + + + + the space is vertical. + + + + + + + + + + (substitution) groups one or more deletions (or surplus text) with one or more additions when the combination is to be regarded as a single intervention in the text. [11.3.1.5. Substitutions] + + + + + + + + + + + + + + + + (substitution join) identifies a series of possibly fragmented additions, deletions, or other revisions on a manuscript that combine to make up a single intervention in the text [11.3.1.5. Substitutions] + + + + + + + + + + + + + + + (supplied) signifies text supplied by the transcriber or editor for any reason; for example because the original cannot be read due to physical damage, or because of an obvious omission by the author or scribe. [11.3.3.1. Damage, Illegibility, and Supplied Text] + + + + + + + + + + one or more words indicating why the text has had to be supplied, e.g. overbinding, faded-ink, lost-folio, omitted-in-original. + + + + + + + + + + + + + + + + + + + + + + + (surplus) marks text present in the source which the editor believes to be superfluous or redundant. [11.3.3.1. Damage, Illegibility, and Supplied Text] + + + + + + + + + + one or more words indicating why this text is believed to be superfluous, e.g. repeated, interpolated etc. + + + + + + + + + + + + + + + + + + + + + + + (secluded text) Secluded. Marks text present in the source which the editor believes to be genuine but out of its original place (which is unknown). [11.3.1.7. Text Omitted from or Supplied in the Transcription] + + + + + + + + + + one or more words indicating why this text has been secluded, e.g. interpolated etc. 
+ + + + + + + + + + + + + + + + + + + + + + + contains the transcription of a topographic line in the source document [11.2.2. Embedded Transcription] + + + + + + + + + + + + + + + + supplies a list of transpositions, each of which is indicated at some point in a document typically by means of metamarks. [11.3.4.5. Transpositions] + + + + + + + + + + + + contains or describes any kind of graphic or written signal within a document the function of which is to determine how it should be read rather than forming part of the actual content of the document. [11.3.4.2. Metamarks] + + + + + + + + + + describes the function (for example status, insertion, deletion, transposition) of the metamark. + + + + + + + + + + identifies one or more elements to which the metamark applies. + + + + + + + + + + + + + + + + + represents any kind of modification identified within a single document. [11.3.4.1. Generic Modification] + + + + + + + + + + + + + + + + indicates one or more cancelled interventions in a document which have subsequently been marked as reaffirmed or repeated. [11.3.4.4. Confirmation, Cancellation, and Reinstatement of Modifications] + + + + + + + + + points to one or more elements representing the interventions which are being reasserted. + + + + + + + + + + + + + + + contains a sequence of writing which has been retraced, for example by over-inking, to clarify or fix it. [11.3.4.3. Fixation and Clarification] + + + + + + + + + + + + + + + describes a single textual transposition as an ordered list of at least two pointers specifying the order in which the elements indicated should be re-combined. [11.3.4.5. Transpositions] + + + + + + + + + + + + + indicates one or more marked-up interventions in a document which have subsequently been marked for cancellation. [11.3.4.4. Confirmation, Cancellation, and Reinstatement of Modifications] + + + + + + + + + points to one or more elements representing the interventions which are to be reverted or undone. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (witness or witnesses) contains a space-delimited list of one or more sigla indicating the witnesses to this reading beginning or ending at this point. + + + + + + + + + + + + + + + + + + (witness or witnesses) contains a space-delimited list of one or more pointers indicating the witnesses which attest to a given reading. + + + + + + + + + + + + + + + + + + + + + + + classifies the reading according to some useful typology. +Sample values include: 1] substantive (substantive); 2] orthographic (orthographic) + + + + + + + + + + + + classifies the cause for the variant reading, according to any appropriate typology of possible origins. +Sample values include: 1] homeoteleuton; 2] homeoarchy; 3] paleographicConfusion; 4] haplography; 5] dittography; 6] falseEmendation + + + + + + + + + + + + (variant sequence) provides a number indicating the position of this reading in a sequence, when there is reason to presume a sequence to the variants. + + + + + + + points to other readings that are required when adopting the current reading or lemma. + + + + + + + + + + + + + + (apparatus entry) contains one entry in a critical apparatus, with an optional lemma and usually one or more readings or notes on the relevant passage. [12.1.1. The Apparatus Entry] + + + + + + + + + + + + + + + + + classifies the variation contained in this element according to some convenient typology. + + + + + + + + + + identifies the beginning of the lemma in the base text. + + + + + identifies the endpoint of the lemma in the base text. + + + + + (location) indicates the location of the variation, when the location-referenced method of apparatus markup is used. + + + + + + + + + + + + + + + + + + + + + (list of apparatus entries) contains a list of apparatus entries. [12.2. Linking the Apparatus to the Text] + + + + + + + + + + + + + + + + + + + (lemma) contains the lemma, or base text, of a textual variation. [12.1. 
The Apparatus Entry, Readings, and Witnesses] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (reading) contains a single reading within a textual variation. [12.1. The Apparatus Entry, Readings, and Witnesses] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (reading group) within a textual variation, groups two or more readings perceived to have a genetic relationship or other affinity. [12.1. The Apparatus Entry, Readings, and Witnesses] + + + + + + + + + + + + + + + + + + + (witness detail) gives further information about a particular witness, or witnesses, to a particular reading. [12.1. The Apparatus Entry, Readings, and Witnesses] + + + + + + + + + + + + + + + describes the type of information given about the witness. + + + + + + + + + + (witnesses) indicates the sigil or sigla identifying the witness or witnesses to which the detail refers. + + + + + + + + + + + + + + + (wit) contains a list of one or more sigla of witnesses attesting a given reading, in a textual variation. [12.1.4. Witness Information] + + + + + + + + + + + + + (witness list) lists definitions for all the witnesses referred to by a critical apparatus, optionally grouped hierarchically. [12.1. The Apparatus Entry, Readings, and Witnesses] + + + + + + + + + + + + + + + + + (witness) contains either a description of a single witness referred to within the critical apparatus, or a list of witnesses which is to be referred to by a single sigil. [12.1. The Apparatus Entry, Readings, and Witnesses] + + + + + + + + + + + + + + + (fragmented witness start) indicates the beginning, or resumption, of the text of a fragmentary witness. [12.1.5. Fragmentary Witnesses] + + + + + + + + + (fragmented witness end) indicates the end, or suspension, of the text of a fragmentary witness. [12.1.5. Fragmentary Witnesses] + + + + + + + + + (lacuna start) indicates the beginning of a lacuna in the text of a mostly complete textual witness. [12.1.5. 
Fragmentary Witnesses] + + + + + + + + + (lacuna end) indicates the end of a lacuna in a mostly complete textual witness. [12.1.5. Fragmentary Witnesses] + + + + + + + + + (variant encoding) declares the method used to encode text-critical variants. [12.1.1. The Apparatus Entry] + + + + + + indicates which method is used to encode the apparatus of variants. + + + + + + apparatus uses line numbers or other canonical reference scheme referenced in a base text. + + + + + apparatus indicates the precise locations of the beginning and ending of each lemma relative to a base text. + + + + + alternate readings of a passage are given in parallel in the text; no notion of a base text is necessary. + + + + + + + + indicates whether the apparatus appears within the running text or external to it. + + + + + + apparatus appears within the running text. + + + + + apparatus appears outside the base text. + + + + + + + + + + + + + + + + + + + + supplies the value of a date or time in some custom standard form. + + + + + + + + + + + + + + + + + + + + + specifies the earliest possible date for the event in some custom standard form. + + + + + + + + + + + + + + + + + + + + + specifies the latest possible date for the event in some custom standard form. + + + + + + + + + + + + + + + + + + + + + indicates the starting point of the period in some custom standard form. + + + + + + + + + + + + + + + + + + + + + indicates the ending point of the period in some custom standard form. + + + + + + + + + + + + + + + + + + + + + supplies a pointer to some location defining a named point in time with reference to which the datable item is understood to have occurred + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + supplies the value of a date or time in a standard form. + + + + + + + + + + + + + + + + specifies the earliest possible date for the event in standard form, e.g. 
yyyy-mm-dd. + + + + + + + + + + + + + + + + specifies the latest possible date for the event in standard form, e.g. yyyy-mm-dd. + + + + + + + + + + + + + + + + indicates the starting point of the period in standard form. + + + + + + + + + + + + + + + + indicates the ending point of the period in standard form. + + + + + + + + + + + + + + + (organization name) contains an organizational name. [13.2.2. Organizational Names] + + + + + + + + + + + + + + + + (personal name) contains a proper noun or proper-noun phrase referring to a person, possibly including one or more of the person's forenames, surnames, honorifics, added names, etc. [13.2.1. Personal Names] + + + + + + + + + + + + + + + + (surname) contains a family (inherited) name, as opposed to a given, baptismal, or nick name. [13.2.1. Personal Names] + + + + + + + + + + + + + + (forename) contains a forename, given or baptismal name. [13.2.1. Personal Names] + + + + + + + + + + + + + + (generational name component) contains a name component used to distinguish otherwise similar names on the basis of the relative ages or generations of the persons named. [13.2.1. Personal Names] + + + + + + + + + + + + + + (name link) contains a connecting phrase or link used within a name but not regarded as part of it, such as van der or of. [13.2.1. Personal Names] + + + + + + + + + + + + + (additional name) contains an additional name component, such as a nickname, epithet, or alias, or any other descriptive phrase used within a personal name. [13.2.1. Personal Names] + + + + + + + + + + + + + + (role name) contains a name component which indicates that the referent has a particular role or position in society, such as an official title or rank. [13.2.1. Personal Names] + + + + + + + + + + + + + + (place name) contains an absolute or relative place name. [13.2.3. Place Names] + + + + + + + + + + + + + + + + (bloc) contains the name of a geo-political unit consisting of two or more nation states or countries. [13.2.3. 
Place Names] + + + + + + + + + + + + + + + (country) contains the name of a geo-political unit, such as a nation, country, colony, or commonwealth, larger than or administratively superior to a region and smaller than a bloc. [13.2.3. Place Names] + + + + + + + + + + + + + + + (region) contains the name of an administrative unit such as a state, province, or county, larger than a settlement, but smaller than a country. [13.2.3. Place Names] + + + + + + + + + + + + + + + (settlement) contains the name of a settlement such as a city, town, or village identified as a single geo-political or administrative unit. [13.2.3. Place Names] + + + + + + + + + + + + + + + (district) contains the name of any kind of subdivision of a settlement, such as a parish, ward, or other administrative or geographic unit. [13.2.3. Place Names] + + + + + + + + + + + + + + + (offset) marks that part of a relative temporal or spatial expression which indicates the direction of the offset between the two place names, dates, or times involved in the expression. [13.2.3. Place Names] + + + + + + + + + + + + + + + + + (geographical name) identifies a name associated with some geographical feature such as Windrush Valley or Mount Sinai. [13.2.3. Place Names] + + + + + + + + + + + + + + + + (geographical feature name) contains a common noun identifying some geographical feature contained within a geographic name, such as valley, mount, etc. [13.2.3. Place Names] + + + + + + + + + + + + + + + + + (affiliation) contains an informal description of a person's present or past affiliation with some organization, for example an employer or sponsor. [15.2.2. The Participant Description] + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] sponsor; 2] recommend; 3] discredit; 4] pledged + + + + + + + + + + + + + + (age) specifies the age of a person. [13.3.2.1. 
Personal Characteristics] + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] western; 2] sui; 3] subjective; 4] objective; 5] inWorld (in world); 6] chronological; 7] biological; 8] psychological; 9] functional + + + + + + + + + + supplies a numeric code representing the age or age group + + + + + + + + + (birth) contains information about a person's birth, such as its date and place. [15.2.2. The Participant Description] + + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] caesarean (caesarean section); 2] vaginal (vaginal delivery); 3] exNihilo (ex nihilo); 4] incorporated; 5] founded; 6] established + + + + + + + + + + + + + + (climate) contains information about the physical climate of a place. [13.3.4.3. States, Traits, and Events] + + + + + + + + + + + + + + + + + + + + + + + + + (death) contains information about a person's death, such as its date and place. [15.2.2. The Participant Description] + + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] proclaimed; 2] assumed; 3] verified; 4] clinical; 5] brain; 6] natural; 7] unnatural; 8] fragmentation; 9] dissolution + + + + + + + + + + + + + + (education) contains a description of the educational experience of a person. [15.2.2. The Participant Description] + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] primary; 2] secondary; 3] undergraduate; 4] graduate; 5] residency; 6] apprenticeship + + + + + + + + + + + + + + (event) contains data relating to any kind of significant event associated with a person, place, or organization. [13.3.1. 
Basic Principles] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (faith) specifies the faith, religion, or belief set of a person. [13.3.2.1. Personal Characteristics] + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] practicing; 2] clandestine; 3] patrilineal; 4] matrilineal; 5] convert + + + + + + + + + + + + + + (floruit) contains information about a person's period of activity. [13.3.2.1. Personal Characteristics] + + + + + + + + + + + + + + + (geographical coordinates) contains any expression of a set of geographic coordinates, representing a point, line, or area on the surface of the earth in some notation. [13.3.4.1. Varieties of Location] + + + + + + + + + + + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] listening; 2] speaking; 3] reading; 4] writing + + + + + + + + + + supplies one or more valid language tags for the languages specified + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (language known) summarizes the state of a person's linguistic competence, i.e., knowledge of a single language. [15.2.2. The Participant Description] + + + + + + + + + + supplies a valid language tag for the language concerned. + + + + + + + + + + + + + + + + + + a code indicating the person's level of knowledge for this language + + + + + + + + + + + + + + (list of organizations) contains a list of elements, each of which provides information about an identifiable organization. [13.2.2. Organizational Names] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (list of events) contains a list of descriptions, each of which provides information about an identifiable event. [13.3.1. 
Basic Principles] + + + + + + + + + + + + + + + + + + + + + + + + + + (list of persons) contains a list of descriptions, each of which provides information about an identifiable person or a group of people, for example the participants in a language interaction, or the people referred to in a historical source. [13.3.2. The Person Element 15.2. Contextual Information 2.4. The Profile Description 15.3.2. Declarable Elements] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (list of places) contains a list of places, optionally followed by a list of relationships (other than containment) defined amongst them. [2.2.7. The Source Description 13.3.4. Places] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + provides information about relationships identified amongst people, places, and organizations, either informally as prose or as formally expressed relation links. [13.3.2.3. Personal Relationships] + + + + + + + + + + + + + + + + + + + + + (location) defines the location of a place as a set of geographical coordinates, in terms of other named geo-political entities, or as an address. [13.3.4. Places] + + + + + + + + + + + + + + + + + + + + + (nationality) contains an informal description of a person's present or past nationality or citizenship. [15.2.2. The Participant Description] + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] birth; 2] naturalised; 3] self-assigned + + + + + + + + + + + + + + (occupation) contains an informal description of a person's trade, profession or occupation. [15.2.2. The Participant Description] + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. 
+Sample values include: 1] primary; 2] other; 3] paid; 4] unpaid + + + + + + + + + + + + + + + + (organization) provides information about an identifiable organization such as a business, a tribe, or any other grouping of people. [13.3.3. Organizational Data] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + specifies a primary role or classification for the organization. + + + + + + + + + + + + + + + + + + + + + (person) provides information about an identifiable individual, for example a participant in a language interaction, or a person referred to in a historical source. [13.3.2. The Person Element 15.2.2. The Participant Description] + + + + + + + + + + + + + + + + specifies a primary role or classification for the person. + + + + + + + + + + + + + + + + + + + specifies the sex of the person. + + + + + + + + + + + + + + + + + + + specifies an age group for the person. + + + + + + + + + + + + provides information about one of the personalities identified for a given individual, where an individual has multiple personalities. [13.3.2. The Person Element] + + + + + + + + + + + + + + + specifies a primary role or classification for the persona. + + + + + + + + + + + + + + + + + + + specifies the sex of the persona. + + + + + + + + + + + + + + + + + + + specifies an age group for the persona. + + + + + + + + + + + + (personal group) describes a group of individuals treated as a single person for analytic purposes. [15.2.2. The Participant Description] + + + + + + + + + + + + + + specifies the role of this group of participants in the interaction. + + + + + + + + + + specifies the sex of the participant group. + + + + + + + + + + + + + + + + + + + specifies the age group of the participants. + + + + + + + + + + describes informally the size or approximate size of the group for example by means of a number and an indication of accuracy e.g. approx 200. 
+ + + + + + + + + + + + + + + + + + + + + (personal pronouns) indicates the personal pronouns used, or assumed to be used, by the individual being described. [13.3.2.1. Personal Characteristics] + + + + + + + + + + (evidence) indicates support for the listed personal pronouns. +Suggested values include: 1] conjecture (conjecture); 2] selfIdentification (self identification); 3] trustedThirdParty (trusted third party) + + + + + + + + (conjecture) The given value was selected based on assumptions by someone besides the person to whom this pronoun applies. As a result, the value may be erroneous. + + + + + + + + + (self identification) The given value has been explicitly stated or confirmed by the person to whom this pronoun applies. + + + + + + + + + (trusted third party) The given value has been supplied by another individual trusted by the encoder to know the preferences of the person to whom this pronoun applies. + + + + + + + + + + + + + + + (value) supplies a regularized value for personal pronouns. +Sample values include: 1] e (e); 2] he (he); 3] she (she); 4] they (they) + + + + + + + + + + + + + + + + + + + + + + + (place) contains data about a geographic location [13.3.4. Places] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (population) contains information about the population of a place. [13.3.4.3. States, Traits, and Events] + + + + + + + + + + + + + + + + + + + + + + + + + + + + (relationship) describes any kind of relationship or linkage amongst a specified group of places, events, persons, objects or other items. [13.3.2.3. Personal Relationships] + + + + + + + + + + + + + + supplies a name for the kind of relationship of which this is an instance. + + + + + + + + + + identifies the active participants in a non-mutual relationship, or all the participants in a mutual one. + + + + + + + + + + + + + supplies a list of participants amongst all of whom the relationship holds equally. 
+ + + + + + + + + + + + + identifies the passive participants in a non-mutual relationship. + + + + + + + + + + + + + + + (residence) describes a person's present or past places of residence. [15.2.2. The Participant Description] + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] primary; 2] secondary; 3] temporary; 4] permanent + + + + + + + + + + + + + + (sex) specifies the sex of a person. [13.3.2.1. Personal Characteristics] + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] explicit; 2] implicit + + + + + + + + + + supplies a coded value for sex + + + + + + + + + + + + + + + + + + + + + + + (socio-economic status) contains an informal description of a person's perceived social or economic status. [15.2.2. The Participant Description] + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology. +Sample values include: 1] atBirth; 2] atDeath; 3] dependent; 4] inherited; 5] independent + + + + + + + + + + + + + + + + (state) contains a description of some status or quality attributed to a person, place, or organization often at some specific time or for a specific date range. [13.3.1. Basic Principles 13.3.2.1. Personal Characteristics] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (terrain) contains information about the physical terrain of a place. [13.3.4.3. States, Traits, and Events] + + + + + + + + + + + + + + + + + + + + + + + + + (trait) contains a description of some status or quality attributed to a person, place, or organization typically, but not necessarily, independent of the volition or action of the holder and usually not at some specific time or for a specific date range. [13.3.1. Basic Principles 13.3.2.1. 
Personal Characteristics] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (name of an object) contains a proper noun or noun phrase used to refer to an object. [13.2.4. Object Names] + + + + + + + + + + + + + + + + contains a description of a single identifiable physical object. [13.3.5. Objects] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (list of objects) contains a list of descriptions, each of which provides information about an identifiable physical object. [13.3.5. Objects] + + + + + + + + + + + + + + + + + + + + + + + + + + (object identifier) groups one or more identifiers or pieces of locating information concerning a single object. [13.3.5. Objects] + + + + + + + + + + + + + + + + + + + (canonical name) contains the definition for a canonical name or name component of any kind. [13.3.6. Names and Nyms] + + + + + + + + + + + + + + points to constituent nyms + + + + + + + + + + + + + + + (list of canonical names) contains a list of nyms, that is, standardized names for any thing. [13.3.6. Names and Nyms] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (role) indicates the kind of information held in this cell or in each cell of this row. +Suggested values include: 1] label; 2] data + + + + + + + + labelling or descriptive information only. + + + + + + + + + data values. + + + + + + + + + + + + + + + + + (rows) indicates the number of rows occupied by this cell or row. + + + + + + + (columns) indicates the number of columns occupied by this cell or row. + + + + + + (table) contains text displayed in tabular form, in rows and columns. [14.1.1. TEI Tables] + + + + + + + + + + + + + + + + + + + + + + + + + + + (rows) indicates the number of rows in the table. + + + + + (columns) indicates the number of columns in each row of the table. + + + + + + + (row) contains one row of a table. [14.1.1. TEI Tables] + + + + + + + + + + + + (cell) contains one cell of a table. [14.1.1. 
TEI Tables] + + + + + + + + + + + + + (formula) contains a mathematical or other formula. [14.2. Formulæ and Mathematical Expressions] + + + + + + + + + + + + + encodes the presence of music notation in a text [14.3. Notated Music in Written Text] + + + + + + + + + + + + + + + + + (figure) groups elements representing or containing graphic information such as an illustration, formula, or figure. [14.4. Specific Elements for Graphic Images] + + + + + + + + + + + + + + + + + + + (description of figure) contains a brief prose description of the appearance or content of a graphic figure, for use when documenting an image without displaying it. [14.4. Specific Elements for Graphic Images] + + + + + + + + + + + + (text description) provides a description of a text in terms of its situational parameters. [15.2.1. The Text Description] + + + + + + + + + + + + + + + + + + + (participation description) describes the identifiable speakers, voices, or other participants in any kind of text or other persons named or otherwise referred to in a text, edition, or metadata. [15.2. Contextual Information] + + + + + + + + + + + + + + + + + (setting description) describes the setting or settings within which a language interaction takes place, or other places otherwise referred to in a text, edition, or metadata. [15.2. Contextual Information 2.4. The Profile Description] + + + + + + + + + + + + + + + + + (primary channel) describes the medium or channel by which a text is delivered or experienced. For a written text, this might be print, manuscript, email, etc.; for a spoken one, radio, telephone, face-to-face, etc. [15.2.1. The Text Description] + + + + + + + + specifies the mode of this channel with respect to speech and writing. + + + + + + (spoken) + + + + + (written) + + + + + (spoken to be written) e.g. dictation + + + + + (written to be spoken) e.g. 
a script + + + + + (mixed) + + + + + (unknown or inapplicable) + + + + + + + + + + + + (constitution) describes the internal composition of a text or text sample, for example as fragmentary, complete, etc. [15.2.1. The Text Description] + + + + + + + + + specifies how the text was constituted. + + + + + + a single complete text + + + + + a text made by combining several smaller items, each individually complete + + + + + (fragments) a text made by combining several smaller, not necessarily complete, items + + + + + composition unknown or unspecified + + + + + + + + + + + + (derivation) describes the nature and extent of originality of this text. [15.2.1. The Text Description] + + + + + + + + + categorizes the derivation of the text. +Sample values include: 1] original; 2] revision; 3] translation; 4] abridgment; 5] plagiarism; 6] traditional + + + + + + + + + + + + + + (domain of use) describes the most important social context in which the text was realized or for which it is intended, for example private vs. public, education, religion, etc. [15.2.1. The Text Description] + + + + + + + + + categorizes the domain of use. +Sample values include: 1] art; 2] domestic; 3] religious; 4] business; 5] education; 6] govt (government); 7] public + + + + + + + + + + + + + + (factuality) describes the extent to which the text may be regarded as imaginative or non-imaginative, that is, as describing a fictional or a non-fictional world. [15.2.1. The Text Description] + + + + + + + + + categorizes the factuality of the text. 
+ + + + + + the text is to be regarded as entirely imaginative + + + + + the text is to be regarded as entirely informative or factual + + + + + the text contains a mixture of fact and fiction + + + + + the fiction/fact distinction is not regarded as helpful or appropriate to this text + + + + + + + + + + + + (interaction) describes the extent, cardinality and nature of any interaction among those producing and experiencing the text, for example in the form of response or interjection, commentary, etc. [15.2.1. The Text Description] + + + + + + + + + specifies the degree of interaction between active and passive participants in the text. + + + + + + no interaction of any kind, e.g. a monologue + + + + + some degree of interaction, e.g. a monologue with set responses + + + + + complete interaction, e.g. a face to face conversation + + + + + this parameter is inappropriate or inapplicable in this case + + + + + + + + specifies the number of active participants (or addressors) producing parts of the text. +Suggested values include: 1] singular; 2] plural; 3] corporate; 4] unknown + + + + + + + + a single addressor + + + + + + + + + many addressors + + + + + + + + + a corporate addressor + + + + + + + + + number of addressors unknown or unspecifiable + + + + + + + + + + + + + + + specifies the number of passive participants (or addressees) to whom a text is directed or in whose presence it is created or performed. +Suggested values include: 1] self; 2] single; 3] many; 4] group; 5] world + + + + + + + + text is addressed to the originator e.g. a diary + + + + + + + + + text is addressed to one other person e.g. a personal letter + + + + + + + + + text is addressed to a countable number of others e.g. a conversation in which all participants are identified + + + + + + + + + text is addressed to an undefined but fixed number of participants e.g. a lecture + + + + + + + + + text is addressed to an undefined and indeterminately large number e.g. 
a published book + + + + + + + + + + + + + + + + + + + (preparedness) describes the extent to which a text may be regarded as prepared or spontaneous. [15.2.1. The Text Description] + + + + + + + + + a keyword characterizing the type of preparedness. +Sample values include: 1] none; 2] scripted; 3] formulaic; 4] revised + + + + + + + + + + + + + + characterizes a single purpose or communicative function of the text. [15.2.1. The Text Description] + + + + + + + + + specifies a particular kind of purpose. +Suggested values include: 1] persuade; 2] express; 3] inform; 4] entertain + + + + + + + + didactic, advertising, propaganda, etc. + + + + + + + + + self expression, confessional, etc. + + + + + + + + + convey information, educate, etc. + + + + + + + + + amuse, entertain, etc. + + + + + + + + + + + + + + + specifies the extent to which this purpose predominates. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + describes one particular setting in which a language interaction takes place. [15.2.3. The Setting Description] + + + + + + + + + + + + + + + + + contains a brief informal description of the kind of place concerned, for example: a room, a restaurant, a park bench, etc. [15.2.3. The Setting Description] + + + + + + + + + + + + (activity) contains a brief informal description of what a participant in a language interaction is doing other than speaking, if anything. [15.2.3. The Setting Description] + + + + + + + + + + + + + + + + + + + + + + + (corresponds) points to elements that correspond to the current element in some way. + + + + + + + + + + + + + + + (synchronous) points to elements that are synchronous with the current element. + + + + + + + + + + + + + + + points to an element that is the same as the current element. + + + + + + + points to an element of which the current element is a copy. + + + + + + + points to the next element of a virtual aggregate of which the current element is part. 
+ + + + + + + (previous) points to the previous element of a virtual aggregate of which the current element is part. + + + + + + + points to elements that are in exclusive alternation with the current element. + + + + + + + + + + + + + + + selects one or more alternants; if one alternant is selected, the ambiguity or uncertainty is marked as resolved. If more than one alternant is selected, the degree of ambiguity or uncertainty is marked as reduced by the number of alternants not selected. + + + + + + + + + + + + + + (link) defines an association or hypertextual link among elements or passages, of some type not more precisely specifiable by other elements. [16.1. Links] + + + + + + + + + + (link group) defines a collection of associations or hypertextual links. [16.1. Links] + + + + + + + + + + + + + + + + (anonymous block) contains any arbitrary component-level unit of text, acting as an anonymous container for phrase or inter level elements analogous to, but without the semantic baggage of, a paragraph. [16.3. Blocks, Segments, and Anchors] + + + + + + + + + + + + + + + + (anchor point) attaches an identifier to a point within a text, whether or not it corresponds with a textual element. [8.4.2. Synchronization and Overlap 16.5. Correspondence and Alignment] + + + + + + + + + (arbitrary segment) represents any segmentation of text below the chunk level. [16.3. Blocks, Segments, and Anchors 6.2. Components of the Verse Line 7.2.5. Speech Contents] + + + + + + + + + + + + + + + + indicates a point in time either relative to other elements in the same timeline tag, or absolutely. [16.4.2. Placing Synchronous Events in Time] + + + + + + supplies an absolute value for the time. 
+ + + + + + + + + + + + + (days) + + + + + + + + + (hours) + + + + + + + + + (minutes) + + + + + + + + + (seconds) + + + + + + + + + (milliseconds) + + + + + + + + + + + + + + + specifies a time interval either as a number or as one of the keywords defined by the datatype teidata.interval + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (timeline) provides a set of ordered points in time which can be linked to elements of a spoken text to create a temporal alignment of that text. [16.4.2. Placing Synchronous Events in Time] + + + + + + + + + designates the origin of the timeline, i.e. the time at which it begins. + + + + + + + + + + (days) + + + + + + + + + (hours) + + + + + + + + + (minutes) + + + + + + + + + (seconds) + + + + + + + + + (milliseconds) + + + + + + + + + + + + + + + specifies a time interval either as a positive integral value or using one of a set of predefined codes. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (join) identifies a possibly fragmented segment of text, by pointing at the possibly discontiguous elements which compose it. [16.7. Aggregation] + + + + + + + + + + + + specifies the name of an element which this aggregation may be understood to represent. + + + + + indicates whether the targets to be joined include the entire element indicated (the entire subtree including its root), or just the children of the target (the branches of the subtree). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (alternation) identifies an alternation or a set of choices among elements or passages. [16.8. Alternation] + + + + + + + + + specifies the destination of the reference by supplying one or more URI References + + + + + + + + + + + + + + + + + states whether the alternations gathered in this collection are exclusive or inclusive. + + + + + + (exclusive) indicates that the alternation is exclusive, i.e. that at most one of the alternatives occurs. 
+ + + + + (inclusive) indicates that the alternation is not exclusive, i.e. that one or more of the alternatives occur. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + states whether the alternations gathered in this collection are exclusive or inclusive. + + + + + + (exclusive) indicates that the alternation is exclusive, i.e. that at most one of the alternatives occurs. + + + + + (inclusive) indicates that the alternation is not exclusive, i.e. that one or more of the alternatives occur. + + + + + + + + + + Functions as a container element for linked data, contextual information, and stand-off annotations embedded in a TEI document. [16.10. The standOff Container] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + specifies the destination of the reference by supplying one or more URI References + + + + + + + + + + + + + + + + + + + + + + + intent is to assess the target resource in some way, rather than simply make a comment about it + + + + + intent is to create a bookmark to the target or part thereof + + + + + intent is to classify the target in some way + + + + + intent is to comment about the target + + + + + intent is to describe the target, rather than (for example) comment on it + + + + + intent is to request an edit or a change to the target resource + + + + + intent is to highlight the target resource or a segment thereof + + + + + intent is to assign an identity to the target + + + + + intent is to link to a resource related to the target + + + + + intent is to assign some value or quality to the target + + + + + intent is to ask a question about the target + + + + + intent is to reply to a previous statement, either an annotation or another resource + + + + + intent is to associate a tag with the target + + + + + + + + + + + + + + + + + + + + + + + (feature system declaration) provides a feature system declaration comprising one or more feature structure 
declarations or feature structure declaration links. [18.11. Feature System Declaration] + + + + + + + + + (feature structure declaration) declares one type of feature structure. [18.11. Feature System Declaration] + + + + + + + + + + + gives a name for the type of feature structure being declared. + + + + + + + + + + + + + + + + + + + + + + (feature system description (in FSD)) describes in prose what is represented by the type of feature structure declared in the enclosing fsDecl. [18.11. Feature System Declaration] + + + + + + + + + + + + (feature structure declaration link) associates the name of a typed feature structure with a feature structure declaration for it. [18.11. Feature System Declaration] + + + + + + + + + + + + + + + + (feature declaration) declares a single feature, specifying its name, organization, range of allowed values, and optionally its default value. [18.11. Feature System Declaration] + + + + + + + + + + + + indicates whether or not the value of this feature may be present. + + + + + + + (feature description (in FSD)) describes in prose what is represented by the feature being declared and its values. [18.11. Feature System Declaration] + + + + + + + + + + + + + + + + + + + + + + + + + + + defines a conditional default value for a feature; the condition is specified as a feature structure, and is met if it subsumes the feature structure in the text for which a default value is sought. [18.11. Feature System Declaration] + + + + + + + + + + + + + + + + + + + + + (feature-structure constraints) specifies constraints on the content of valid feature structures. [18.11. 
Feature System Declaration] + + + + + + + + + + + + (conditional feature-structure constraint) defines a conditional feature-structure constraint; the consequent and the antecedent are specified as feature structures or feature-structure collections; the constraint is satisfied if both the antecedent and the consequent subsume a given feature structure, or if the antecedent does not. [18.11. Feature System Declaration] + + + + + + + + + + + + + + + + + + + (bi-conditional feature-structure constraint) defines a biconditional feature-structure constraint; both consequent and antecedent are specified as feature structures or groups of feature structures; the constraint is satisfied if both subsume a given feature structure, or if both do not. [18.11. Feature System Declaration] + + + + + + + + + + + + + + + + + + + + + + + + (feature structure) represents a feature structure, that is, a collection of feature-value pairs organized as a structural unit. [18.2. Elementary Feature Structures and the Binary +Feature Value] + + + + + + + + + + specifies the type of the feature structure. + + + + + + + + + + (features) references the feature-value specifications making up this feature structure. + + + + + + + + + + + + + + + (feature) represents a feature value specification, that is, the association of a name with a value of any of several different types. [18.2. Elementary Feature Structures and the Binary +Feature Value] + + + + + + + + a single word which follows the rules defining a legal XML name (see ), providing a name for the feature. + + + + + (feature value) references any element which can be used to represent the value of a feature. + + + + + + + (binary value) represents the value part of a feature-value specification which can contain either of exactly two possible values. [18.2. Elementary Feature Structures and the Binary +Feature Value] + + + + + + + supplies a binary value. 
+ + + + + + + (symbolic value) represents the value part of a feature-value specification which contains one of a finite list of symbols. [18.3. Other Atomic Feature Values] + + + + + + + supplies a symbolic value for the feature, one of a finite list that may be specified in a feature declaration. + + + + + + + + + + + + (numeric value) represents the value part of a feature-value specification which contains a numeric value or range. [18.3. Other Atomic Feature Values] + + + + + + + + + + + + + + + + + + supplies an upper bound for the numeric value represented. + + + + + + + + + + + + + + specifies whether the value represented should be truncated to give an integer value. + + + + + + + (string value) represents the value part of a feature-value specification which contains a string. [18.3. Other Atomic Feature Values] + + + + + + + + + + + + + (value label) represents the value part of a feature-value specification which appears at more than one point in a feature structure. [18.6. Re-entrant Feature Structures] + + + + + + + supplies a name identifying the sharing point. + + + + + + + + + + + + (collection of values) represents the value part of a feature-value specification which contains multiple values organized as a set, bag, or list. [18.7. Collections as Complex Feature Values] + + + + + + + + + + (organization) indicates organization of given value or values as set, bag or list. + + + + + + indicates that the given values are organized as a set. + + + + + indicates that the given values are organized as a bag (multiset). + + + + + indicates that the given values are organized as a list. + + + + + + + + + + (default feature value) represents the value part of a feature-value specification which contains a defaulted value. [18.9. Default Values] + + + + + + + + (value alternation) represents the value part of a feature-value specification which contains a set of values, only one of which can be valid. [18.8.1. 
Alternation] + + + + + + + + + + + + (value negation) represents a feature value which is the negation of its content. [18.8.2. Negation] + + + + + + + + + + + + + indicates the organization of the resulting merged values as set, bag or list. + + + + + + indicates that the resulting values are organized as a set. + + + + + indicates that the resulting values are organized as a bag (multiset). + + + + + indicates that the resulting values are organized as a list. + + + + + + + + + + + + + + + + + + (feature-value library) assembles a library of reusable feature value elements (including complete feature structures). [18.4. Feature Libraries and Feature-Value Libraries] + + + + + + + + + (graph) encodes a graph, which is a collection of nodes, and arcs which connect the nodes. [19.1. Graphs and Digraphs] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + describes the type of graph. +Suggested values include: 1] undirected; 2] directed; 3] transitionNetwork; 4] transducer + + + + + + + + undirected graph + + + + + + + + + directed graph + + + + + + + + + a directed graph with distinguished initial and final nodes + + + + + + + + + a transition network with up to two labels on each arc + + + + + + + + + + + + + + + states the order of the graph, i.e., the number of its nodes. + + + + + states the size of the graph, i.e., the number of its arcs. + + + + + + + (node) encodes a node, a possibly labeled point in a graph. [19.1. Graphs and Digraphs] + + + + + + + + + + + provides a type for a node. +Suggested values include: 1] initial; 2] final + + + + + + + + initial node in a transition network + + + + + + + + + final node in a transition network + + + + + + + + + + + + + + + provides the value of a node, which is a feature structure or other analytic element. + + + + + (adjacent to) gives the identifiers of the nodes which are adjacent to the current node. 
+ + + + + + + + + + + + + (adjacent from) gives the identifiers of the nodes which are adjacent from the current node. + + + + + + + + + + + + + (adjacent) gives the identifiers of the nodes which are both adjacent to and adjacent from the current node. + + + + + + + + + + + + + gives the in degree of the node, the number of nodes which are adjacent from the given node. + + + + + gives the out degree of the node, the number of nodes which are adjacent to the given node. + + + + + gives the degree of the node, the number of arcs with which the node is incident. + + + + + + + (arc) encodes an arc, the connection from one node to another in a graph. [19.1. Graphs and Digraphs] + + + + + + + + + + gives the identifier of the node which is adjacent from this arc. + + + + + gives the identifier of the node which is adjacent to this arc. + + + + + + + (tree) encodes a tree, which is made up of a root, internal nodes, leaves, and arcs from root to leaves. [19.2. Trees] + + + + + + + + + + + + + + + + + + + + gives the maximum number of children of the root and internal nodes of the tree. + + + + + (ordered) indicates whether or not the tree is ordered, or if it is partially ordered. + + + + + + indicates that all of the branching nodes of the tree are ordered. + + + + + indicates that some of the branching nodes of the tree are ordered and some are unordered. + + + + + indicates that all of the branching nodes of the tree are unordered. + + + + + + + + gives the order of the tree, i.e., the number of its nodes. + + + + + + + (root node) represents the root node of a tree. [19.2. Trees] + + + + + + + + + identifies the root node of the network by pointing to a feature structure or other analytic element. + + + + + identifies the elements which are the children of the root node. + + + + + + + + + + + + + (ordered) indicates whether or not the root is ordered. + + + + + + + + + + + + + + + + + + + + + + + gives the out degree of the root, the number of its children. 
+ + + + + + + (intermediate (or internal) node) represents an intermediate (or internal) node of a tree. [19.2. Trees] + + + + + + + + + indicates an intermediate node, which is a feature structure or other analytic element. + + + + + provides a list of identifiers of the elements which are the children of the intermediate node. + + + + + + + + + + + + + provides the identifier of the element which is the parent of this node. + + + + + (ordered) indicates whether or not the internal node is ordered. + + + + + + + + + + + + + + + + + + + + + + + provides the identifier of an element which this node follows. + + + + + gives the out degree of an intermediate node, the number of its children. + + + + + + + (leaf) encodes the leaves (terminal nodes) of a tree. [19.2. Trees] + + + + + + + + + provides a pointer to a feature structure or other analytic element. + + + + + provides the identifier of parent of a leaf. + + + + + provides an identifier of an element which this leaf follows. + + + + + + + + + + + + + + + + + + + + provides the value of an embedding tree, which is a feature structure or other analytic element. + + + + + + + + + + + + + + + + + + supplies a value for the triangle, in the form of the identifier of a feature structure or other analytic element. + + + + + + + + + + + + + + + indicates the value of an embedding leaf, which is a feature structure or other analytic element. + + + + + + + (forest) provides for groups of rooted trees. [19.3. Another Tree Notation] + + + + + + + + + + + + + + provides for lists of forests. [19.3. Another Tree Notation] + + + + + + + + + + + identifies the type of the forest group. + + + + + + + + + + + + indicates the numerical accuracy or precision associated with some aspect of the text markup. [21.2. 
Indications of Precision] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + supplies a standard deviation associated with the value in question + + + + + + + + + + + + + + + + indicates the degree of certainty associated with some aspect of the text markup. [21.1.2. Structured Indications of Uncertainty] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + characterizes the element in some sense, using any convenient classification scheme or typology; sample categorization of annotations of uncertainty might use following values: +Sample values include: 1] ignorance; 2] incompleteness; 3] credibility; 4] imprecision + + + + + + + + + + indicates more exactly the aspect concerning which certainty is being expressed: specifically, whether the markup is correctly located, whether the correct element or attribute name has been used, or whether the content of the element or attribute is correct, etc. + + + + + + uncertainty concerns whether the name of the element or attribute used is correctly applied. + + + + + uncertainty concerns whether the start of the element is correctly identified. + + + + + uncertainty concerns whether the end of the element is correctly identified. + + + + + uncertainty concerns both the start and the end of the element. + + + + + uncertainty concerns the content (for an element) or the value (for an attribute) + + + + + + + + + + + + + + + indicates one or more element(s) characterizing the conditions which are assumed in the assignment of a degree of confidence. + + + + + + + + + + + + + + + + (responsibility) identifies the individual(s) responsible for some aspect of the content or markup of particular element(s). [21.3. Attribution of Responsibility] + + + + + + + + + + + indicates the specific aspect of the encoding (markup or content) for which responsibility is being assigned. 
+ + + + + + + + + + responsibility is being assigned concerning the name of the element or attribute used. + + + + + responsibility is being assigned concerning the start of the element concerned. + + + + + responsibility is being assigned concerning the end of the element concerned. + + + + + responsibility is being assigned concerning the location of the element concerned. + + + + + responsibility is being assigned concerning the content (for an element) or the value (for an attribute) + + + + + + + + + + + + + + + + + + + specifies the date on which the source text was extracted and sent to the translator + + + + + + + + + + + + + + + + + + + + + + + + the condition under which the element bearing this attribute applies, given as an XPath predicate expression. + + + + + + + + + + + (minimum number of occurrences) indicates the smallest number of times this component may occur. + + + + + + + (maximum number of occurrences) indicates the largest number of times this component may occur. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + specifies the effect of this declaration on its parent object. + + + + + + this declaration is added to the current definitions + + + + + if present already, the whole of the declaration for this object is removed from the current setup + + + + + this declaration changes the declaration of the same name in the current definition + + + + + this declaration replaces the declaration of the same name in the current definition + + + + + + + + + + + + + + + + supplies the identifier by which this element may be referenced. + + + + + + + says whether this object should be predeclared in the tei infrastructure module. + + + + + + + supplies a name for the module in which this object is to be declared. + + + + + + + + + + provides a date before which the construct being defined will not be removed. 
+ + + + + + + + + + (namespace) specifies the namespace to which this element belongs + + + + + + (attribute) contains the name of an attribute appearing within running text. [22. Documentation Elements] + + + + + + + + (scheme) supplies an identifier for the scheme in which this name is defined. +Sample values include: 1] TEI (Text Encoding Initiative); 2] DBK (Docbook); 3] XX (unknown); 4] imaginary (imaginary); 5] XHTML (XHTML); 6] XML (XML); 7] XI (XI) + + + + + + + + + + + + + + contains literal code from some formal language such as a programming language. [22.1.1. Phrase Level Terms] + + + + + + (formal language) a name identifying the formal language in which the code is expressed + + + + + + + + + + + + (example) contains any kind of illustrative example. [22.5. Element Specifications 22.5.3. Attribute List Specification] + + + + + + + + + + + + (element name) contains the name (generic identifier) of an element. [22. Documentation Elements 22.5. Element Specifications] + + + + + + + + supplies the name of the scheme in which this name is defined. +Sample values include: 1] TEI; 2] DBK (docbook); 3] XX (unknown); 4] Schematron; 5] HTML + + + + + + + + + + + + + + + + + + + + (tag) contains text of a complete start- or end-tag, possibly including attribute specifications, but excluding the opening and closing markup delimiter characters. [22. Documentation Elements] + + + + + + + indicates the type of XML tag intended + + + + + + a start-tag, with delimiters < and > is intended + + + + + an end-tag, with delimiters </ and > is intended + + + + + an empty tag, with delimiters < and /> is intended + + + + + a pi (processing instruction), with delimiters <? and ?> is intended + + + + + a comment, with delimiters <!-- and --> is intended + + + + + a marked-section, with delimiters <[CDATA[ and ]]> is intended + + + + + + + + supplies the name of the schema in which this tag is defined. 
+Sample values include: 1] TEI (text encoding initiative); 2] DBK (docbook); 3] XX (unknown); 4] Schematron; 5] HTML + + + + + + + + + + + + (value) contains a single attribute value. [22. Documentation Elements 22.5.3. Attribute List Specification] + + + + + + + + (specification list) marks where a list of descriptions is to be inserted into the prose documentation. [22.1.2. Element and Attribute Descriptions] + + + + + + + + + + + (specification description) indicates that a description of the specified element, class, or macro should be included at this point within a document. [22.1.2. Element and Attribute Descriptions] + + + + + + (identifier) supplies the identifier of the documentary element or class for which a description is to be obtained. + + + + + (attributes) supplies attribute names for which descriptions should additionally be obtained. + + + + + + + + + + points to the specification for an attribute or model class which is to be included in a schema [22.6. Class Specifications] + + + + + + + the identifier used for the required class within the source indicated. + + + + + indicates how references to this class within a content model should be interpreted. + + + + + + any one member of the class may appear + + + + + a single occurrence of all members of the class may appear in sequence + + + + + a single occurrence of one or more members of the class may appear in sequence + + + + + one or more occurrences of one or more members of the class may appear in sequence. + + + + + one or more occurrences of all members of the class may appear in sequence + + + + + + + + supplies a list of class members which are to be included in the schema being defined. + + + + + + + + supplies a list of class members which are to be excluded from the schema being defined. + + + + + + + + + + points to the specification for some element which is to be included in a schema [22.2. 
Modules and Schemas] + + + + + + + the identifier used for the required element within the source indicated. + + + + + + + points to the specification for some pattern which is to be included in a schema [22.7. Macro Specifications] + + + + + + the identifier used for the required pattern within the source indicated. + + + + + + + (module reference) references a module which is to be incorporated into a schema. [22.2. Modules and Schemas] + + + + + + + + + specifies a default prefix which will be prepended to all patterns from the imported module + + + + + supplies a list of the elements which are to be copied from the specified module into the schema being defined. + + + + + + + + supplies a list of the elements which are not to be copied from the specified module into the schema being defined. + + + + + + + + the name of a TEI module + + + + + (uniform resource locator) refers to a non-TEI module of RELAX NG code by external location + + + + + + + (module specification) documents the structure, content, and purpose of a single module, i.e. a named and externally visible group of declarations. [22.2. Modules and Schemas] + + + + + + + + + + + + + + + + + + + (schema specification) generates a TEI-conformant schema and documentation for it. [2.3. The Encoding Description 22.2. Modules and Schemas 23.5.1. Making a Unified ODD] + + + + + + + + + + + + + + + + + + + specifies entry points to the schema, i.e. which patterns may be used as the root of documents conforming to it. + + + + + + + + + + + + + specifies a default prefix which will be prepended to all patterns relating to TEI elements, unless otherwise stated. 
+ + + + + (target language) specifies which language to use when creating the objects in a schema if names for elements or attributes are available in more than one language + + + + + + + + + + + + + + + + + + (documentation language) specifies which languages to use when creating documentation if the description for an element, attribute, class or macro is available in more than one language + + + + + + + + + + + + + + + + + + + + + + + + + + + (default namespace exclusions) provides a list of namespaces and/or prefixed element names to be excluded by default from anyName in RELAX NG schemas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (specification group) contains any convenient grouping of specifications for use within the current module. [22.2. Modules and Schemas] + + + + + + + + + + + + + + + + points at the specification group which logically belongs here. + + + + + + + (element specification) documents the structure, content, and purpose of a single element type. [22.5. Element Specifications 22. Documentation Elements] + + + + + + + + + + + + + + + + + + + + + + + + + + + specifies a default prefix which will be prepended to all patterns relating to the element, unless otherwise stated. + + + + + + + (class specification) contains reference information for a TEI element class; that is a group of elements which appear together in content models, or which share some common attribute, or both. [22.3. Specification Elements 22.6. Class Specifications] + + + + + + + + + + + + + + + + + + + + indicates whether this is a model class or an attribute class + + + + + + (content model) members of this class appear in the same content models + + + + + (attributes) members of this class share common attributes + + + + + + + + indicates which alternation and sequence instantiations of a model class may be referenced. By default, all variations are permitted. 
+ + + + + + + + + + members of the class are alternatives + + + + + members of the class are to be provided in sequence + + + + + members of the class may be provided, in sequence, but are optional + + + + + members of the class may be provided one or more times, in sequence, but are optional. + + + + + members of the class may be provided one or more times, in sequence + + + + + members of the class are alternatives + + + + + members of the class are to be provided in sequence + + + + + members of the class may be provided, in sequence, but are optional + + + + + members of the class may be provided one or more times, in sequence, but are optional. + + + + + members of the class may be provided one or more times, in sequence + + + + + members of the class are alternatives + + + + + members of the class are to be provided in sequence + + + + + members of the class may be provided, in sequence, but are optional + + + + + members of the class may be provided one or more times, in sequence, but are optional. + + + + + members of the class may be provided one or more times, in sequence + + + + + members of the class are alternatives + + + + + members of the class are to be provided in sequence + + + + + members of the class may be provided, in sequence, but are optional + + + + + members of the class may be provided one or more times, in sequence, but are optional. + + + + + members of the class may be provided one or more times, in sequence + + + + + members of the class are alternatives + + + + + members of the class are to be provided in sequence + + + + + members of the class may be provided, in sequence, but are optional + + + + + members of the class may be provided one or more times, in sequence, but are optional. + + + + + members of the class may be provided one or more times, in sequence + + + + + + + + + + + + + + + + (datatype specification) documents a datatype. [22.3. Specification Elements 22.7. 
Macro Specifications] + + + + + + + + + + + + + + + + + + + + + + + (macro specification) documents the function and implementation of a pattern. [22.3. Specification Elements 22.7. Macro Specifications] + + + + + + + + + + + + + + + + + + + + + + + (remarks) contains any commentary or discussion about the usage of an element, attribute, class, or entity not otherwise documented within the containing element. [22.5. Element Specifications 22.5.3. Attribute List Specification 22.6. Class Specifications 22.7. Macro Specifications] + + + + + + + + + specifies the remark concerned. + + + + + + + (list of references) supplies a list of significant references to places where this element is discussed, in the current document or elsewhere. + + + + + + + + + + + + + (exemplum) groups an example demonstrating the use of an element along with optional paragraphs of commentary. [22.5. Element Specifications] + + + + + + + + + + + + + + + + + + (classes) specifies all the classes of which the documented element or class is a member or subclass. [22.5. Element Specifications 22.6. Class Specifications] + + + + + + + + + specifies the effect of this declaration on its parent module. + + + + + + this declaration changes the declaration of the same name in the current definition + + + + + this declaration replaces the declaration of the same name in the current definition + + + + + + + + + + specifies class membership of the documented element or class. [22.4.3. Classification of Components] + + + + + + + + specifies the identifier for a class of which the documented element or class is a member or subclass + + + + + specifies the effect of this declaration on its parent module. 
+ + + + + + this declaration is added to the current definitions + + + + + this declaration and all of its children are removed from the current setup + + + + + + + + supplies the maximum number of times the element can occur in elements which use this model class in their content model + + + + + + + + + + + + + + supplies the minumum number of times the element must occur in elements which use this model class in their content model + + + + + + + + + + + + + + + + + + (equivalent) specifies a component which is considered equivalent to the parent element, either by co-reference, or by external link. [3.4.1. Terms and Glosses 22.4.1. Description of Components] + + + + + + + + a single word which follows the rules defining a legal XML name (see ), naming the underlying concept of which the parent is a representation. + + + + + (uniform resource identifier) references the underlying concept of which the parent is a representation by means of some external identifier + + + + + references an external script which contains a method to transform instances of this element to canonical TEI + + + + + + + (alternate identifier) supplies the recommended XML name for an element, class, attribute, etc. in some language. [3.4.1. Terms and Glosses 22.4.1. Description of Components] + + + + + + + + + describes the processing intended for a specified element. [22.5.4.1. The TEI processing model] + + + + + + + + + + + + + + + names the process or function which this processing model uses in order to produce output. 
+Suggested values include: 1] alternate; 2] anchor; 3] block; 4] body; 5] break; 6] cell; 7] cit; 8] document; 9] figure; 10] glyph; 11] graphic; 12] heading; 13] index; 14] inline; 15] link; 16] list; 17] listItem; 18] metadata; 19] note; 20] omit; 21] paragraph; 22] row; 23] section; 24] table; 25] text; 26] title + + + + + + + + support display of alternative visualisations, for example by displaying the preferred content, by displaying both in parallel, or by toggling between the two. + + + + + + + + + create an identifiable anchor point in the output. + + + + + + + + + create a block structure + + + + + + + + + create the body of a document. + + + + + + + + + create a line, column, or page break according to the value of type + + + + + + + + + create a table cell + + + + + + + + + show the content, with an indication of the source + + + + + + + + + start a new output document + + + + + + + + + make a figure with the title as caption + + + + + + + + + show a character by looking up reference to a chardesc at the given URI + + + + + + + + + if url is present, uses it to display graphic, else display a placeholder image. + + + + + + + + + creates a heading. + + + + + + + + + generate list according to type. + + + + + + + + + creates inline element out of content + + + + + + + + + create hyperlink + + + + + + + + + create a list. + + + + + + + + + create a list item. + + + + + + + + + create metadata section + + + + + + + + + create a note, often out of line, depending on the value of place; could be margin, footnote, endnote, inline + + + + + + + + + do nothing, do not process children + + + + + + + + + create a paragraph out of content. + + + + + + + + + create a table row + + + + + + + + + create a new section of the output document + + + + + + + + + create a table + + + + + + + + + create literal text + + + + + + + + + create document title + + + + + + + + + + + + + + + whether to obey any rendition attribute that is present. + + + + + the intended output. 
+Sample values include: 1] web; 2] print; 3] plain + + + + + + + + + + the name of a CSS class which should be associated with this element + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the condition under which this model applies given as an XPath Predicate Expression + + + + + whether to obey any rendition attribute that is present + + + + + the intended output method +Suggested values include: 1] web; 2] print; 3] plaintext + + + + + + + + the output is intended for presentation in a web format + + + + + + + + + the output is intended for presentation in a print format + + + + + + + + + the output is intended for presentation in a plain text format + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + whether to obey any rendition attribute which is present + + + + + the intended output method +Suggested values include: 1] web; 2] print; 3] plaintext + + + + + + + + the output is intended for presentation in a web format + + + + + + + + + the output is intended for presentation in a print format + + + + + + + + + the output is intended for presentation in a plain text format + + + + + + + + + + + + + + + + + describes the rendering or appearance intended for all occurrences of an element in a specified context for a specified type of output. + + + + + + provides a way of defining pseudo-elements, that is, styling rules applicable to specific sub-portions of an element. +Sample values include: 1] first-line; 2] first-letter; 3] before; 4] after + + + + + + + + + + + + list of parameter specifications + + + + + + + + + + + supplies specification for one parameter of a model behaviour [22.5.4.8. Defining a processing model] + + + + + + + + + + + + + provides a parameter for a model behaviour by supplying its name and an XPath expression identifying the location of its content. [22.5.4.5. 
Behaviours and their parameters] + + + + + + a name for the parameter being supplied +Suggested values include: 1] alternate; 2] default; 3] height; 4] id; 5] label; 6] level; 7] link; 8] place; 9] type; 10] url; 11] width + + + + + + + + + + + + + + + + + + when used with behaviour graphic, a parameter of this name supplies a value for the height of the graphic e.g. "300px", "50%". + + + + + + + + + a parameter of this name should supply a unique identifier for the element being processed; as for example with the anchor behaviour + + + + + + + + + a parameter of this name should supply an expression to be used to label something, for example `concat('Page ', @n)` for a page break or `@n` for a footnote reference; typically used with the note or break behaviours + + + + + + + + + when used with the heading behaviour, a parameter of this name supplies a positive integer indicating the hierarchic level of a heading. + + + + + + + + + when used with the link behaviour, a parameter of this name should supply a URL to be used as the target of a link. + + + + + + + + + when used with the note behaviour, a parameter of this name should provide a string which describes the intended placement of some text; typical values include "margin", "footnote", "endnote", "inline", "bottom" + + + + + + + + + a parameter of this name can be used to categorize the specified behaviour in any way; for example the kind of break (when used with the break behaviour) or the kind of index to be generated (if used with the index behaviour) etc. + + + + + + + + + when used with behaviour graphic, a parameter of this name supplies a a URL indicating the graphic intended. + + + + + + + + + when used with behaviour graphic, a parameter of this name supplies a value for the width of the graphic e.g. "400px", "70%". 
+ + + + + + + + + + + + + + + supplies an XPath expression which when evaluated provides the value for the parameter + + + + + + + (content model) contains a declaration of the intended content model for the element (or other construct) being specified. [22.5. Element Specifications] + + + + + + + + + + + controls whether or not pattern names generated in the corresponding RELAX NG schema source are automatically prefixed to avoid potential nameclashes. + + + + + + + + + + + + + indicates that the constructs referenced by its children form a sequence [22.5.1. Defining Content Models] + + + + + + + + if true, indicates that the order in which component elements of a sequence appear in a document must correspond to the order in which they are given in the content model. + + + + + + + indicates that the constructs referenced by its children form an alternation [22.5.1. Defining Content Models] + + + + + + + + + + + + + (constraint rules) the formal rules of a constraint [22.5. Element Specifications] + + + + + + + + + (constraint on schema) contains a formal constraint, typically expressed in a rule-based schema language, to which a construct must conform in order to be considered valid [22.5. Element Specifications] + + + + + + + + + + + + + + + + + + + + Indicates that this constraint specification warns that some other construct in the schema is deprecated. + + + + + + + + + + + + + + + supplies the name of the language in which the constraints are defined +Suggested values include: 1] schematron (ISO Schematron) + + + + + + + + (ISO Schematron) + + + + + + + + + + + + + + + + + + + + + + + + + (organization) specifies whether all the attributes in the list are available (org="group") or only one of them (org="choice") + + + + + + grouped + + + + + alternated + + + + + + + + + + (attribute definition) contains the definition of a single attribute. [22.5.3. 
Attribute List Specification] + + + + + + + + + + + + + + + + + + + + + + specifies the optionality of the attribute. + + + + + + (required) + + + + + (recommended ) + + + + + (optional ) + + + + + + + + (namespace) specifies the namespace to which this attribute belongs + + + + + + + (attribute pointer) points to the definition of an attribute or group of attributes. [22.2. Modules and Schemas] + + + + + + the name of the attribute class + + + + + + + + + + the name of the attribute + + + + + + + (datatype) specifies the declared value for an attribute, by referring to any datatype defined by the chosen schema language. [1.4.2. Datatype Specifications 22.5.3. Attribute List Specification] + + + + + + + + + + (minimum number of occurences) indicates the minimum number of times this datatype may occur in an instance of the attribute being defined + + + + + (maximum number of occurences) indicates the maximum number of times this datatype may occur in an instance of the attribute being defined + + + + + + + + + + + + + + + + + + + + identifies the datatype of an attribute value, either by referencing an item in an externally defined datatype library, or by pointing to a TEI-defined data specification [22.5.3.1. Datatypes] + + + + + + + + + the identifier used for this datatype specification + + + + + + a pointer to a datatype defined in some datatype library + + + + + supplies a string representing a regular expression providing additional constraints on the strings used to represent values of this datatype + + + + + + + + + + the name of the facet. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the facet value. + + + + + + + (default value) specifies the default declared value for an attribute. [22.5.3. Attribute List Specification] + + + + + + + + + + + + + + + + + + + + documents a single value in a predefined list of values. [22.5.3. 
Attribute List Specification] + + + + + + + + + + + + + + + + + + + specifies the value concerned. + + + + + + + + + + + + + + specifies the extensibility of the list of values specified. + + + + + + only the values specified are permitted. + + + + + (semi-open) all the values specified should be supported, but other values are legal and software should have appropriate fallback processing for them. + + + + + the values specified are sample values only. + + + + + + + + + + indicates the presence of a text node in a content model [22. Documentation Elements] + + + + + + + + indicates the presence of any elements in a content model [22. Documentation Elements] + + + + + + + supplies a list of namespaces to one of which the permitted elements must belong. + + + + + + + + + + + + + supplies a list of namespaces or prefixed element names which are not permitted. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + indicates the presence of an empty node within a content model [22. Documentation Elements] + + + + + + diff --git a/xml_schema/tei_all_dcr.xsd b/xml_schema/tei_all_dcr.xsd new file mode 100644 index 000000000..c5c480e2b --- /dev/null +++ b/xml_schema/tei_all_dcr.xsd @@ -0,0 +1,32 @@ + + + + + + + + contains a PID (persistent identifier) that aligns the given element with the appropriate Data Category (or categories) in ISOcat. + + + + + + + + + + + + + contains a PID (persistent identifier) that aligns the content of the given element or the value of the given attribute with the appropriate simple Data Category (or categories) in ISOcat. + + + + + + + + + + + diff --git a/xml_schema/tei_all_teix.xsd b/xml_schema/tei_all_teix.xsd new file mode 100644 index 000000000..5d3f2f909 --- /dev/null +++ b/xml_schema/tei_all_teix.xsd @@ -0,0 +1,36 @@ + + + + + + + + + + + + indicates the intended validity of the example with respect to a schema. 
+ + + + + + the example is intended to be fully valid, assuming that its root element, or a provided root element, could have been used as a possible root element in the schema concerned. + + + + + the example could be transformed into a valid document by inserting any number of valid attributes and child elements anywhere within it; or it is valid against a version of the schema concerned in which the provision of character data, list, element, or attribute values has been made optional. + + + + + the example is not intended to be valid, and contains deliberate errors. + + + + + + + + diff --git a/xml_schema/tei_all_xml.xsd b/xml_schema/tei_all_xml.xsd new file mode 100644 index 000000000..f9445c27e --- /dev/null +++ b/xml_schema/tei_all_xml.xsd @@ -0,0 +1,50 @@ + + + + + + + + (identifier) provides a unique identifier for the element bearing the attribute. + + + + + + + + + + + + + + + + + + + + provides a base URI reference with which applications can resolve relative URI references into absolute URI references. + + + + + signals an intention about how white space should be managed by applications. + + + + + + signals that the application's default white-space processing modes are acceptable + + + + + indicates the intent that applications preserve all white space + + + + + +