Skip to content

Commit

Permalink
feat: RAS format 1.2, new DTD with annotations support
Browse files Browse the repository at this point in the history
* updated attribute list for <s> to support annotations

* feat: RAS format 1.2

* fix(docs): update version in the comment too

* ci: verbose output when utf8 on win test fails

* fix(test): cs-ref.readalong also needs the current dtd version

* fix: more places where the current dtd 1.2 version is needed (#238)

---------

Co-authored-by: David Huggins-Daines <[email protected]>
Co-authored-by: Eric Joanis <[email protected]>
  • Loading branch information
3 people authored Aug 26, 2024
1 parent aed50e2 commit d4b8a69
Show file tree
Hide file tree
Showing 10 changed files with 193 additions and 10 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,8 @@ jobs:
# Warning: the diff line below is PowerShell syntax, not bash!
run: |
echo ćś | readalongs make-xml -l fra - - | findstr /v meta > cs.readalong
echo Output ====
cat cs.readalong
echo Reference ====
cat test/data/cs-ref.readalong
if (diff (cat cs.readalong) (cat test/data/cs-ref.readalong)) { throw "Output did not match reference" }
2 changes: 1 addition & 1 deletion docs/cli-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ The format of the generated XML is based on [TEI
Lite](https://tei-c.org/guidelines/customization/lite/) but is
considerably simplified. The DTD (document type definition) can be
found in the ReadAlong Studio source code under
`readalongs/static/read-along-1.1.dtd`.
`readalongs/static/read-along-1.2.dtd`.

(dna)=

Expand Down
2 changes: 1 addition & 1 deletion readalongs/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
VERSION = "1.1.0"

READALONG_FILE_FORMAT_VERSION = "1.1"
READALONG_FILE_FORMAT_VERSION = "1.2"
8 changes: 4 additions & 4 deletions readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def parse_and_make_xml(
"""Parse XML input and run tokenization and G2P.
Args:
xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.1.dtd)
xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.2.dtd)
config (dict): Optional; ReadAlong-Studio configuration to use
save_temps (str): Optional; Save temporary files, by default None
verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
Expand Down Expand Up @@ -574,7 +574,7 @@ def align_audio(
"""Align an XML input file to an audio file.
Args:
xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.1.dtd)
xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.2.dtd)
audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
unit (str): Optional; Element to create alignments for, by default 'w'
bare (boolean): Optional;
Expand Down Expand Up @@ -1192,7 +1192,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):


def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
"""Create input xml in ReadAlong XML format (see static/read-along-1.1.dtd)
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
Uses the line sequence to infer paragraph and sentence structure from plain text:
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down Expand Up @@ -1240,7 +1240,7 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) ->


def create_input_ras(**kwargs):
"""Create input xml in ReadAlong XML format (see static/read-along-1.1.dtd)
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
Uses readlines to infer paragraph and sentence structure from plain text.
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down
106 changes: 106 additions & 0 deletions readalongs/static/read-along-1.2.dtd
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
<!-- VERSION: 1.2 -->
<!ELEMENT read-along (meta|text|body|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST read-along
use-assets-folder CDATA #IMPLIED
href CDATA #IMPLIED
audio CDATA #IMPLIED
xml:lang CDATA #IMPLIED
language CDATA #IMPLIED
lang CDATA #IMPLIED
version CDATA #IMPLIED>

<!ELEMENT text (body|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST text
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
fallback-langs CDATA #IMPLIED
id CDATA #IMPLIED>

<!ELEMENT body (div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST body
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED>

<!ELEMENT anchor EMPTY>
<!ATTLIST anchor time CDATA #REQUIRED>

<!ELEMENT silence EMPTY>
<!ATTLIST silence dur CDATA #REQUIRED>

<!ELEMENT graphic EMPTY>
<!ATTLIST graphic
url CDATA #REQUIRED
id CDATA #IMPLIED>

<!ELEMENT div (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST div
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
type CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT span (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST span
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
type CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT p (#PCDATA|span|anchor|silence|s|w)*>
<!ATTLIST p
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT s (#PCDATA|span|anchor|silence|w)*>
<!ATTLIST s
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED
annotation-id CDATA #IMPLIED
sentence-id CDATA #IMPLIED>

<!ELEMENT w (#PCDATA|span|syl)*>
<!ATTLIST w
xml:lang CDATA #IMPLIED
effective-g2p-lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
ARPABET CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT syl (#PCDATA|span)*>
<!ATTLIST syl
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
ARPABET CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT meta EMPTY>
<!ATTLIST meta name CDATA #REQUIRED
content CDATA #REQUIRED
id CDATA #IMPLIED>
2 changes: 1 addition & 1 deletion readalongs/web_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
# Call get_langs() when the server loads to load the languages into memory
LANGS = get_langs()
# Get the DTD
DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.1.dtd")
DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.2.dtd")
with open(DTDPATH) as dtdfh:
DTD = etree.DTD(dtdfh)

Expand Down
2 changes: 1 addition & 1 deletion test/data/cs-ref.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.1">
<read-along version="1.2">
<text xml:lang="fra" fallback-langs="und">
<body>
<div type="page">
Expand Down
34 changes: 34 additions & 0 deletions test/data/ej-fra-annotated.readalong
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.2">
<meta name="generator" content="human made" id="meta0" />
<meta name="annotations-ids" content="translation1, translation2" id="meta1" />
<meta name="annotations-labels" content="Algonquin, English" id="meta2" />
<meta name="annotations-labels-eng" content="Algonquin, English" id="meta3" />
<meta name="annotations-labels-fra" content="algonquin, anglais" id="meta4" />
<text xml:lang="fra" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0" class="two-column-layout-page">
<graphic url="avatar.png" id="t0b0d0graphic0" />
<p id="t0b0d0p0">
<s id="t0b0d0p0s0"><w id="t0b0d0p0s0w0" time="0.455" dur="1.165">Bonjour</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s0an01" annotation-id="translation1" sentence-id="t0b0d0p0s0">
Kwei.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s0an02" annotation-id="translation2"
sentence-id="t0b0d0p0s0">
Hello.</s>
<s id="t0b0d0p0s1"><w id="t0b0d0p0s1w0" time="1.620" dur="0.070">Je</w> <w
id="t0b0d0p0s1w1" time="1.690" dur="0.070">m</w>'<w id="t0b0d0p0s1w2" time="1.760"
dur="0.240">appelle</w> <w id="t0b0d0p0s1w3" time="2.000" dur="1.705">Éric</w> <w
id="t0b0d0p0s1w4" time="3.705" dur="1.905">Joanis</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s1an01" annotation-id="translation1"
sentence-id="t0b0d0p0s1">Éric
Joanis nindijinikàz.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s1an02" annotation-id="translation2"
sentence-id="t0b0d0p0s1">My
name is Éric Joanis.</s>
</p>
</div>

</body>
</text>
</read-along>
34 changes: 34 additions & 0 deletions test/data/ras-dtd-1.2.readalong
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.2">
<meta name="generator" content="human made" id="meta0" />
<meta name="annotations-ids" content="translation1, translation2" id="meta1" />
<meta name="annotations-labels" content="Algonquin, English" id="meta2" />
<meta name="annotations-labels-eng" content="Algonquin, English" id="meta3" />
<meta name="annotations-labels-fra" content="algonquin, anglais" id="meta4" />
<text xml:lang="fra" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0" class="two-column-layout-page">
<graphic url="avatar.png" id="t0b0d0graphic0" />
<p id="t0b0d0p0">
<s id="t0b0d0p0s0"><w id="t0b0d0p0s0w0" time="0.455" dur="1.165">Bonjour</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s0an01" annotation-id="translation1" sentence-id="t0b0d0p0s0">
Kwei.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s0an02" annotation-id="translation2"
sentence-id="t0b0d0p0s0">
Hello.</s>
<s id="t0b0d0p0s1"><w id="t0b0d0p0s1w0" time="1.620" dur="0.070">Je</w> <w
id="t0b0d0p0s1w1" time="1.690" dur="0.070">m</w>'<w id="t0b0d0p0s1w2" time="1.760"
dur="0.240">appelle</w> <w id="t0b0d0p0s1w3" time="2.000" dur="1.705">Éric</w> <w
id="t0b0d0p0s1w4" time="3.705" dur="1.905">Joanis</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s1an01" annotation-id="translation1"
sentence-id="t0b0d0p0s1">Éric
Joanis nindijinikàz.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s1an02" annotation-id="translation2"
sentence-id="t0b0d0p0s1">My
name is Éric Joanis.</s>
</p>
</div>

</body>
</text>
</read-along>
9 changes: 7 additions & 2 deletions test/test_dtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
from readalongs.text.util import load_xml

DTDPATH = os.path.join(
dirname(__file__), "..", "readalongs", "static", "read-along-1.1.dtd"
dirname(__file__), "..", "readalongs", "static", "read-along-1.2.dtd"
)

VALID_RAS = """
ej-fra-anchors2.readalong
ej-fra-anchors.readalong
ej-fra-annotated.readalong
ej-fra-converted.readalong
ej-fra-dna.readalong
ej-fra-package.readalong
Expand Down Expand Up @@ -70,7 +71,11 @@ def test_invalid_inputs(self):

def test_backwards_compatibility(self):
# the DTD needs to be backwards compatible as long as the major version does not change
versions = ["ras-dtd-1.0.readalong", "ras-dtd-1.1.readalong"]
versions = [
"ras-dtd-1.0.readalong",
"ras-dtd-1.1.readalong",
"ras-dtd-1.2.readalong",
]
for name in versions:
path = os.path.join(dirname(__file__), "data", name.strip())
# DTD is text, XML is binary... okay
Expand Down

0 comments on commit d4b8a69

Please sign in to comment.