Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
geoffroy-noel-ddh committed Nov 17, 2023
2 parents 857bab1 + 7af14f0 commit fc5d505
Show file tree
Hide file tree
Showing 21 changed files with 1,990 additions and 80 deletions.
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ build
npm-debug.log
.nyc
.env
.unotes
.DS_Store
tmp
app/tests
app/tests
/.vscode
tools/l
tools/ISicily
tools/ISicily2
18 changes: 18 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Launch Program",
"skipFiles": [
"<node_internals>/**"
],
"program": "${workspaceFolder}/tools/testWords.js",
"cwd": "${workspaceFolder}/tools"
}
]
}
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
KDL codebase for the CROSSREADS research project

Main site: https://crossreads.web.ox.ac.uk/

Annotator: https://kingsdigitallab.github.io/crossreads/annotator.html

Main site: https://crossreads.web.ox.ac.uk/

# Content

## Annotator
Expand Down Expand Up @@ -50,3 +50,12 @@ npm ci
npm start
```

## Testing

Test that the encoding of the TEI corpus allows word and sign segmentation.

```bash
cd tools
npm ci
npm run test:words
```
4 changes: 4 additions & 0 deletions app/annotator.html
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,11 @@
// })
</script>

<!-- https://www.saxonica.com/download/javascript.xml -->
<script src="assets/SaxonJS2.rt.js"></script>
<script src="utils.js"></script>
<script src="xml-utils.js"></script>
<script src="crossreads-xml.js"></script>
<script type="module">
// TODO: UPGRADE
// TODO: relative import
Expand Down
1,122 changes: 1,122 additions & 0 deletions app/assets/SaxonJS2.rt.js

Large diffs are not rendered by default.

37 changes: 27 additions & 10 deletions app/assets/annotator.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const DEFINITIONS_PATH = 'app/data/pal/definitions-digipal.json'
const DTS_COLLECTION_PATH = './data/2023-08/collection.json'
const OPENSEADRAGON_IMAGE_URL_PREFIX = './node_modules/openseadragon/build/openseadragon/images/'
const TEI_TO_HTML_XSLT_PATH = './data/tei2html.xslt'
const HTML_TO_HTML_XSLT_PATH = './data/html2html.xslt'
const DTS_ROOT = 'https://crossreads.web.ox.ac.uk'
// -1: never; 10000: check every 10 secs
const AUTO_SAVE_EVERY_MILLISEC = 10000
Expand Down Expand Up @@ -548,24 +549,25 @@ createApp({
const uri = this.objectDtsURL
fetch(uri)
.then(res => res.text())
.then(res => this.getDOMFromTEIString(res))
.then(xml => {
this.setImagesFromObjectXML(xml)
this.setTextFromObjectXML(xml)
// .then(res => this.getDOMFromTEIString(res))
.then(async (xmlString) => {
await this.setImagesFromXMLString(xmlString)
this.setTextFromXMLString(xmlString)
})
}
},
getDOMFromTEIString(str) {
str = str.normalize("NFD").replace(/\p{Diacritic}/gu, "")
return new window.DOMParser().parseFromString(str, 'text/xml')
},
setImagesFromObjectXML(xml) {
async setImagesFromXMLString(xmlString) {
// get all the tei:graphic -> image locations
this.images = {}
let it = xml.evaluate('//tei:graphic', xml, this.getURIFromXMLPrefix, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null)
while (true) {
let node = it.iterateNext()
if (!node) break
// let it = xml.evaluate('//tei:graphic', xml, this.getURIFromXMLPrefix, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null)
let xmlObject = await xmlUtils.fromString(xmlString)
for (let node of xmlUtils.xpath(xmlObject, '//tei:graphic')) {
// let node = it.iterateNext()
// if (!node) break
// TODO: less assumption about encoding, make it more robust
let uri = node.attributes['url'].value
this.images[uri] = {
Expand All @@ -583,7 +585,21 @@ createApp({
// this.onSelectImage(img)
this.image = img
},
setTextFromObjectXML(xml) {
setTextFromXMLString(xmlString) {
let res = crossreadsXML.getHtmlFromTei(xmlString)
this.text = xmlUtils.toString(res)
console.log(this.text)
// attach events to each sign
Vue.nextTick(() => {
for (let sign of document.querySelectorAll('.sign')) {
sign.addEventListener('click', (e) => this.onClickSign(sign));
// sign.addEventListener('mouseenter', (e) => this.onMouseEnterSign(sign));
// sign.addEventListener('mouseleave', (e) => this.onMouseLeaveSign(sign));
}
this.updateSignHighlights()
})
},
setTextFromObjectXML2(xml) {
// xml (TEI) -> this.text (XHTML)
fetch(TEI_TO_HTML_XSLT_PATH)
.then(res => res.text())
Expand Down Expand Up @@ -612,6 +628,7 @@ createApp({
}
// to string
this.text = new XMLSerializer().serializeToString(doc)
console.log(this.text)
// attach events to each sign
Vue.nextTick(() => {
for (let sign of document.querySelectorAll('.sign')) {
Expand Down
2 changes: 1 addition & 1 deletion app/assets/main.css
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ The name of the element is inserted into the class attribute:
line-height: 3em;
}

span[data-tei-id] {
.is-word {
margin-right: 1em;
}

Expand Down
45 changes: 45 additions & 0 deletions app/crossreads-xml.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
(function (exports) {

  // true if this code is running in the browser
  const isBrowser = (typeof window !== "undefined");
  // Browser: xml-utils.js is expected to be loaded before this script and
  // to be reachable as window.xmlUtils. Node: load the sibling module.
  const xmlUtils = isBrowser ? window.xmlUtils : require("./xml-utils");

  // Location of the stylesheets: relative to the page in the browser,
  // relative to a sibling folder of app/ otherwise.
  const XSLT_DIR = isBrowser ? 'data' : '../app/data'
  const TEI2HTML_XSLT = `${XSLT_DIR}/tei2html.xslt`
  const HTML2HTML_XSLT = `${XSLT_DIR}/html2html.xslt`

  // Combining Diacritical Marks block only.
  // Do NOT use \p{Diacritic} instead: it also matches non-combining
  // characters, such as punctuation like the middle dot (&#183;) found in
  // <g ref="#interpunct">·</g>, and would remove them.
  const COMBINING_MARKS = /[\u0300-\u036f]/gu

  // <lb ... break="no" ...> with any surrounding white space.
  // Both quoting styles are accepted as they are equivalent in XML.
  const SPACES_AROUND_LB_BREAK_NO = /\s*(<lb[^>]+break=(?:"no"|'no')[^>]*>)\s*/g

  /**
   * Transform a TEI document into the HTML shown by the text viewer.
   *
   * Steps:
   * 1. strip combining diacritics;
   * 2. remove white space around <lb break="no">;
   * 3. apply tei2html.xslt;
   * 4. apply html2html.xslt to the result of step 3.
   *
   * @param {string} xmlString - serialised TEI document
   * @returns {*} whatever the second xmlUtils.xslt() call returns
   *   (callers serialise it with xmlUtils.toString)
   */
  exports.getHtmlFromTei = function(xmlString) {
    // Remove diacritics, because:
    // a) the XSLT template that marks up each sign splits combined marks / modifiers
    // b) partners requested they are hidden in the annotator text viewer
    //    (because they are editorially supplied)
    // c) it is more complex to map them to characters in the palaeographic definitions
    // Example, see 1408 and https://github.com/kingsdigitallab/crossreads/issues/37
    // https://raw.githubusercontent.com/ISicily/ISicily/master/inscriptions/ISic001408.xml
    // έο̄ς
    //
    // NFD first, so that precomposed characters expose their marks as
    // separate code points.
    let teiString = xmlString.normalize("NFD").replace(COMBINING_MARKS, "")

    // Remove spaces around <lb break="no">
    // TODO: try to do it with XSLT? (too fiddly)
    teiString = teiString.replace(SPACES_AROUND_LB_BREAK_NO, '$1')

    // NOTE(review): the meaning of the third argument (true) is defined in
    // xml-utils, not visible from here.
    const html = xmlUtils.xslt(teiString, TEI2HTML_XSLT, true)

    // assign the @data-idx sequentially relative to each .is-word
    return xmlUtils.xslt(html, HTML2HTML_XSLT, true)
  }

})(typeof exports === "undefined" ? (this["crossreadsXML"] = {}) : exports);
1 change: 1 addition & 0 deletions app/data/html2html.sef.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"N":"package","version":"10","packageVersion":"1","saxonVersion":"SaxonJS 2.5","target":"JS","targetVersion":"2","name":"TOP-LEVEL","relocatable":"false","buildDateTime":"2023-11-12T01:09:29.899Z","ns":"xml=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 =http://www.w3.org/1999/xhtml","C":[{"N":"co","id":"0","binds":"0","C":[{"N":"mode","onNo":"TC","flags":"","patternSlots":"0","prec":"","C":[{"N":"templateRule","rank":"0","prec":"0","seq":"1","ns":"xml=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 =http://www.w3.org/1999/xhtml","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/crossreads/app/data/html2html.xslt","line":"16","module":"html2html.xslt","expand-text":"false","match":"@data-idx","prio":"0","matches":"NA nQ{}data-idx","C":[{"N":"p.nodeTest","role":"match","test":"NA nQ{}data-idx","sType":"1NA nQ{}data-idx","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 "},{"N":"let","var":"Q{}n","slot":"0","sType":"*NA ","line":"17","role":"action","C":[{"N":"doc","sType":"1ND ","base":"file:///home/jeff/src/prj/crossreads/app/data/html2html.xslt","role":"select","C":[{"N":"valueOf","flags":"l","sType":"1NT ","C":[{"N":"numSeqFmt","flags":"1","C":[{"N":"nodeNum","role":"value","level":"any","needsNode":"true","C":[{"N":"dot","sType":"1NA nQ{}data-idx","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 ex=~ ","role":"select","line":"3"},{"N":"p.withUpper","role":"count","axis":"parent","sType":"1NE u[NE nQ{}span,NE nQ{http://www.w3.org/1999/xhtml}span]","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 ex=~ ","C":[{"N":"p.withPredicate","C":[{"N":"p.nodeTest","test":"NE u[NE nQ{}span,NE nQ{http://www.w3.org/1999/xhtml}span]"},{"N":"axis","name":"attribute","nodeTest":"*NA 
nQ{}data-idx"}]},{"N":"p.withUpper","axis":"ancestor-or-self","C":[{"N":"p.nodeTest","test":"N"},{"N":"p.withPredicate","C":[{"N":"p.nodeTest","test":"NE"},{"N":"fn","name":"contains","C":[{"N":"cvUntyped","to":"AS","diag":"0|0||contains","C":[{"N":"check","card":"?","diag":"0|0||contains","C":[{"N":"attVal","name":"Q{}class"}]}]},{"N":"str","val":"is-word"},{"N":"str","val":"http://www.w3.org/2005/xpath-functions/collation/codepoint"}]}]}]}]},{"N":"p.withPredicate","role":"from","sType":"1NE","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 ex=~ ","C":[{"N":"p.nodeTest","test":"NE"},{"N":"fn","name":"contains","C":[{"N":"cvUntyped","to":"AS","diag":"0|0||contains","C":[{"N":"check","card":"?","diag":"0|0||contains","C":[{"N":"attVal","name":"Q{}class"}]}]},{"N":"str","val":"is-word"},{"N":"str","val":"http://www.w3.org/2005/xpath-functions/collation/codepoint"}]}]}]},{"N":"str","sType":"1AS ","val":"1","role":"format"},{"N":"str","sType":"1AS ","val":"1","role":"startAt"}]}]}]},{"N":"att","name":"data-idx","sType":"1NA ","line":"18","C":[{"N":"fn","name":"string-join","role":"select","C":[{"N":"first","C":[{"N":"forEach","sType":"*AS ","C":[{"N":"data","sType":"*A ","C":[{"N":"mergeAdj","C":[{"N":"valueOf","sType":"1NT ","flags":"l","C":[{"N":"fn","name":"string-join","role":"select","C":[{"N":"first","C":[{"N":"forEach","sType":"*AS ","C":[{"N":"data","sType":"*A ","C":[{"N":"mergeAdj","C":[{"N":"arith10","sType":"?AO","op":"-","calc":"d-d","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 ","role":"select","line":"5","C":[{"N":"atomSing","diag":"1|0||arith","card":"?","C":[{"N":"first","C":[{"N":"varRef","name":"Q{}n","slot":"0"}]}]},{"N":"int","val":"1"}]}]}]},{"N":"fn","name":"string","sType":"1AS ","C":[{"N":"dot"}]}]}]},{"N":"str","sType":"1AS ","val":" "}]}]}]}]},{"N":"fn","name":"string","sType":"1AS ","C":[{"N":"dot"}]}]}]},{"N":"str","sType":"1AS ","val":""}]}]}]}]},{"N":"templateRule","rank":"1","prec":"0","seq":"0","ns":"xml=~ xsl=~ 
tei=http://www.tei-c.org/ns/1.0 =http://www.w3.org/1999/xhtml","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/crossreads/app/data/html2html.xslt","line":"10","module":"html2html.xslt","expand-text":"false","match":"node()|@*","prio":"-0.5","matches":"N u[NT,NP,NC,NE]","C":[{"N":"p.nodeTest","role":"match","test":"N u[NT,NP,NC,NE]","sType":"1N u[NT,NP,NC,NE]"},{"N":"copy","sType":"1N u[1NT ,1NP ,1NC ,1NE ] ","flags":"cin","role":"action","line":"11","C":[{"N":"applyT","sType":"* ","line":"12","mode":"#unnamed","bSlot":"0","C":[{"N":"docOrder","sType":"*N u[N u[N u[N u[NT,NP],NC],NE],NA]","role":"select","line":"12","C":[{"N":"union","op":"|","sType":"*N u[N u[N u[N u[NT,NP],NC],NE],NA]","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 ","C":[{"N":"axis","name":"child","nodeTest":"*N u[NT,NP,NC,NE]"},{"N":"axis","name":"attribute","nodeTest":"*NA"}]}]}]}]}]},{"N":"templateRule","rank":"2","prec":"0","seq":"0","ns":"xml=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 =http://www.w3.org/1999/xhtml","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/crossreads/app/data/html2html.xslt","line":"10","module":"html2html.xslt","expand-text":"false","match":"node()|@*","prio":"-0.5","matches":"NA","C":[{"N":"p.nodeTest","role":"match","test":"NA","sType":"1NA"},{"N":"copy","sType":"1NA ","flags":"cin","role":"action","line":"11","C":[{"N":"applyT","sType":"* ","line":"12","mode":"#unnamed","bSlot":"0","C":[{"N":"docOrder","sType":"*N u[N u[N u[N u[NT,NP],NC],NE],NA]","role":"select","line":"12","C":[{"N":"union","op":"|","sType":"*N u[N u[N u[N u[NT,NP],NC],NE],NA]","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 ","C":[{"N":"axis","name":"child","nodeTest":"*N 
u[NT,NP,NC,NE]"},{"N":"axis","name":"attribute","nodeTest":"*NA"}]}]}]}]}]}]}]},{"N":"overridden"},{"N":"output","C":[{"N":"property","name":"Q{http://saxon.sf.net/}stylesheet-version","value":"10"},{"N":"property","name":"method","value":"html"},{"N":"property","name":"encoding","value":"utf-8"},{"N":"property","name":"indent","value":"yes"}]},{"N":"decimalFormat"}],"Σ":"bbe774a0"}
21 changes: 21 additions & 0 deletions app/data/html2html.xslt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?xml version="1.0" ?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:tei="http://www.tei-c.org/ns/1.0"
xmlns="http://www.w3.org/1999/xhtml"
>
<!--
Post-processing pass: copies its input unchanged, except that the value of
every data-idx attribute is recomputed (see the last template).
-->
<!-- saxon-js doesn't like 'version="1.0"' (presumably as an attribute of xsl:output, which has none here; TODO confirm) -->
<xsl:output method="html" encoding="utf-8" indent="yes"/>

<!-- Identity template: copy every node and attribute, recursing into children and attributes -->
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>

<!--
Renumber data-idx.
$n is produced by xsl:number, which counts span[@data-idx] elements that are
descendants of an element whose class attribute contains the substring
'is-word'; the count restarts from each such element (from=...).
The attribute is written as $n - 1, i.e. numbering starts at 0.
-->
<xsl:template match="@data-idx">
<xsl:variable name="n"><xsl:number level="any" from="*[contains(@class, 'is-word')]" count="*[contains(@class, 'is-word')]//span[@data-idx]"/></xsl:variable>
<xsl:attribute name="data-idx"><xsl:value-of select="$n - 1"/></xsl:attribute>
</xsl:template>

</xsl:stylesheet>
1 change: 1 addition & 0 deletions app/data/tei2html.sef.json

Large diffs are not rendered by default.

82 changes: 43 additions & 39 deletions app/data/tei2html.xslt
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:tei="http://www.tei-c.org/ns/1.0"
xmlns="http://www.w3.org/1999/xhtml"
>
<xsl:output method="html" version="1.0" encoding="utf-8" indent="yes"/>

<!-- IdentityTransform -->

<!-- saxon-js doesn't like 'version="1.0"' -->
<xsl:output method="html" encoding="utf-8" indent="yes"/>

<xsl:template match="/">
<xsl:apply-templates select="//tei:text/tei:body/tei:div[@type='edition']"/>
<xsl:apply-templates select="//tei:text/tei:body/tei:div[@type='edition'][not(@subtype='transliteration')]"/>
</xsl:template>

<xsl:template match="comment()">
Expand All @@ -19,69 +19,73 @@
</xsl:template>

<xsl:template match="tei:p|tei:div">
<xsl:copy>
<xsl:call-template name="lossless-attributes"/>
<xsl:apply-templates />
</xsl:copy>
<xsl:element name="{local-name()}">
<xsl:call-template name="lossless-attributes"/>
<xsl:apply-templates />
</xsl:element>
</xsl:template>

<xsl:template match="tei:lb">
<br>
<xsl:call-template name="lossless-attributes"/>
</br>
<span class="line-number">0</span>
<span class="line-number"><xsl:number level="any" count="tei:lb"/></span>
</xsl:template>

<!-- <xsl:template match="tei:ex">
</xsl:template> -->

<xsl:template name="lossless-span">
<span>
<xsl:call-template name="lossless-attributes"/>
<xsl:apply-templates />
</span>
<span>
<xsl:call-template name="lossless-attributes"/>
<xsl:apply-templates />
</span>
</xsl:template>

<xsl:template name="lossless-div">
<div>
<xsl:call-template name="lossless-attributes"/>
<xsl:apply-templates />
</div>
<div>
<xsl:call-template name="lossless-attributes"/>
<xsl:apply-templates />
</div>
</xsl:template>

<xsl:template name="lossless-attributes">
<xsl:attribute name="class">
<xsl:value-of select="concat('tei-', local-name())"/>
<xsl:if test="@type"> tei-type-<xsl:value-of select="@type"/></xsl:if>
</xsl:attribute>
<xsl:attribute name="data-tei"><xsl:value-of select="local-name()" /></xsl:attribute>
<xsl:apply-templates select="@*" mode="data-tei" />
</xsl:template>
<xsl:attribute name="class">
<xsl:value-of select="concat('tei-', local-name())"/>
<xsl:if test="@type"> tei-type-<xsl:value-of select="@type"/></xsl:if>
<xsl:if test="local-name() = 'w' or local-name() = 'name' or local-name() = 'g' or local-name() = 'placeName' or local-name() = 'num' or local-name() = 'orgName' or local-name()='orig'"> is-word</xsl:if>
</xsl:attribute>
<xsl:attribute name="data-tei"><xsl:value-of select="local-name()" /></xsl:attribute>
<!-- (tei:w|tei:name|tei:num) -->
<xsl:apply-templates select="@*" mode="data-tei" />
</xsl:template>

<xsl:template match="@*" mode="data-tei">
<xsl:attribute name="{concat('data-tei-', local-name())}"><xsl:value-of select="." /></xsl:attribute>
<xsl:attribute name="{concat('data-tei-', local-name())}"><xsl:value-of select="." /></xsl:attribute>
</xsl:template>

<xsl:template match="text()">
<xsl:template match="tei:w//text()|tei:name//text()|tei:g//text()|tei:placeName//text()|tei:num//text()|tei:orgName//text()|tei:orig//text()">
<xsl:call-template name="mark-up-every-character">
<xsl:with-param name="text" select="."/>
</xsl:call-template>
</xsl:template>

<xsl:template name="mark-up-every-character">
<xsl:param name="text"/>
<xsl:choose>
<xsl:when test="normalize-space(substring($text, 1, 1)) = ''"><xsl:text> </xsl:text></xsl:when>
<xsl:otherwise>
<span class="sign" data-idx="0"><xsl:value-of select="substring($text, 1, 1)"/></span>
</xsl:otherwise>
</xsl:choose>
<xsl:param name="text"/>
<xsl:choose>
<xsl:when test="normalize-space(substring($text, 1, 1)) = ''">
<span><xsl:text>&#160;</xsl:text></span>
</xsl:when>
<xsl:otherwise>
<span class="sign" data-idx="0"><xsl:value-of select="substring($text, 1, 1)"/></span>
</xsl:otherwise>
</xsl:choose>

<xsl:if test="string-length($text) > 1">
<xsl:call-template name="mark-up-every-character">
<xsl:with-param name="text" select="substring($text, 2, string-length($text) - 1)"/>
</xsl:call-template>
</xsl:if>
<xsl:if test="string-length($text) > 1">
<xsl:call-template name="mark-up-every-character">
<xsl:with-param name="text" select="substring($text, 2, string-length($text) - 1)"/>
</xsl:call-template>
</xsl:if>
</xsl:template>

</xsl:stylesheet>
Loading

0 comments on commit fc5d505

Please sign in to comment.