Skip to content

Commit

Permalink
read metadata from TEI files when building the corpus tree.
Browse files Browse the repository at this point in the history
  • Loading branch information
geoffroy-noel-ddh committed Jan 21, 2023
1 parent ef41997 commit b08a965
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 82 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ To start a local DTS web service:
## Must

* add a GitHub Action to generate static DTS from the thornton corpus
* read metadata from TEI file when building the corpus tree to set title field
* parametrise the DTS server
* parametrise the SDTS generator
* test the static generator with github

## Should

* set the name of the top collection from settings.json
* default collection.json
* move the settings file into a collection.json file under the top collection folder
* document!
Expand Down
5 changes: 3 additions & 2 deletions controllers/controller.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"use strict";
// TODO: add text metadata from TEI

// TODO: add text metadata from TEI
// TODO: use SaxonJS instead of DOMParser & XPath
const SaxonJS = require("saxon-js");
const fs = require("fs");
const DOMParser = require("@xmldom/xmldom").DOMParser;
Expand Down Expand Up @@ -50,7 +51,7 @@ var controllers = {
res.json(ret);
},
collections: async (req, res) => {
let ret = corpus.getItemAndSubItems()
let ret = await corpus.getItemAndSubItems()
/*
TODO:
"@context": {
Expand Down
91 changes: 23 additions & 68 deletions controllers/corpus.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const xpath = require('xpath')
const dom = require('xmldom').DOMParser
const SaxonJS = require("saxon-js");
// const xpath = require('xpath')
// const dom = require('xmldom').DOMParser
const path = require("path");
const fs = require("fs");
const CorpusReader = require("./corpusReader")
Expand All @@ -9,13 +10,11 @@ const CorpusReader = require("./corpusReader")

/*
TODO:
M move file caching from controller to here
M add missing DTS fields in the getItem responses
M improve the IDs (in controller?)
M move file reading from controller to Corpus class
M create extension that can read corpus from Github folders
S optimise for large collections on github (with caching, lazy loading & pagination)
S document limitations (e.g. unique file name or @xml:id)
C move file caching from controller to here
C improve the IDs (in controller?)
C support for sub-collections (using a .collection.json file under each collection folder)
*/

Expand Down Expand Up @@ -68,7 +67,6 @@ class Corpus {
}

saveTree() {
console.log(`Save tree ${this.getTreePath()}`)
fs.writeFileSync(
this.getTreePath(),
JSON.stringify(this.tree, null, 2),
Expand All @@ -88,28 +86,36 @@ class Corpus {
return str.replace(/\W+/g, "-").replace(/(^-|-$)/g, "")
}

getItemAndSubItems(id="ROOT") {
async getItemAndSubItems(id="ROOT") {
return {
"@context": "https://distributed-text-services.github.io/specifications/context/1.0.0draft-2.json",
...this.getItem(id),
member: this.getSubItems(id)
...await this.getItem(id),
member: await this.getSubItems(id)
}
}

getItem(id="ROOT") {
return this._cleanItem(this._getItem(id))
async getItem(id="ROOT") {
return await this._cleanItem(this._getItem(id))
}

_getItem(id="ROOT") {
return CorpusReader.getTreeItem(this.tree, id)
}

_cleanItem(item) {
async _cleanItem(item) {
if (!item) return null

if (!item.title) {
let meta = await this.getMetadataFromTEIFile(item["@id"])
item.title = meta.title
this.saveTree()
}

let ret = {...item}
ret.totalParents = item.tree.parent ? 1 : 0;
ret.totalItems = item.tree.children || 0
ret.totalChildren = item.tree.children || 0

delete ret.tree
return ret
}
Expand All @@ -118,11 +124,11 @@ class Corpus {
return this._getItem(id).tree.source
}

getSubItems(id="ROOT") {
async getSubItems(id="ROOT") {
let ret = []
for (let item of Object.values(this.tree)) {
if (item.tree.parent === id) {
ret.push(this._cleanItem(item))
ret.push(await this._cleanItem(item))
}
}
return ret
Expand All @@ -132,60 +138,9 @@ class Corpus {
// console.log(this.tree[id])
return await this.reader.readItemContent(this._getItem(id)?.tree?.source)
}

// resetTree() {
// this.tree = {
// "ROOT": {
// "@id": "ROOT",
// "@type": "Collection",
// "title": "ROOT Collection",
// "tree": {
// "source": this.getAbsoluteSource(),
// "parent": null,
// "updated": new Date()
// }
// }
// }

// return this.tree
// }

// async buildTree(collectionPath) {
// // TODO: handle collections & sub-collections
// if (typeof collectionPath === "undefined") {
// collectionPath = this.source
// this.resetTree()
// }

// const directory = collectionPath;

// for (let filename of fs.readdirSync(directory).sort()) {
// let filePath = this.getAbsoluteSource();
// if (fs.lstatSync(filePath).isDirectory()) {
// this.buildTree(filePath);
// } else {
// if (filename.endsWith(".xml")) {
// let shortName = filename.replace(/\.[^.]*$/, "");
// // let documentId = `${idCollection}/${handle}`;
// let docId = `${shortName}`;
// // let teiMeta = await getMetadataFromTEIFile(filePath);
// this.tree[docId] = {
// "@id": docId,
// "@type": "Resource",
// "title": docId,
// // title: teiMeta.title,
// "tree": {
// "source": `${filePath}`,
// "parent": "ROOT",
// }
// };
// }
// }
// }
// }

async getMetadataFromTEIFile(filePath) {
let content = readFile(filePath);
async getMetadataFromTEIFile(documentId) {
let content = await this.readItemContent(documentId);
// optimisation: we extract the TEI header (so less xml to parse)
let m = content.match(/^.*<\/teiHeader>/s);
content = `${m[0]}</TEI>`;
Expand Down
6 changes: 3 additions & 3 deletions controllers/corpusReader.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ class CorpusReaderFileSystem extends CorpusReader {
ret[docId] = {
"@id": docId,
"@type": "Resource",
"title": docId,
// title: teiMeta.title,
// "title": docId,
// title: teiMeta.title,
"tree": {
"source": `${path.resolve(filePath)}`,
"parent": parent['@id'],
Expand Down Expand Up @@ -163,7 +163,7 @@ class CorpusReaderGitHub extends CorpusReader {
ret[itemId] = {
"@id": itemId,
"@type": "Resource",
"title": itemId,
// "title": itemId,
tree: {
source: item.path,
parent: parent['@id']
Expand Down
17 changes: 9 additions & 8 deletions tests/01-corpus.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ const expect = require('chai').expect
const Corpus = require('../controllers/corpus')

/*
npx mocha -b -w ./tests/01-corpus.test.js
TODO:
improve describe/it structure
*/
Expand All @@ -25,25 +26,25 @@ for (let source of sources) {
});

it('return a root collection', async function() {
await corpus.buildAndSaveTree()
root = corpus.getItem()
await corpus.buildAndSaveTree(!source.startsWith('http'))
root = await corpus.getItem()
assert.ok(root)
assert.ok(root.totalParents === 0)
assert.equal(root.totalParents, 0)
});

it('return two documents under the root collection', function() {
let items = corpus.getSubItems()
it('return two documents under the root collection', async function() {
let items = await corpus.getSubItems()
assert.equal(items.length, 2)
});

it('return two documents under the root collection', function() {
let item = corpus.getItemAndSubItems()
it('return two documents under the root collection', async function() {
let item = await corpus.getItemAndSubItems()
// console.log(item)
assert.equal(item.member.length, 2)
});

it('return the content of the first member under root collection', async function() {
let item = corpus.getItemAndSubItems()
let item = await corpus.getItemAndSubItems()
let content = await corpus.readItemContent(item.member[0]["@id"])
assert.ok(content)
assert.ok(content.length > 10)
Expand Down

0 comments on commit b08a965

Please sign in to comment.