diff --git a/README.md b/README.md index 2592f53..1a92600 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,13 @@ To start a local DTS web service: ## Must * add a github action to generate static DTS from thornton corpus -* read metadata from TEI file when building the corpus tree to set title field * parametrise the DTS server * parametrise the SDTS generator * test the static generator with github ## Should +* set the name of the top collection from settings.json * default collection.json * move settings file under a collection.json file under the top collection folder * document! diff --git a/controllers/controller.js b/controllers/controller.js index 14496e8..ed8fa97 100644 --- a/controllers/controller.js +++ b/controllers/controller.js @@ -1,6 +1,7 @@ "use strict"; -// TODO: add text metadata from TEI +// TODO: add text metadata from TEI +// TODO: use SaxonJS instead of DOMParser & XPath const SaxonJS = require("saxon-js"); const fs = require("fs"); const DOMParser = require("@xmldom/xmldom").DOMParser; @@ -50,7 +51,7 @@ var controllers = { res.json(ret); }, collections: async (req, res) => { - let ret = corpus.getItemAndSubItems() + let ret = await corpus.getItemAndSubItems() /* TODO: "@context": { diff --git a/controllers/corpus.js b/controllers/corpus.js index 1e29707..124613d 100644 --- a/controllers/corpus.js +++ b/controllers/corpus.js @@ -1,5 +1,6 @@ -const xpath = require('xpath') -const dom = require('xmldom').DOMParser +const SaxonJS = require("saxon-js"); +// const xpath = require('xpath') +// const dom = require('xmldom').DOMParser const path = require("path"); const fs = require("fs"); const CorpusReader = require("./corpusReader") @@ -9,13 +10,11 @@ const CorpusReader = require("./corpusReader") /* TODO: -M move file caching from controller to here M add missing DTS fields in the getItem responses -M improve the IDs (in controller?) -M move file reading from controller to Corpus class -M create extension that can read corpus from Github folders S optimise for large collections on github (with caching, lazy loading & pagination) S document limitations (e.g. unique file name or @xml:id) +C move file caching from controller to here +C improve the IDs (in controller?) C support for sub-collections (using a .collection.json file under each collection folder) */ @@ -68,7 +67,6 @@ class Corpus { } saveTree() { - console.log(`Save tree ${this.getTreePath()}`) fs.writeFileSync( this.getTreePath(), JSON.stringify(this.tree, null, 2), @@ -88,28 +86,36 @@ class Corpus { return str.replace(/\W+/g, "-").replace(/(^-|-$)/g, "") } - getItemAndSubItems(id="ROOT") { + async getItemAndSubItems(id="ROOT") { return { "@context": "https://distributed-text-services.github.io/specifications/context/1.0.0draft-2.json", - ...this.getItem(id), - member: this.getSubItems(id) + ...await this.getItem(id), + member: await this.getSubItems(id) } } - getItem(id="ROOT") { - return this._cleanItem(this._getItem(id)) + async getItem(id="ROOT") { + return await this._cleanItem(this._getItem(id)) } _getItem(id="ROOT") { return CorpusReader.getTreeItem(this.tree, id) } - _cleanItem(item) { + async _cleanItem(item) { if (!item) return null + + if (!item.title) { + let meta = await this.getMetadataFromTEIFile(item["@id"]) + item.title = meta.title + this.saveTree() + } + let ret = {...item} ret.totalParents = item.tree.parent ? 1 : 0; ret.totalItems = item.tree.children || 0 ret.totalChildren = item.tree.children || 0 + delete ret.tree return ret } @@ -118,11 +124,11 @@ class Corpus { return this._getItem(id).tree.source } - getSubItems(id="ROOT") { + async getSubItems(id="ROOT") { let ret = [] for (let item of Object.values(this.tree)) { if (item.tree.parent === id) { - ret.push(this._cleanItem(item)) + ret.push(await this._cleanItem(item)) } } return ret @@ -132,60 +138,9 @@ class Corpus { // console.log(this.tree[id]) return await this.reader.readItemContent(this._getItem(id)?.tree?.source) } - - // resetTree() { - // this.tree = { - // "ROOT": { - // "@id": "ROOT", - // "@type": "Collection", - // "title": "ROOT Collection", - // "tree": { - // "source": this.getAbsoluteSource(), - // "parent": null, - // "updated": new Date() - // } - // } - // } - - // return this.tree - // } - - // async buildTree(collectionPath) { - // // TODO: handle collections & sub-collections - // if (typeof collectionPath === "undefined") { - // collectionPath = this.source - // this.resetTree() - // } - - // const directory = collectionPath; - - // for (let filename of fs.readdirSync(directory).sort()) { - // let filePath = this.getAbsoluteSource(); - // if (fs.lstatSync(filePath).isDirectory()) { - // this.buildTree(filePath); - // } else { - // if (filename.endsWith(".xml")) { - // let shortName = filename.replace(/\.[^.]*$/, ""); - // // let documentId = `${idCollection}/${handle}`; - // let docId = `${shortName}`; - // // let teiMeta = await getMetadataFromTEIFile(filePath); - // this.tree[docId] = { - // "@id": docId, - // "@type": "Resource", - // "title": docId, - // // title: teiMeta.title, - // "tree": { - // "source": `${filePath}`, - // "parent": "ROOT", - // } - // }; - // } - // } - // } - // } - async getMetadataFromTEIFile(filePath) { - let content = readFile(filePath); + async getMetadataFromTEIFile(documentId) { + let content = await this.readItemContent(documentId); // optimisation: we extract the TEI header (so less xml to parse) let m = content.match(/^.*<\/teiHeader>/s); content = `${m[0]}`; diff --git a/controllers/corpusReader.js b/controllers/corpusReader.js index 6ce3fcd..951415d 100644 --- a/controllers/corpusReader.js +++ b/controllers/corpusReader.js @@ -91,8 +91,8 @@ class CorpusReaderFileSystem extends CorpusReader { ret[docId] = { "@id": docId, "@type": "Resource", - "title": docId, - // title: teiMeta.title, + // "title": docId, + // title: teiMeta.title, "tree": { "source": `${path.resolve(filePath)}`, "parent": parent['@id'], @@ -163,7 +163,7 @@ class CorpusReaderGitHub extends CorpusReader { ret[itemId] = { "@id": itemId, "@type": "Resource", - "title": itemId, + // "title": itemId, tree: { source: item.path, parent: parent['@id'] diff --git a/tests/01-corpus.test.js b/tests/01-corpus.test.js index f5c8b38..0235af1 100644 --- a/tests/01-corpus.test.js +++ b/tests/01-corpus.test.js @@ -3,6 +3,7 @@ const expect = require('chai').expect const Corpus = require('../controllers/corpus') /* +npx mocha -b -w ./tests/01-corpus.test.js TODO: improve describe/it structure */ @@ -25,25 +26,25 @@ for (let source of sources) { }); it('return a root collection', async function() { - await corpus.buildAndSaveTree() - root = corpus.getItem() + await corpus.buildAndSaveTree(!source.startsWith('http')) + root = await corpus.getItem() assert.ok(root) - assert.ok(root.totalParents === 0) + assert.equal(root.totalParents, 0) }); - it('return two documents under the root collection', function() { - let items = corpus.getSubItems() + it('return two documents under the root collection', async function() { + let items = await corpus.getSubItems() assert.equal(items.length, 2) }); - it('return two documents under the root collection', function() { - let item = corpus.getItemAndSubItems() + it('return two documents under the root collection', async function() { + let item = await corpus.getItemAndSubItems() // console.log(item) assert.equal(item.member.length, 2) }); it('return the content of the first member under root collection', async function() { - let item = corpus.getItemAndSubItems() + let item = await corpus.getItemAndSubItems() let content = await corpus.readItemContent(item.member[0]["@id"]) assert.ok(content) assert.ok(content.length > 10)