From 4edd7c7a7737c00a4b0d1568a625eb81d5a4b4ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Fri, 24 Mar 2023 15:59:38 +0100 Subject: [PATCH 1/4] perf(ws-irc3sp): Split value into sentences And respect initials of species names. This accelerates the treatment for long texts. --- applications/ws-irc3sp/README.md | 4 -- applications/ws-irc3sp/public/v1/irc3sp.ini | 6 ++- applications/ws-irc3sp/public/v1/local.js | 45 +++++++++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 applications/ws-irc3sp/public/v1/local.js diff --git a/applications/ws-irc3sp/README.md b/applications/ws-irc3sp/README.md index 29b5e1c..511a648 100644 --- a/applications/ws-irc3sp/README.md +++ b/applications/ws-irc3sp/README.md @@ -8,10 +8,6 @@ See [original program](https://gitbucket.inist.fr/scodex/IRC3/tree/master/IRC3sp) (French description of IRC3sp). -> 💡 The treatment is much quicker when you send an array containing a tokenized -> text (sentence by sentence). -> The payload may be like `[{"id":1,"value":["sentence 1", "sentence 2"]}]`. - ## Test ```bash diff --git a/applications/ws-irc3sp/public/v1/irc3sp.ini b/applications/ws-irc3sp/public/v1/irc3sp.ini index fc9ef34..6cc2a6d 100644 --- a/applications/ws-irc3sp/public/v1/irc3sp.ini +++ b/applications/ws-irc3sp/public/v1/irc3sp.ini @@ -18,16 +18,18 @@ post.parameters.1.description = Indent or not the JSON Result plugin = @ezs/spawn # JSONParse plugin = @ezs/basics +# sentences +plugin = ./v1/local.js [JSONParse] legacy = false separator = $ +[sentences] + [expand] path = env('path', 'value') size = 100 -# A cache is not a good idea on long texts -# cacheName = irc3sp-post-v1-irc3sp [expand/exec] # command should be executable ! 
diff --git a/applications/ws-irc3sp/public/v1/local.js b/applications/ws-irc3sp/public/v1/local.js new file mode 100644 index 0000000..404af21 --- /dev/null +++ b/applications/ws-irc3sp/public/v1/local.js @@ -0,0 +1,45 @@ +const LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +const SENTENCE_ENDING = ".?!"; + +const sentences = (data, feed, ctx) => { + if (ctx.isLast()) { + return feed.close(); + } + + let value = data?.value; + if (Array.isArray(value)) { + if (value.length === 1) { + value = value[0]; + } + } + if (typeof value !== 'string') { + return feed.send({ ...data, value }); + } + + value = value.split("").reduce((a, c) => { + const currentSentence = a.slice(-1); + const [prev1, prev2] = a.slice(-1)[0].slice(-2); + if (SENTENCE_ENDING.includes(c)) { + if (c !== ".") { + return [...a.slice(0, -1), (currentSentence + c).trimStart(), " "]; + } + + if (prev1 !== " ") { + return [...a.slice(0, -1), (currentSentence + c).trimStart(), " "]; + } + + if (!LETTERS.includes(prev2)) { + return [...a.slice(0, -1), (currentSentence + c).trimStart(), " "]; + } + } + return [...a.slice(0, -1), currentSentence + c] + }, + [" "]) + .filter(sentence => sentence !== " ") + .map(s => s.trimStart()); + feed.send({ ...data, value }); +}; + +module.exports = { + sentences, +}; From 0c31b1cfbe1f52a1e6639bf6cb2ef782d6b2802c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Fri, 24 Mar 2023 16:11:14 +0100 Subject: [PATCH 2/4] refactor(ws-irc3sp): Factorize trimStart --- applications/ws-irc3sp/public/v1/local.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/ws-irc3sp/public/v1/local.js b/applications/ws-irc3sp/public/v1/local.js index 404af21..1ae9526 100644 --- a/applications/ws-irc3sp/public/v1/local.js +++ b/applications/ws-irc3sp/public/v1/local.js @@ -21,15 +21,15 @@ const sentences = (data, feed, ctx) => { const [prev1, prev2] = a.slice(-1)[0].slice(-2); if (SENTENCE_ENDING.includes(c)) { if (c !== ".") { - return 
[...a.slice(0, -1), (currentSentence + c).trimStart(), " "]; + return [...a.slice(0, -1), currentSentence + c, " "]; } if (prev1 !== " ") { - return [...a.slice(0, -1), (currentSentence + c).trimStart(), " "]; + return [...a.slice(0, -1), currentSentence + c, " "]; } if (!LETTERS.includes(prev2)) { - return [...a.slice(0, -1), (currentSentence + c).trimStart(), " "]; + return [...a.slice(0, -1), currentSentence + c, " "]; } } return [...a.slice(0, -1), currentSentence + c] From 8aa96f9eeddb2e7960f7ff8207c94906b331f2a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Fri, 24 Mar 2023 16:53:06 +0100 Subject: [PATCH 3/4] build(ws-irc3sp): Remove useless apk --- applications/ws-irc3sp/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/applications/ws-irc3sp/Dockerfile b/applications/ws-irc3sp/Dockerfile index 5e39005..5b692d3 100644 --- a/applications/ws-irc3sp/Dockerfile +++ b/applications/ws-irc3sp/Dockerfile @@ -12,7 +12,6 @@ RUN apk add --update-cache --no-cache \ openssl-dev \ perl \ perl-json \ - ca-certificates \ && \ gunzip public/v1/CoL.txt.gz && \ mv package-app.json package.json && \ From 162afe07762eab862c3d4f1f3d741c3afa214bab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Fri, 24 Mar 2023 16:53:34 +0100 Subject: [PATCH 4/4] release ws-irc3sp@1.1.0 --- applications/ws-irc3sp/README.md | 2 +- applications/ws-irc3sp/package.json | 2 +- applications/ws-irc3sp/public/swagger.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/ws-irc3sp/README.md b/applications/ws-irc3sp/README.md index 511a648..0a3d359 100644 --- a/applications/ws-irc3sp/README.md +++ b/applications/ws-irc3sp/README.md @@ -1,4 +1,4 @@ -# ws-irc3sp@1.0.0 +# ws-irc3sp@1.1.0 IRC3: Indexation par Recherche et Comparaison de Chaînes de Caractères diff --git a/applications/ws-irc3sp/package.json b/applications/ws-irc3sp/package.json index 3bafc2d..0c666f9 100644 --- 
a/applications/ws-irc3sp/package.json +++ b/applications/ws-irc3sp/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "ws-irc3sp", - "version": "1.0.0", + "version": "1.1.0", "description": "Lodex workers for ws-irc3sp", "repository": { "type": "git", diff --git a/applications/ws-irc3sp/public/swagger.json b/applications/ws-irc3sp/public/swagger.json index 7ab0638..f4a91a0 100644 --- a/applications/ws-irc3sp/public/swagger.json +++ b/applications/ws-irc3sp/public/swagger.json @@ -1,5 +1,5 @@ { "info": { - "version": "1.0.0" + "version": "1.1.0" } } \ No newline at end of file