Inist-CNRS · parmentf · Mar 24, 2023 · Mar 24, 2023 · Mar 24, 2023 · Mar 24, 2023
diff --git a/applications/ws-irc3sp/Dockerfile b/applications/ws-irc3sp/Dockerfile
@@ -12,7 +12,6 @@ RUN apk add --update-cache --no-cache \
 	openssl-dev \
 	perl \
 	perl-json \
-	ca-certificates \
 	&& \
 	gunzip public/v1/CoL.txt.gz && \
 	mv package-app.json package.json && \

diff --git a/applications/ws-irc3sp/README.md b/applications/ws-irc3sp/README.md
@@ -1,4 +1,4 @@
-# ws-irc3sp@1.0.0
+# ws-irc3sp@1.1.0
 
 IRC3: Indexation par Recherche et Comparaison de Chaînes de Caractères
 
@@ -8,10 +8,6 @@ See [original
 program](https://gitbucket.inist.fr/scodex/IRC3/tree/master/IRC3sp) (French
 description of IRC3sp).
 
-> 💡 The treatment is much quicker when you send an array containing a tokenized
-> text (sentence by sentence).  
-> The payload may be like `[{"id":1,"value":["sentence 1", "sentence 2"]}]`.
-
 ## Test
 
 ```bash

diff --git a/applications/ws-irc3sp/package.json b/applications/ws-irc3sp/package.json
@@ -1,7 +1,7 @@
 {
 	"private": true,
 	"name": "ws-irc3sp",
-	"version": "1.0.0",
+	"version": "1.1.0",
 	"description": "Lodex workers for ws-irc3sp",
 	"repository": {
 		"type": "git",

diff --git a/applications/ws-irc3sp/public/swagger.json b/applications/ws-irc3sp/public/swagger.json
@@ -1,5 +1,5 @@
 {
     "info": {
-        "version": "1.0.0"
+        "version": "1.1.0"
     }
 }
diff --git a/applications/ws-irc3sp/public/v1/irc3sp.ini b/applications/ws-irc3sp/public/v1/irc3sp.ini
@@ -18,16 +18,18 @@ post.parameters.1.description = Indent or not the JSON Result
 plugin = @ezs/spawn
 # JSONParse
 plugin = @ezs/basics
+# sentences
+plugin = ./v1/local.js
 
 [JSONParse]
 legacy = false
 separator = $
 
+[sentences]
+
 [expand]
 path = env('path', 'value')
 size = 100
-# A cache is not a good idea on long texts
-# cacheName = irc3sp-post-v1-irc3sp
 
 [expand/exec]
 # command should be executable !

diff --git a/applications/ws-irc3sp/public/v1/local.js b/applications/ws-irc3sp/public/v1/local.js
@@ -0,0 +1,82 @@
+/** Uppercase letters that may stand for an initial, as the "J." of "J. Smith". */
+const LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+/** Characters that may terminate a sentence. */
+const SENTENCE_ENDING = ".?!";
+/**
+ * Padding every sentence buffer starts with, so that the two characters
+ * preceding a terminator always exist, even at the start of a sentence.
+ */
+const SENTENCE_PREFIX = "  ";
+
+/**
+ * Tell whether the buffer ends with an initial: a single uppercase letter
+ * preceded by a space (" J"). A "." following it does not end the sentence.
+ *
+ * @param {string} buffer Current sentence, SENTENCE_PREFIX included.
+ * @returns {boolean} true when the last two characters look like an initial.
+ */
+const endsWithInitial = (buffer) => {
+    const beforeLast = buffer[buffer.length - 2];
+    const last = buffer[buffer.length - 1];
+    return beforeLast === " " && LETTERS.includes(last);
+};
+
+/**
+ * Split a text into sentences.
+ *
+ * "?" and "!" always end a sentence; "." ends it unless it follows an initial.
+ * Leading whitespace is removed from each sentence, and fragments made only of
+ * whitespace (e.g. a newline after the last full stop) are dropped.
+ *
+ * @param {string} text Text to split.
+ * @returns {string[]} Sentences, in order of appearance.
+ */
+const splitSentences = (text) => {
+    const found = [];
+    let current = SENTENCE_PREFIX;
+    for (const c of text) {
+        // Look behind before appending c: the test is about what precedes it.
+        const isEnd = SENTENCE_ENDING.includes(c)
+            && (c !== "." || !endsWithInitial(current));
+        current += c;
+        if (isEnd) {
+            found.push(current);
+            current = SENTENCE_PREFIX;
+        }
+    }
+    // Text after the last terminator is a sentence too.
+    found.push(current);
+    return found
+        .map((sentence) => sentence.trimStart())
+        .filter((sentence) => sentence !== "");
+};
+
+/**
+ * Replace a string in the value field with the array of its sentences.
+ *
+ * A one-element array is unwrapped first. Any value that is not a string after
+ * that (e.g. an array of several items) is sent unchanged.
+ *
+ * @param {object} data Item being processed; only its value field is read.
+ * @param {object} feed Object whose send() receives the resulting item, and
+ *                      whose close() is called when ctx.isLast() is true.
+ * @param {object} ctx Object whose isLast() decides whether to close the feed.
+ */
+const sentences = (data, feed, ctx) => {
+    if (ctx.isLast()) {
+        return feed.close();
+    }
+
+    let value = data?.value;
+    if (Array.isArray(value) && value.length === 1) {
+        value = value[0];
+    }
+    if (typeof value !== 'string') {
+        return feed.send({ ...data, value });
+    }
+
+    feed.send({ ...data, value: splitSentences(value) });
+};
+
+module.exports = {
+    sentences,
+};