From 3a4b646236c69c7feb4b0ae020bd7d79c3849ae6 Mon Sep 17 00:00:00 2001 From: Xi Bai Date: Tue, 23 Jul 2024 16:35:17 +0100 Subject: [PATCH] fix raw data preservation for unsupervised training --- README.md | 86 +++++++++++------------- app/api/routers/unsupervised_training.py | 9 ++- app/requirements.txt | 1 + app/trainers/base.py | 6 +- docker/medcat-deid/requirements.txt | 1 + docker/medcat-icd10/requirements.txt | 1 + docker/medcat-snomed/requirements.txt | 1 + docker/medcat-umls/requirements.txt | 1 + docker/trf-deid/requirements.txt | 1 + setup.cfg | 19 ++++-- 10 files changed, 70 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index bcb20d8..203acce 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Currently, CMS offers both HTTP endpoints for running NLP-related jobs and a com - [SNOMED MedCAT Model](https://cogstack.github.io/CogStack-ModelServe/docs/medcat_snomed_model_apis.html) - [ICD-10 MedCAT Model](https://cogstack.github.io/CogStack-ModelServe/docs/medcat_icd10_model_apis.html) - [UMLS MedCAT Model](https://cogstack.github.io/CogStack-ModelServe/docs/medcat_umls_model_apis.html) -- [AnonCAT Model](https://cogstack.github.io/CogStack-ModelServe/docs/anoncat_model_apis.html) +- [De-ID MedCAT Model (AnonCAT)](https://cogstack.github.io/CogStack-ModelServe/docs/anoncat_model_apis.html) - [De-ID Transformers Model](https://cogstack.github.io/CogStack-ModelServe/docs/transformers_deid_model_apis.html) - [All-in-One Doc](https://cogstack.github.io/CogStack-ModelServe/docs/cogstack_model_serve_apis.html) @@ -49,7 +49,7 @@ The following table summarises the servable model types with their respective ou | medcat_snomed | medcat-snomed | labelled with SNOMED concepts | | medcat_icd10 | medcat-icd10 | labelled with ICD-10 concepts | | medcat_umls | medcat-umls | labelled with UMLS concepts | -| medcat_deid | medcat-deid | labelled with PII concepts | +| medcat_deid (anoncat) | medcat-deid | labelled with latest PII concepts | | 
transformers_deid | de-identification | labelled with PII concepts | ## Run ModelServe in the container environment: @@ -69,47 +69,11 @@ Then the API docs will be accessible at localhost on the mapped port specified i as a `cms` non-root user configured during the image build. Ensure the model package file is owned by the currently logged-in user to avoid permission-related errors. If the file ownership is altered, you will need to rebuild the image. -### Serve models via streaming HTTP APIs -You can send your texts to the CMS stream endpoint and receive NLP results as a stream. To that end, -start CMS as a streamable service by running: -```commandline -python app/cli/cli.py serve --streamable --model-type --model-path PATH/TO/MODEL_PACKAGE.zip --host 127.0.0.1 --port 8000 -``` -Currently, [JSON Lines](https://jsonlines.org/) is supported for formatting request and response bodies. For example, the following request: -```commandline -curl -X 'POST' 'http://127.0.0.1:8000/stream/process' \ - -H 'Content-Type: application/x-ndjson' \ - --data-binary $'{"name": "DOC", "text": "TEXT"}\n{"name": "ANOTHER_DOC", "text": "ANOTHER_TEXT"}' -``` -will result in a response like {"doc_name": "DOC", "start": INT, "end": INT, "label_name": "STR", "label_id": "STR", ...}\n... - -#### Chat with served models -You can also "chat" with the running model using the `/stream/ws` endpoint. For example: -```html -
-<!-- chat client HTML (markup elided) -->
- - -``` - ### Auxiliary services -In addition to the core services such as serving, training and evaluation, CMS provides several ready-to-use components -to help users run CMS in a production environment. The default configuration can be customised to what suits your own needs. - - +In addition to the core services such as serving, training and evaluation, CMS provides several ready-to-use components to help users make these production-ready. Their presence and default configuration can be customised to what suits your own needs. The diagram below illustrates the interactions between core and auxiliary services: +

+<!-- components diagram: interactions between core and auxiliary services (image markup elided) -->

:information_source: Some environment variables with opinionated naming conventions do not necessarily imply that the stacks are bound to a specific cloud provider or even using a cloud service at all. For example, `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` are used to set credentials for `minio`, the default storage service, rather than for Amazon Web Services. @@ -187,8 +151,38 @@ docker compose -f docker-compose-auth.yml up -d Before running ModelServe, define extra environment variables to enable the built-in token-based authentication and hook it up with the database by following this [Instruction](./app/api/auth/README.md). -The diagram illustrating the interactions between core and auxiliary services is presented below: +### Serve models via streaming HTTP APIs +You can send your texts to the CMS stream endpoint and receive NLP results as a stream. To that end, +start CMS as a streamable service by running: +```commandline +python app/cli/cli.py serve --streamable --model-type --model-path PATH/TO/MODEL_PACKAGE.zip --host 127.0.0.1 --port 8000 +``` +Currently, [JSON Lines](https://jsonlines.org/) is supported for formatting request and response bodies. For example, the following request: +```commandline +curl -X 'POST' 'http://127.0.0.1:8000/stream/process' \ + -H 'Content-Type: application/x-ndjson' \ + --data-binary $'{"name": "DOC", "text": "TEXT"}\n{"name": "ANOTHER_DOC", "text": "ANOTHER_TEXT"}' +``` +will result in a response like {"doc_name": "DOC", "start": INT, "end": INT, "label_name": "STR", "label_id": "STR", ...}\n... -

-<!-- components diagram: interactions between core and auxiliary services (image markup elided) -->

+#### Chat with served models +You can also "chat" with the running model using the `/stream/ws` endpoint. For example: +```html +
+<!-- chat client HTML (markup elided) -->
+
    + +``` \ No newline at end of file diff --git a/app/api/routers/unsupervised_training.py b/app/api/routers/unsupervised_training.py index 904162a..8a2d98d 100644 --- a/app/api/routers/unsupervised_training.py +++ b/app/api/routers/unsupervised_training.py @@ -34,13 +34,20 @@ async def train_unsupervised(request: Request, file_names = [] data_file.write("[") for td_idx, td in enumerate(training_data): + temp_td = tempfile.NamedTemporaryFile(mode="w") items = ijson.items(td.file, "item") + temp_td.write("[") for text_idx, text in enumerate(items): if text_idx > 0 or td_idx > 0: data_file.write(",") json.dump(text, data_file) + if text_idx > 0: + temp_td.write(",") + json.dump(text, temp_td) + temp_td.write("]") + temp_td.flush() file_names.append("" if td.filename is None else td.filename) - files.append(td.file) + files.append(temp_td) data_file.write("]") data_file.flush() data_file.seek(0) diff --git a/app/requirements.txt b/app/requirements.txt index 640a085..ee71355 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -17,5 +17,6 @@ asyncpg~=0.27.0 aiosqlite~=0.19.0 evaluate~=0.4.1 websockets~=12.0 +pynvml~=11.5.3 setuptools wheel \ No newline at end of file diff --git a/app/trainers/base.py b/app/trainers/base.py index b1b3053..a7b8e4d 100644 --- a/app/trainers/base.py +++ b/app/trainers/base.py @@ -75,13 +75,15 @@ def start_training(self, dataset = datasets.load_dataset(doc_dataset.__file__, data_files={"documents": data_file.name}, split="train", - cache_dir=None) + cache_dir=None, + trust_remote_code=True) self._tracker_client.save_train_dataset(dataset) elif training_type == TrainingType.SUPERVISED.value: dataset = datasets.load_dataset(anno_dataset.__file__, data_files={"annotations": data_file.name}, split="train", - cache_dir=None) + cache_dir=None, + trust_remote_code=True) self._tracker_client.save_train_dataset(dataset) else: raise ValueError(f"Unknown training type: {training_type}") diff --git a/docker/medcat-deid/requirements.txt 
b/docker/medcat-deid/requirements.txt index 0d0d604..7959c04 100644 --- a/docker/medcat-deid/requirements.txt +++ b/docker/medcat-deid/requirements.txt @@ -17,5 +17,6 @@ asyncpg~=0.27.0 aiosqlite~=0.19.0 evaluate~=0.4.1 websockets~=12.0 +pynvml~=11.5.3 setuptools wheel \ No newline at end of file diff --git a/docker/medcat-icd10/requirements.txt b/docker/medcat-icd10/requirements.txt index 0d0d604..7959c04 100644 --- a/docker/medcat-icd10/requirements.txt +++ b/docker/medcat-icd10/requirements.txt @@ -17,5 +17,6 @@ asyncpg~=0.27.0 aiosqlite~=0.19.0 evaluate~=0.4.1 websockets~=12.0 +pynvml~=11.5.3 setuptools wheel \ No newline at end of file diff --git a/docker/medcat-snomed/requirements.txt b/docker/medcat-snomed/requirements.txt index 0d0d604..7959c04 100644 --- a/docker/medcat-snomed/requirements.txt +++ b/docker/medcat-snomed/requirements.txt @@ -17,5 +17,6 @@ asyncpg~=0.27.0 aiosqlite~=0.19.0 evaluate~=0.4.1 websockets~=12.0 +pynvml~=11.5.3 setuptools wheel \ No newline at end of file diff --git a/docker/medcat-umls/requirements.txt b/docker/medcat-umls/requirements.txt index 0d0d604..7959c04 100644 --- a/docker/medcat-umls/requirements.txt +++ b/docker/medcat-umls/requirements.txt @@ -17,5 +17,6 @@ asyncpg~=0.27.0 aiosqlite~=0.19.0 evaluate~=0.4.1 websockets~=12.0 +pynvml~=11.5.3 setuptools wheel \ No newline at end of file diff --git a/docker/trf-deid/requirements.txt b/docker/trf-deid/requirements.txt index 6f9c567..51a16de 100644 --- a/docker/trf-deid/requirements.txt +++ b/docker/trf-deid/requirements.txt @@ -17,5 +17,6 @@ asyncpg~=0.27.0 aiosqlite~=0.19.0 evaluate~=0.4.1 websockets~=12.0 +pynvml~=11.5.3 setuptools wheel \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index bd26e75..dd36f42 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,16 +14,16 @@ packages = find: platforms = any include_package_data = True install_requires = - medcat ~= 1.8.0 - fastapi ~= 0.102.0 - uvicorn ~= 0.22.0 + medcat ~= 1.9.0 + fastapi ~= 0.110.3 + uvicorn ~= 
0.29.0 python-multipart ~= 0.0.5 ijson ~= 3.1.4 python-dotenv ~= 0.20.0 - mlflow ~= 2.11.0 + mlflow ~= 2.12.0 psycopg2-binary ~= 2.9.4 boto3 ~= 1.28.84 - typer ~= 0.7.0 + typer ~= 0.12.3 prometheus-fastapi-instrumentator ~= 5.11.2 slowapi ~= 0.1.7 graypy ~= 2.1.0 @@ -31,6 +31,9 @@ install_requires = fastapi-users-db-sqlalchemy ~= 5.0.0 asyncpg ~= 0.27.0 aiosqlite ~= 0.19.0 + evaluate ~= 0.4.1 + websockets ~= 12.0 + pynvml ~= 11.5.3 python_requires = >=3.8 [bdist_wheel] @@ -45,10 +48,12 @@ test = pytest-mock ~= 3.7.0 pytest-timeout ~= 2.1.0 pytest-random-order ~= 1.1.0 - pytest-asyncio ~= 0.21.0 + pytest-asyncio ~= 0.23.7 + pytest-cov ~= 4.1.0 httpx ~= 0.24.1 - mypy == 1.0.0 + mypy == 1.8.0 flake8 == 4.0.1 locust ~= 2.11.0 + typer-cli ~= 0.12.3 all = %(test)s
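
---

The core of this patch is the `unsupervised_training.py` hunk: the uploaded files (`td.file`) are consumed once while building the combined training file, so appending the exhausted streams to `files` loses the raw data; the fix writes each upload into its own temporary file as well. The sketch below reproduces that buffering pattern in isolation. It is illustrative only: the function and variable names are mine, and the real endpoint streams items with `ijson.items(td.file, "item")` rather than `json.load`, which is used here to keep the sketch dependency-free.

```python
import json
import tempfile


def buffer_training_files(uploads):
    """Merge uploaded JSON arrays into one combined training file while
    keeping a replayable per-upload copy of the raw data.

    Each element of `uploads` is a file-like object containing a JSON
    array of texts. Returns (combined_file, per_upload_files).
    """
    combined = tempfile.NamedTemporaryFile(mode="w+")
    per_upload = []
    combined.write("[")
    for td_idx, upload in enumerate(uploads):
        # One temp file per upload preserves the raw data, since the
        # upload stream itself can only be read once.
        temp_td = tempfile.NamedTemporaryFile(mode="w+")
        temp_td.write("[")
        for text_idx, text in enumerate(json.load(upload)):
            if text_idx > 0 or td_idx > 0:
                combined.write(",")
            json.dump(text, combined)  # item goes into the merged dataset
            if text_idx > 0:
                temp_td.write(",")
            json.dump(text, temp_td)   # item goes into the preserved raw copy
        temp_td.write("]")
        temp_td.flush()
        per_upload.append(temp_td)
    combined.write("]")
    combined.flush()
    combined.seek(0)
    return combined, per_upload
```

As in the patch, the comma placement differs between the two sinks: the combined file separates items across all uploads (`text_idx > 0 or td_idx > 0`), while each per-upload copy restarts its comma logic so it remains a valid standalone JSON array.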