From 1740e274342c1ac2424cb9300ab55f126ee702bc Mon Sep 17 00:00:00 2001
From: antsh3k <antshek@hotmail.com>
Date: Wed, 12 Jun 2024 23:23:25 +0100
Subject: [PATCH] #8694u8cku update to multiprocessing_batch_char_size

---
 medcat/3_run_model/run_model.ipynb | 79 +++++++++---------------------
 medcat/3_run_model/run_model.py    | 16 +++---
 2 files changed, 31 insertions(+), 64 deletions(-)

diff --git a/medcat/3_run_model/run_model.ipynb b/medcat/3_run_model/run_model.ipynb
index 0596225..23964ed 100755
--- a/medcat/3_run_model/run_model.ipynb
+++ b/medcat/3_run_model/run_model.ipynb
@@ -2,18 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/anaconda3/envs/medcat/lib/python3.10/site-packages/medcat/cat.py:18: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
-      "  from tqdm.autonotebook import tqdm, trange\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import os\n",
     "from medcat.cat import CAT\n",
@@ -27,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -48,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,23 +52,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Created folder to store annotations here: /Users/anthonyshek/projects/working_with_cogstack/data/annotated_docs/test_project\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Changes these according to your project\n",
     "project_name = 'test_project' # Name of your project. Annotated documents relating to this project will be stored here.\n",
     "documents_to_annotate = \"cogstack_search_results/example documents to annotate.csv\" # Add your data file here\n",
     "\n",
-    "modelpack = 'mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5.zip'  # enter your model here. Should the the output of trained 'output_modelpack'.\n",
+    "modelpack = ''  # enter your model here. Should be the output of trained 'output_modelpack'.\n",
     "snomed_filter_path = None\n",
     "\n",
     "\n",
@@ -109,7 +92,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -127,17 +110,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "There is no concept filter set\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Set snomed filter if needed\n",
     "# This is a white list filter of concepts\n",
@@ -152,7 +127,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -161,7 +136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
@@ -175,19 +150,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "batch_char_size = 50000  # Batch size (BS) in number of characters\n",
-    "cat.multiprocessing(data_iterator(df, doc_id_column, doc_text_column),\n",
-    "                    batch_size_chars=batch_char_size,\n",
-    "                    only_cui=False,\n",
-    "                    nproc=8, # Number of processors\n",
-    "                    out_split_size_chars=20*batch_char_size,\n",
-    "                    save_dir_path=ann_folder_path,\n",
-    "                    min_free_memory=0.1,\n",
-    "                    )\n",
+    "cat.multiprocessing_batch_char_size(data_iterator(df, doc_id_column, doc_text_column),\n",
+    "                                    batch_size_chars=batch_char_size,\n",
+    "                                    only_cui=False,\n",
+    "                                    nproc=8, # Number of processors\n",
+    "                                    out_split_size_chars=20*batch_char_size,\n",
+    "                                    save_dir_path=ann_folder_path,\n",
+    "                                    min_free_memory=0.1,\n",
+    "                                    )\n",
     "\n",
     "medcat_logger.warning(f'Annotation process complete!')\n"
    ]
@@ -204,17 +179,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Error: There are documents which havent been annotated! Check 'medcat.log' for more info\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Check if everything has run smoothly. If an error has been raised check the logs\n",
     "try:\n",
diff --git a/medcat/3_run_model/run_model.py b/medcat/3_run_model/run_model.py
index 3d97c42..2a840ab 100644
--- a/medcat/3_run_model/run_model.py
+++ b/medcat/3_run_model/run_model.py
@@ -82,14 +82,14 @@ def relevant_text_gen(generator, doc_id = '_id', text_col='body_analysed'):
 
 batch_char_size = 500000  # Batch size (BS) in number of characters
 
-cat.multiprocessing(relevant_text_gen(search_gen),
-                    batch_size_chars=batch_char_size,
-                    only_cui=False,
-                    nproc=8, # Number of processors
-                    out_split_size_chars=20*batch_char_size,
-                    save_dir_path=ann_folder_path,
-                    min_free_memory=0.1,
-                    )
+cat.multiprocessing_batch_char_size(relevant_text_gen(search_gen),
+                                    batch_size_chars=batch_char_size,
+                                    only_cui=False,
+                                    nproc=8, # Number of processors
+                                    out_split_size_chars=20*batch_char_size,
+                                    save_dir_path=ann_folder_path,
+                                    min_free_memory=0.1,
+                                    )
 
 medcat_logger.warning(f'Annotation process complete!')