From 1740e274342c1ac2424cb9300ab55f126ee702bc Mon Sep 17 00:00:00 2001 From: antsh3k Date: Wed, 12 Jun 2024 23:23:25 +0100 Subject: [PATCH] #8694u8cku update to multiprocessing_batch_char_size --- medcat/3_run_model/run_model.ipynb | 79 +++++++++--------------------- medcat/3_run_model/run_model.py | 16 +++--- 2 files changed, 31 insertions(+), 64 deletions(-) diff --git a/medcat/3_run_model/run_model.ipynb b/medcat/3_run_model/run_model.ipynb index 0596225..23964ed 100755 --- a/medcat/3_run_model/run_model.ipynb +++ b/medcat/3_run_model/run_model.ipynb @@ -2,18 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/medcat/lib/python3.10/site-packages/medcat/cat.py:18: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", - " from tqdm.autonotebook import tqdm, trange\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "from medcat.cat import CAT\n", @@ -27,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -48,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -61,23 +52,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Created folder to store annotations here: /Users/anthonyshek/projects/working_with_cogstack/data/annotated_docs/test_project\n" - ] - } - ], + "outputs": [], "source": [ "# Changes these according to your project\n", "project_name = 'test_project' # Name of your project. Annotated documents relating to this project will be stored here.\n", "documents_to_annotate = \"cogstack_search_results/example documents to annotate.csv\" # Add your data file here\n", "\n", - "modelpack = 'mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5.zip' # enter your model here. Should the the output of trained 'output_modelpack'.\n", + "modelpack = '' # enter your model here. Should the the output of trained 'output_modelpack'.\n", "snomed_filter_path = None\n", "\n", "\n", @@ -109,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -127,17 +110,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There is no concept filter set\n" - ] - } - ], + "outputs": [], "source": [ "# Set snomed filter if needed\n", "# This is a white list filter of concepts\n", @@ -152,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -161,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "scrolled": true }, @@ -175,19 +150,19 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "batch_char_size = 50000 # Batch size (BS) in number of characters\n", - "cat.multiprocessing(data_iterator(df, doc_id_column, doc_text_column),\n", - " batch_size_chars=batch_char_size,\n", - " only_cui=False,\n", - " nproc=8, # Number of processors\n", - " out_split_size_chars=20*batch_char_size,\n", - " save_dir_path=ann_folder_path,\n", - " min_free_memory=0.1,\n", - " )\n", + "cat.multiprocessing_batch_char_size(data_iterator(df, doc_id_column, doc_text_column),\n", + " batch_size_chars=batch_char_size,\n", + " only_cui=False,\n", + " nproc=8, # Number of processors\n", + " out_split_size_chars=20*batch_char_size,\n", + " save_dir_path=ann_folder_path,\n", + " min_free_memory=0.1,\n", + " )\n", "\n", "medcat_logger.warning(f'Annotation process complete!')\n" ] @@ -204,17 +179,9 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error: There are documents which havent been annotated! Check 'medcat.log' for more info\n" - ] - } - ], + "outputs": [], "source": [ "# Check if everything has run smoothly. If an error has been raised check the logs\n", "try:\n", diff --git a/medcat/3_run_model/run_model.py b/medcat/3_run_model/run_model.py index 3d97c42..2a840ab 100644 --- a/medcat/3_run_model/run_model.py +++ b/medcat/3_run_model/run_model.py @@ -82,14 +82,14 @@ def relevant_text_gen(generator, doc_id = '_id', text_col='body_analysed'): batch_char_size = 500000 # Batch size (BS) in number of characters -cat.multiprocessing(relevant_text_gen(search_gen), - batch_size_chars=batch_char_size, - only_cui=False, - nproc=8, # Number of processors - out_split_size_chars=20*batch_char_size, - save_dir_path=ann_folder_path, - min_free_memory=0.1, - ) +cat.multiprocessing_batch_char_size(relevant_text_gen(search_gen), + batch_size_chars=batch_char_size, + only_cui=False, + nproc=8, # Number of processors + out_split_size_chars=20*batch_char_size, + save_dir_path=ann_folder_path, + min_free_memory=0.1, + ) medcat_logger.warning(f'Annotation process complete!')