Skip to content

Commit

Permalink
#8694u8cku update to multiprocessing_batch_char_size
Browse files Browse the repository at this point in the history
  • Loading branch information
antsh3k committed Jun 12, 2024
1 parent d6655ce commit 1740e27
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 64 deletions.
79 changes: 23 additions & 56 deletions medcat/3_run_model/run_model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/medcat/lib/python3.10/site-packages/medcat/cat.py:18: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm, trange\n"
]
}
],
"outputs": [],
"source": [
"import os\n",
"from medcat.cat import CAT\n",
Expand All @@ -27,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -48,7 +39,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -61,23 +52,15 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created folder to store annotations here: /Users/anthonyshek/projects/working_with_cogstack/data/annotated_docs/test_project\n"
]
}
],
"outputs": [],
"source": [
"# Change these according to your project\n",
"project_name = 'test_project' # Name of your project. Annotated documents relating to this project will be stored here.\n",
"documents_to_annotate = \"cogstack_search_results/example documents to annotate.csv\" # Add your data file here\n",
"\n",
"modelpack = 'mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5.zip' # enter your model here. Should be the output of the trained 'output_modelpack'.\n",
"modelpack = '' # enter your model here. Should be the output of the trained 'output_modelpack'.\n",
"snomed_filter_path = None\n",
"\n",
"\n",
Expand Down Expand Up @@ -109,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -127,17 +110,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There is no concept filter set\n"
]
}
],
"outputs": [],
"source": [
"# Set snomed filter if needed\n",
"# This is a white list filter of concepts\n",
Expand All @@ -152,7 +127,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -161,7 +136,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"scrolled": true
},
Expand All @@ -175,19 +150,19 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batch_char_size = 50000 # Batch size (BS) in number of characters\n",
"cat.multiprocessing(data_iterator(df, doc_id_column, doc_text_column),\n",
" batch_size_chars=batch_char_size,\n",
" only_cui=False,\n",
" nproc=8, # Number of processors\n",
" out_split_size_chars=20*batch_char_size,\n",
" save_dir_path=ann_folder_path,\n",
" min_free_memory=0.1,\n",
" )\n",
"cat.multiprocessing_batch_char_size(data_iterator(df, doc_id_column, doc_text_column),\n",
" batch_size_chars=batch_char_size,\n",
" only_cui=False,\n",
" nproc=8, # Number of processors\n",
" out_split_size_chars=20*batch_char_size,\n",
" save_dir_path=ann_folder_path,\n",
" min_free_memory=0.1,\n",
" )\n",
"\n",
"medcat_logger.warning(f'Annotation process complete!')\n"
]
Expand All @@ -204,17 +179,9 @@
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error: There are documents which havent been annotated! Check 'medcat.log' for more info\n"
]
}
],
"outputs": [],
"source": [
"# Check if everything has run smoothly. If an error has been raised check the logs\n",
"try:\n",
Expand Down
16 changes: 8 additions & 8 deletions medcat/3_run_model/run_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,14 @@ def relevant_text_gen(generator, doc_id = '_id', text_col='body_analysed'):

batch_char_size = 500000 # Batch size (BS) in number of characters

cat.multiprocessing(relevant_text_gen(search_gen),
batch_size_chars=batch_char_size,
only_cui=False,
nproc=8, # Number of processors
out_split_size_chars=20*batch_char_size,
save_dir_path=ann_folder_path,
min_free_memory=0.1,
)
cat.multiprocessing_batch_char_size(relevant_text_gen(search_gen),
batch_size_chars=batch_char_size,
only_cui=False,
nproc=8, # Number of processors
out_split_size_chars=20*batch_char_size,
save_dir_path=ann_folder_path,
min_free_memory=0.1,
)

medcat_logger.warning(f'Annotation process complete!')

0 comments on commit 1740e27

Please sign in to comment.