From 7990e7dfbbf510e5de1f7e06c56827ad159e92c6 Mon Sep 17 00:00:00 2001 From: myhloli Date: Thu, 26 Dec 2024 17:30:00 +0800 Subject: [PATCH 01/16] feat(model): add npu support and optimize table model - Add NPU support for memory cleaning and model initialization - Optimize table model initialization and prediction process - Update memory utils to support NPU - Add language parameter for table model --- magic_pdf/libs/clean_memory.py | 3 ++ magic_pdf/model/pdf_extract_kit.py | 7 ++++ magic_pdf/model/sub_modules/model_init.py | 22 ++++++++-- magic_pdf/model/sub_modules/model_utils.py | 3 ++ .../table/rapidtable/rapid_table.py | 40 ++++++++++++++++--- magic_pdf/pdf_parse_union_core_v2.py | 8 ++++ 6 files changed, 73 insertions(+), 10 deletions(-) diff --git a/magic_pdf/libs/clean_memory.py b/magic_pdf/libs/clean_memory.py index 6bfc174f..236ac8fb 100644 --- a/magic_pdf/libs/clean_memory.py +++ b/magic_pdf/libs/clean_memory.py @@ -7,4 +7,7 @@ def clean_memory(): if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.ipc_collect() + elif torch.npu.is_available(): + torch.npu.empty_cache() + torch.npu.ipc_collect() gc.collect() \ No newline at end of file diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py index ff3faead..2bcf12cd 100644 --- a/magic_pdf/model/pdf_extract_kit.py +++ b/magic_pdf/model/pdf_extract_kit.py @@ -87,6 +87,12 @@ def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs): ) # 初始化解析方案 self.device = kwargs.get('device', 'cpu') + + if str(self.device).startswith("npu"): + import torch_npu + os.environ['FLAGS_npu_jit_compile'] = '0' + os.environ['FLAGS_use_stride_kernel'] = '0' + logger.info('using device: {}'.format(self.device)) models_dir = kwargs.get( 'models_dir', os.path.join(root_dir, 'resources', 'models') @@ -164,6 +170,7 @@ def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs): table_model_path=str(os.path.join(models_dir, table_model_dir)), table_max_time=self.table_max_time, device=self.device, + lang=self.lang, ) logger.info('DocAnalysis init done!') diff --git a/magic_pdf/model/sub_modules/model_init.py b/magic_pdf/model/sub_modules/model_init.py index 81571bc3..832e46e4 100644 --- a/magic_pdf/model/sub_modules/model_init.py +++ b/magic_pdf/model/sub_modules/model_init.py @@ -1,6 +1,8 @@ +import torch from loguru import logger from magic_pdf.config.constants import MODEL_NAME +from magic_pdf.libs.config_reader import get_device from magic_pdf.model.model_list import AtomicModel from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import \ DocLayoutYOLOModel @@ -19,7 +21,7 @@ TableMasterPaddleModel -def table_model_init(table_model_type, model_path, max_time, _device_='cpu'): +def table_model_init(table_model_type, model_path, max_time, _device_='cpu', lang=None): if table_model_type == MODEL_NAME.STRUCT_EQTABLE: table_model = StructTableModel(model_path, max_new_tokens=2048, max_time=max_time) elif table_model_type == MODEL_NAME.TABLE_MASTER: @@ -29,7 +31,7 @@ def table_model_init(table_model_type, model_path, max_time, _device_='cpu'): } table_model = TableMasterPaddleModel(config) elif table_model_type == MODEL_NAME.RAPID_TABLE: - table_model = RapidTableModel() + table_model = RapidTableModel(lang) else: logger.error('table model type not allow') exit(1) @@ -38,6 +40,8 @@ def table_model_init(table_model_type, model_path, max_time, _device_='cpu'): def mfd_model_init(weight, device='cpu'): + if str(device).startswith("npu"): + device = torch.device(device) mfd_model = 
YOLOv8MFDModel(weight, device) return mfd_model @@ -53,6 +57,8 @@ def layout_model_init(weight, config_file, device): def doclayout_yolo_model_init(weight, device='cpu'): + if str(device).startswith("npu"): + device = torch.device(device) model = DocLayoutYOLOModel(weight, device) return model @@ -63,6 +69,12 @@ def ocr_model_init(show_log: bool = False, use_dilation=True, det_db_unclip_ratio=1.8, ): + + use_npu = False + device = get_device() + if str(device).startswith("npu"): + use_npu = True + if lang is not None and lang != '': model = ModifiedPaddleOCR( show_log=show_log, @@ -70,6 +82,7 @@ def ocr_model_init(show_log: bool = False, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio, + use_npu=use_npu, ) else: model = ModifiedPaddleOCR( @@ -77,7 +90,7 @@ def ocr_model_init(show_log: bool = False, det_db_box_thresh=det_db_box_thresh, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio, - # use_angle_cls=True, + use_npu=use_npu, ) return model @@ -146,7 +159,8 @@ def atom_model_init(model_name: str, **kwargs): kwargs.get('table_model_name'), kwargs.get('table_model_path'), kwargs.get('table_max_time'), - kwargs.get('device') + kwargs.get('device'), + kwargs.get('lang'), ) else: logger.error('model name not allow') diff --git a/magic_pdf/model/sub_modules/model_utils.py b/magic_pdf/model/sub_modules/model_utils.py index 641fd062..4fb163d5 100644 --- a/magic_pdf/model/sub_modules/model_utils.py +++ b/magic_pdf/model/sub_modules/model_utils.py @@ -54,4 +54,7 @@ def get_vram(device): if torch.cuda.is_available() and device != 'cpu': total_memory = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3) # 将字节转换为 GB return total_memory + elif torch.npu.is_available() and device != 'cpu': + total_memory = torch.npu.get_device_properties(device).total_memory / (1024 ** 3) # 转为 GB + return total_memory return None \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py b/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py index 51307538..f7d395ca 100644 --- a/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +++ b/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py @@ -1,16 +1,44 @@ +import os +import cv2 import numpy as np from rapid_table import RapidTable from rapidocr_paddle import RapidOCR +try: + import torchtext + + if torchtext.__version__ >= '0.18.0': + torchtext.disable_torchtext_deprecation_warning() +except ImportError: + pass +os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 + +from magic_pdf.model.sub_modules.model_init import AtomModelSingleton + class RapidTableModel(object): - def __init__(self): + def __init__(self, lang=None): self.table_model = RapidTable() - self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True) + # self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True) + + atom_model_manager = AtomModelSingleton() + self.ocr_engine = atom_model_manager.get_atom_model( + atom_model_name='ocr', + ocr_show_log=False, + det_db_box_thresh=0.3, + lang=lang, + ) def predict(self, image): - ocr_result, _ = self.ocr_engine(np.asarray(image)) - if ocr_result is None: + # ocr_result, _ = self.ocr_engine(np.asarray(image)) + + bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) + ocr_result = self.ocr_engine.ocr(bgr_image)[0] + ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if + len(item) == 2 and isinstance(item[1], tuple)] + + if ocr_result: + 
html_code, table_cell_bboxes, elapse = self.table_model(np.asarray(image), ocr_result) + return html_code, table_cell_bboxes, elapse + else: return None, None, None - html_code, table_cell_bboxes, elapse = self.table_model(np.asarray(image), ocr_result) - return html_code, table_cell_bboxes, elapse \ No newline at end of file diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 32e44b38..11b7e77b 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -284,6 +284,14 @@ def model_init(model_name: str): supports_bfloat16 = True else: supports_bfloat16 = False + + elif torch.npu.is_available(): + device = torch.device('npu') + if torch.npu.is_bf16_supported(): + supports_bfloat16 = True + else: + supports_bfloat16 = False + else: device = torch.device('cpu') supports_bfloat16 = False From 50f48417162c9f9413580f4eb2edbf382a5afdae Mon Sep 17 00:00:00 2001 From: myhloli Date: Thu, 26 Dec 2024 18:13:51 +0800 Subject: [PATCH 02/16] refactor(device): optimize memory cleaning and device selection - Update clean_memory function to support both CUDA and NPU devices - Implement get_device function to centralize device selection logic - Modify model initialization and memory cleaning to use the selected device - Update RapidTableModel to support both RapidOCR and PaddleOCR engines --- magic_pdf/libs/clean_memory.py | 17 ++++--- magic_pdf/model/batch_analyze.py | 3 +- .../model/doc_analyze_by_custom_model.py | 2 +- magic_pdf/model/pdf_extract_kit.py | 2 +- magic_pdf/model/sub_modules/model_init.py | 5 +-- magic_pdf/model/sub_modules/model_utils.py | 13 +++--- .../table/rapidtable/rapid_table.py | 45 ++++++++----------- magic_pdf/pdf_parse_union_core_v2.py | 21 +++++---- 8 files changed, 54 insertions(+), 54 deletions(-) diff --git a/magic_pdf/libs/clean_memory.py b/magic_pdf/libs/clean_memory.py index 236ac8fb..739e2278 100644 --- a/magic_pdf/libs/clean_memory.py +++ b/magic_pdf/libs/clean_memory.py @@ -3,11 +3,14 @@ import gc -def clean_memory(): - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.ipc_collect() - elif torch.npu.is_available(): - torch.npu.empty_cache() - torch.npu.ipc_collect() +def clean_memory(device='cuda'): + if device == 'cuda': + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + elif str(device).startswith("npu"): + import torch_npu + if torch.npu.is_available(): + torch_npu.empty_cache() + torch_npu.ipc_collect() gc.collect() \ No newline at end of file diff --git a/magic_pdf/model/batch_analyze.py b/magic_pdf/model/batch_analyze.py index 66573892..f82a7ca3 100644 --- a/magic_pdf/model/batch_analyze.py +++ b/magic_pdf/model/batch_analyze.py @@ -10,6 +10,7 @@ from magic_pdf.config.exceptions import CUDA_NOT_AVAILABLE from magic_pdf.data.dataset import Dataset from magic_pdf.libs.clean_memory import clean_memory +from magic_pdf.libs.config_reader import get_device from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton from magic_pdf.model.pdf_extract_kit import CustomPEKModel from magic_pdf.model.sub_modules.model_utils import ( @@ -268,7 +269,7 @@ def doc_batch_analyze( # TODO: clean memory when gpu memory is not enough clean_memory_start_time = time.time() - clean_memory() + clean_memory(get_device()) logger.info(f'clean memory time: {round(time.time() - clean_memory_start_time, 2)}') return InferenceResult(model_json, dataset) diff --git a/magic_pdf/model/doc_analyze_by_custom_model.py 
b/magic_pdf/model/doc_analyze_by_custom_model.py index 3f58dde0..88a55c57 100644 --- a/magic_pdf/model/doc_analyze_by_custom_model.py +++ b/magic_pdf/model/doc_analyze_by_custom_model.py @@ -183,7 +183,7 @@ def doc_analyze( model_json.append(page_dict) gc_start = time.time() - clean_memory() + clean_memory(get_device()) gc_time = round(time.time() - gc_start, 2) logger.info(f'gc time: {gc_time}') diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py index 2bcf12cd..3e6c9d75 100644 --- a/magic_pdf/model/pdf_extract_kit.py +++ b/magic_pdf/model/pdf_extract_kit.py @@ -170,7 +170,7 @@ def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs): table_model_path=str(os.path.join(models_dir, table_model_dir)), table_max_time=self.table_max_time, device=self.device, - lang=self.lang, + ocr_engine=self.ocr_model, ) logger.info('DocAnalysis init done!') diff --git a/magic_pdf/model/sub_modules/model_init.py b/magic_pdf/model/sub_modules/model_init.py index 832e46e4..d5ff7ee0 100644 --- a/magic_pdf/model/sub_modules/model_init.py +++ b/magic_pdf/model/sub_modules/model_init.py @@ -21,7 +21,7 @@ TableMasterPaddleModel -def table_model_init(table_model_type, model_path, max_time, _device_='cpu', lang=None): +def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None): if table_model_type == MODEL_NAME.STRUCT_EQTABLE: table_model = StructTableModel(model_path, max_new_tokens=2048, max_time=max_time) elif table_model_type == MODEL_NAME.TABLE_MASTER: @@ -31,7 +31,7 @@ def table_model_init(table_model_type, model_path, max_time, _device_='cpu', lan } table_model = TableMasterPaddleModel(config) elif table_model_type == MODEL_NAME.RAPID_TABLE: - table_model = RapidTableModel(lang) + table_model = RapidTableModel(ocr_engine) else: logger.error('table model type not allow') exit(1) @@ -160,7 +160,6 @@ def atom_model_init(model_name: str, **kwargs): kwargs.get('table_model_path'), kwargs.get('table_max_time'), kwargs.get('device'), - kwargs.get('lang'), ) else: logger.error('model name not allow') diff --git a/magic_pdf/model/sub_modules/model_utils.py b/magic_pdf/model/sub_modules/model_utils.py index 4fb163d5..bdf303de 100644 --- a/magic_pdf/model/sub_modules/model_utils.py +++ b/magic_pdf/model/sub_modules/model_utils.py @@ -45,7 +45,7 @@ def clean_vram(device, vram_threshold=8): total_memory = get_vram(device) if total_memory and total_memory <= vram_threshold: gc_start = time.time() - clean_memory() + clean_memory(device) gc_time = round(time.time() - gc_start, 2) logger.info(f"gc time: {gc_time}") @@ -54,7 +54,10 @@ def get_vram(device): if torch.cuda.is_available() and device != 'cpu': total_memory = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3) # 将字节转换为 GB return total_memory - elif torch.npu.is_available() and device != 'cpu': - total_memory = torch.npu.get_device_properties(device).total_memory / (1024 ** 3) # 转为 GB - return total_memory - return None \ No newline at end of file + elif str(device).startswith("npu"): + import torch_npu + if torch.npu.is_available(): + total_memory = torch.npu.get_device_properties(device).total_memory / (1024 ** 3) # 转为 GB + return total_memory + else: + return None \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py b/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py index f7d395ca..2e4bb1e1 100644 --- a/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +++ 
b/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py @@ -1,41 +1,32 @@ -import os import cv2 import numpy as np +from loguru import logger from rapid_table import RapidTable from rapidocr_paddle import RapidOCR -try: - import torchtext - - if torchtext.__version__ >= '0.18.0': - torchtext.disable_torchtext_deprecation_warning() -except ImportError: - pass -os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 - -from magic_pdf.model.sub_modules.model_init import AtomModelSingleton - class RapidTableModel(object): - def __init__(self, lang=None): + def __init__(self, ocr_engine): self.table_model = RapidTable() - # self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True) - - atom_model_manager = AtomModelSingleton() - self.ocr_engine = atom_model_manager.get_atom_model( - atom_model_name='ocr', - ocr_show_log=False, - det_db_box_thresh=0.3, - lang=lang, - ) + if ocr_engine is None: + self.ocr_model_name = "RapidOCR" + self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True) + else: + self.ocr_model_name = "PaddleOCR" + self.ocr_engine = ocr_engine def predict(self, image): - # ocr_result, _ = self.ocr_engine(np.asarray(image)) - bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) - ocr_result = self.ocr_engine.ocr(bgr_image)[0] - ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if - len(item) == 2 and isinstance(item[1], tuple)] + if self.ocr_model_name == "RapidOCR": + ocr_result, _ = self.ocr_engine(np.asarray(image)) + elif self.ocr_model_name == "PaddleOCR": + bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) + ocr_result = self.ocr_engine.ocr(bgr_image)[0] + ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if + len(item) == 2 and isinstance(item[1], tuple)] + else: + logger.error("OCR model not supported") + ocr_result = None if ocr_result: html_code, table_cell_bboxes, elapse = self.table_model(np.asarray(image), ocr_result) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 11b7e77b..f610ba37 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -14,7 +14,7 @@ from magic_pdf.data.dataset import Dataset, PageableData from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio from magic_pdf.libs.clean_memory import clean_memory -from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_llm_aided_config +from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_llm_aided_config, get_device from magic_pdf.libs.convert_utils import dict_to_list from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image @@ -277,21 +277,24 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang def model_init(model_name: str): from transformers import LayoutLMv3ForTokenClassification - + device = get_device() if torch.cuda.is_available(): device = torch.device('cuda') if torch.cuda.is_bf16_supported(): supports_bfloat16 = True else: supports_bfloat16 = False - - elif torch.npu.is_available(): - device = torch.device('npu') - if torch.npu.is_bf16_supported(): - supports_bfloat16 = True + elif str(device).startswith("npu"): + import torch_npu + if torch.npu.is_available(): + device = torch.device('npu') + if torch.npu.is_bf16_supported(): + supports_bfloat16 = True + else: + supports_bfloat16 = False else: + device = torch.device('cpu') 
supports_bfloat16 = False - else: device = torch.device('cpu') supports_bfloat16 = False @@ -865,7 +868,7 @@ def pdf_parse_union( 'pdf_info': pdf_info_list, } - clean_memory() + clean_memory(get_device()) return new_pdf_info_dict From dc0d30f5c8367aa87577fa604624b55cdaf6fc8d Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 27 Dec 2024 15:00:06 +0800 Subject: [PATCH 03/16] build: add openai to requirements-docker.txt - Add openai package to requirements-docker.txt --- requirements-docker.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-docker.txt b/requirements-docker.txt index 7397b364..665d2b62 100644 --- a/requirements-docker.txt +++ b/requirements-docker.txt @@ -21,4 +21,5 @@ doclayout_yolo==0.0.2 rapidocr-paddle rapid_table doclayout-yolo==0.0.2 +openai detectron2 From 2e87e649edd73133a871722ca34b55459ee9ffa8 Mon Sep 17 00:00:00 2001 From: myhloli Date: Mon, 30 Dec 2024 10:04:14 +0800 Subject: [PATCH 04/16] build(deps): update pydantic to latest version - Remove upper version limit for pydantic dependency - This change allows for the use of the latest pydantic version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b7df0569..e50ccc1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ click>=8.1.7 fast-langdetect==0.2.0 loguru>=0.6.0 numpy>=1.21.6,<2.0.0 -pydantic>=2.7.2,<2.8.0 +pydantic>=2.7.2 PyMuPDF>=1.24.9 scikit-learn>=1.0.2 torch>=2.2.2 From 2684e7753b7724402bc64e21999670bd4974a96e Mon Sep 17 00:00:00 2001 From: myhloli Date: Mon, 30 Dec 2024 10:10:44 +0800 Subject: [PATCH 05/16] fix(npu): correct module name for NPU operations - Update `clean_memory.py` to use `torch_npu.npu` instead of `torch.npu` - Update `model_utils.py` to use `torch_npu.npu` instead of `torch.npu` - Simplify NPU availability check and bfloat16 support in `pdf_parse_union_core_v2.py` --- magic_pdf/libs/clean_memory.py | 5 ++--- magic_pdf/model/sub_modules/model_utils.py | 4 ++-- magic_pdf/pdf_parse_union_core_v2.py | 7 ++----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/magic_pdf/libs/clean_memory.py b/magic_pdf/libs/clean_memory.py index 739e2278..a3ab8b94 100644 --- a/magic_pdf/libs/clean_memory.py +++ b/magic_pdf/libs/clean_memory.py @@ -10,7 +10,6 @@ def clean_memory(device='cuda'): torch.cuda.ipc_collect() elif str(device).startswith("npu"): import torch_npu - if torch.npu.is_available(): - torch_npu.empty_cache() - torch_npu.ipc_collect() + if torch_npu.npu.is_available(): + torch_npu.npu.empty_cache() gc.collect() \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/model_utils.py b/magic_pdf/model/sub_modules/model_utils.py index bdf303de..72196c8c 100644 --- a/magic_pdf/model/sub_modules/model_utils.py +++ b/magic_pdf/model/sub_modules/model_utils.py @@ -56,8 +56,8 @@ def get_vram(device): return total_memory elif str(device).startswith("npu"): import torch_npu - if torch.npu.is_available(): - total_memory = torch.npu.get_device_properties(device).total_memory / (1024 ** 3) # 转为 GB + if torch_npu.npu.is_available(): + total_memory = torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3) # 转为 GB return total_memory else: return None \ No newline at end of file diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index f610ba37..c7491a41 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -286,12 +286,9 @@ def model_init(model_name: str): supports_bfloat16 = 
False elif str(device).startswith("npu"): import torch_npu - if torch.npu.is_available(): + if torch_npu.npu.is_available(): device = torch.device('npu') - if torch.npu.is_bf16_supported(): - supports_bfloat16 = True - else: - supports_bfloat16 = False + supports_bfloat16 = False else: device = torch.device('cpu') supports_bfloat16 = False From 88b909e20e0e27da406e694bf0010ea229e54406 Mon Sep 17 00:00:00 2001 From: myhloli Date: Mon, 30 Dec 2024 14:18:02 +0800 Subject: [PATCH 06/16] refactor(magic_pdf): comment out npu-related code - Remove use_npu variable initialization - Comment out device assignment and npu check - Comment out use_npu parameter in ModifiedPaddleOCR constructor --- magic_pdf/model/sub_modules/model_init.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/magic_pdf/model/sub_modules/model_init.py b/magic_pdf/model/sub_modules/model_init.py index d5ff7ee0..0eeed246 100644 --- a/magic_pdf/model/sub_modules/model_init.py +++ b/magic_pdf/model/sub_modules/model_init.py @@ -70,10 +70,10 @@ def ocr_model_init(show_log: bool = False, det_db_unclip_ratio=1.8, ): - use_npu = False - device = get_device() - if str(device).startswith("npu"): - use_npu = True + # use_npu = False + # device = get_device() + # if str(device).startswith("npu"): + # use_npu = True if lang is not None and lang != '': model = ModifiedPaddleOCR( @@ -82,7 +82,7 @@ def ocr_model_init(show_log: bool = False, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio, - use_npu=use_npu, + # use_npu=use_npu, ) else: model = ModifiedPaddleOCR( @@ -90,7 +90,7 @@ def ocr_model_init(show_log: bool = False, det_db_box_thresh=det_db_box_thresh, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio, - use_npu=use_npu, + # use_npu=use_npu, ) return model From 7c5cdcd4d7b407121b7f93ddcd30e51fb3086ef8 Mon Sep 17 00:00:00 2001 From: myhloli Date: Thu, 2 Jan 2025 14:24:37 +0800 Subject: [PATCH 07/16] refactor(pdf_parse): improve character spacing handling in PDF text extraction - Update the logic for inserting spaces between characters- Consider the next character's position instead of the previous one - Adjust the spacing threshold to 25% of the average character width - Ignore spaces at the end of lines to prevent double spaces --- magic_pdf/pdf_parse_union_core_v2.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index c7491a41..ef069487 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -92,9 +92,12 @@ def chars_to_content(span): content = '' for char in span['chars']: # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格 - if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width: - content += ' ' - content += char['c'] + char1 = char + char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None + if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ': + content += f"{char['c']} " + else: + content += char['c'] content = __replace_ligatures(content) span['content'] = __replace_0xfffd(content) From 512adb6701a73d1e4caa1e5a50d2c5b42e603874 Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 3 Jan 2025 16:27:52 +0800 Subject: [PATCH 08/16] feat(model): add onnxruntime support for paddleocr on cpu - Implement ONNXModelSingleton to manage ONNX models - Modify 
ModifiedPaddleOCR to use ONNX models on ARM CPUs without CUDA - Update RapidTableModel to use RapidOCR with ONNXRuntime on CPU - Add rapidocr_onnxruntime dependency in setup.py --- magic-pdf.template.json | 2 +- magic_pdf/model/sub_modules/model_init.py | 8 +-- .../sub_modules/ocr/paddleocr/ocr_utils.py | 52 ++++++++++++++++++- .../ocr/paddleocr/ppocr_273_mod.py | 36 ++++++++++--- .../table/rapidtable/rapid_table.py | 9 +++- magic_pdf/post_proc/llm_aided.py | 1 + setup.py | 1 + 7 files changed, 92 insertions(+), 17 deletions(-) diff --git a/magic-pdf.template.json b/magic-pdf.template.json index 04e07ed7..30f82c88 100644 --- a/magic-pdf.template.json +++ b/magic-pdf.template.json @@ -7,7 +7,7 @@ "layoutreader-model-dir":"/tmp/layoutreader", "device-mode":"cpu", "layout-config": { - "model": "layoutlmv3" + "model": "doclayout_yolo" }, "formula-config": { "mfd_model": "yolo_v8_mfd", diff --git a/magic_pdf/model/sub_modules/model_init.py b/magic_pdf/model/sub_modules/model_init.py index 0eeed246..b4deaf73 100644 --- a/magic_pdf/model/sub_modules/model_init.py +++ b/magic_pdf/model/sub_modules/model_init.py @@ -70,11 +70,6 @@ def ocr_model_init(show_log: bool = False, det_db_unclip_ratio=1.8, ): - # use_npu = False - # device = get_device() - # if str(device).startswith("npu"): - # use_npu = True - if lang is not None and lang != '': model = ModifiedPaddleOCR( show_log=show_log, @@ -82,7 +77,6 @@ def ocr_model_init(show_log: bool = False, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio, - # use_npu=use_npu, ) else: model = ModifiedPaddleOCR( @@ -90,7 +84,6 @@ def ocr_model_init(show_log: bool = False, det_db_box_thresh=det_db_box_thresh, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio, - # use_npu=use_npu, ) return model @@ -160,6 +153,7 @@ def atom_model_init(model_name: str, **kwargs): kwargs.get('table_model_path'), kwargs.get('table_max_time'), kwargs.get('device'), + kwargs.get('ocr_engine') ) else: logger.error('model name not allow') diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py b/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py index 11526d53..6b9df0da 100644 --- a/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py @@ -303,4 +303,54 @@ def calculate_is_angle(poly): return False else: # logger.info((p3[1] - p1[1])/height) - return True \ No newline at end of file + return True + + +class ONNXModelSingleton: + _instance = None + _models = {} + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def get_onnx_model(self, **kwargs): + + lang = kwargs.get('lang', None) + det_db_box_thresh = kwargs.get('det_db_box_thresh', 0.3) + use_dilation = kwargs.get('use_dilation', True) + det_db_unclip_ratio = kwargs.get('det_db_unclip_ratio', 1.8) + key = (lang, det_db_box_thresh, use_dilation, det_db_unclip_ratio) + if key not in self._models: + self._models[key] = onnx_model_init(key) + return self._models[key] + +def onnx_model_init(key): + + import importlib.resources + + resource_path = importlib.resources.path('rapidocr_onnxruntime.models','') + + onnx_model = None + additional_ocr_params = { + "use_onnx": True, + "det_model_dir": f'{resource_path}/ch_PP-OCRv4_det_infer.onnx', + "rec_model_dir": f'{resource_path}/ch_PP-OCRv4_rec_infer.onnx', + "cls_model_dir": f'{resource_path}/ch_ppocr_mobile_v2.0_cls_infer.onnx', + "det_db_box_thresh": key[1], + "use_dilation": key[2], + 
"det_db_unclip_ratio": key[3], + } + logger.info(f"additional_ocr_params: {additional_ocr_params}") + if key[0] is not None: + additional_ocr_params["lang"] = key[0] + + from paddleocr import PaddleOCR + onnx_model = PaddleOCR(**additional_ocr_params) + + if onnx_model is None: + logger.error('model init failed') + exit(1) + else: + return onnx_model \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py b/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py index d900254b..10d2094d 100644 --- a/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py @@ -1,7 +1,9 @@ import copy +import platform import time import cv2 import numpy as np +import torch from paddleocr import PaddleOCR from ppocr.utils.logging import get_logger @@ -9,12 +11,23 @@ from tools.infer.predict_system import sorted_boxes from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop -from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img +from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img, \ + ONNXModelSingleton logger = get_logger() class ModifiedPaddleOCR(PaddleOCR): + def __init__(self, *args, **kwargs): + + super().__init__(*args, **kwargs) + + # 在cpu架构为arm且不支持cuda时调用onnx、 + if not torch.cuda.is_available() and platform.machine() in ['arm64', 'aarch64']: + self.use_onnx = True + onnx_model_manager = ONNXModelSingleton() + self.additional_ocr = onnx_model_manager.get_onnx_model(**kwargs) + def ocr(self, img, det=True, @@ -79,7 +92,10 @@ def preprocess_image(_image): ocr_res = [] for img in imgs: img = preprocess_image(img) - dt_boxes, elapse = self.text_detector(img) + if self.use_onnx: + dt_boxes, elapse = self.additional_ocr.text_detector(img) + else: + dt_boxes, elapse = self.text_detector(img) if dt_boxes is None: ocr_res.append(None) continue @@ -106,7 +122,10 @@ def preprocess_image(_image): img, cls_res_tmp, elapse = self.text_classifier(img) if not rec: cls_res.append(cls_res_tmp) - rec_res, elapse = self.text_recognizer(img) + if self.use_onnx: + rec_res, elapse = self.additional_ocr.text_recognizer(img) + else: + rec_res, elapse = self.text_recognizer(img) ocr_res.append(rec_res) if not rec: return cls_res @@ -121,7 +140,10 @@ def __call__(self, img, cls=True, mfd_res=None): start = time.time() ori_im = img.copy() - dt_boxes, elapse = self.text_detector(img) + if self.use_onnx: + dt_boxes, elapse = self.additional_ocr.text_detector(img) + else: + dt_boxes, elapse = self.text_detector(img) time_dict['det'] = elapse if dt_boxes is None: @@ -159,8 +181,10 @@ def __call__(self, img, cls=True, mfd_res=None): time_dict['cls'] = elapse logger.debug("cls num : {}, elapsed : {}".format( len(img_crop_list), elapse)) - - rec_res, elapse = self.text_recognizer(img_crop_list) + if self.use_onnx: + rec_res, elapse = self.additional_ocr.text_recognizer(img_crop_list) + else: + rec_res, elapse = self.text_recognizer(img_crop_list) time_dict['rec'] = elapse logger.debug("rec_res num : {}, elapsed : {}".format( len(rec_res), elapse)) diff --git a/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py b/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py index 2e4bb1e1..0112bd69 100644 --- a/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +++ b/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py @@ -1,8 +1,8 @@ import cv2 import numpy as np +import 
torch from loguru import logger from rapid_table import RapidTable -from rapidocr_paddle import RapidOCR class RapidTableModel(object): @@ -10,7 +10,12 @@ def __init__(self, ocr_engine): self.table_model = RapidTable() if ocr_engine is None: self.ocr_model_name = "RapidOCR" - self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True) + if torch.cuda.is_available(): + from rapidocr_paddle import RapidOCR + self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True) + else: + from rapidocr_onnxruntime import RapidOCR + self.ocr_engine = RapidOCR() else: self.ocr_model_name = "PaddleOCR" self.ocr_engine = ocr_engine diff --git a/magic_pdf/post_proc/llm_aided.py b/magic_pdf/post_proc/llm_aided.py index ad95da3a..90bab31b 100644 --- a/magic_pdf/post_proc/llm_aided.py +++ b/magic_pdf/post_proc/llm_aided.py @@ -5,6 +5,7 @@ from openai import OpenAI +#@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复 formula_optimize_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容: 1. 修正渲染或编译错误: diff --git a/setup.py b/setup.py index 743bd4c0..527db274 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,7 @@ def parse_requirements(filename): "accelerate", # struct-eqtable依赖 "doclayout_yolo==0.0.2", # doclayout_yolo "rapidocr-paddle", # rapidocr-paddle + "rapidocr_onnxruntime", "rapid_table", # rapid_table "PyYAML", # yaml "openai", # openai SDK From 04febf52d04f4f67f69e9af3019bfa18b4fc9400 Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 3 Jan 2025 17:31:57 +0800 Subject: [PATCH 09/16] refactor(ocr): comment out unnecessary log statement - Remove logger.info() call for additional_ocr_params to reduce log verbosity --- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py b/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py index 6b9df0da..157fa82f 100644 --- a/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py @@ -342,7 +342,7 @@ def onnx_model_init(key): "use_dilation": key[2], "det_db_unclip_ratio": key[3], } - logger.info(f"additional_ocr_params: {additional_ocr_params}") + # logger.info(f"additional_ocr_params: {additional_ocr_params}") if key[0] is not None: additional_ocr_params["lang"] = key[0] From 16a0a350ae57aac52439546db7b7c91585a7c2c8 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 5 Jan 2025 17:45:31 +0800 Subject: [PATCH 10/16] fix(magic-pdf): update OCR model selection logic - Add missing 'else' statement in OCR model selection logic - Ensure consistent formatting of 'if' statements for better readability - Remove unnecessary empty line in the 'app.py' file --- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py | 2 ++ projects/gradio_app/app.py | 1 + 2 files changed, 3 insertions(+) diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py b/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py index 10d2094d..8e0d9ffd 100644 --- a/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py @@ -27,6 +27,8 @@ def __init__(self, *args, **kwargs): self.use_onnx = True onnx_model_manager = ONNXModelSingleton() self.additional_ocr = onnx_model_manager.get_onnx_model(**kwargs) + else: + self.use_onnx = False def ocr(self, img, diff --git a/projects/gradio_app/app.py b/projects/gradio_app/app.py index 3e3c08aa..bc56849b 100644 --- a/projects/gradio_app/app.py +++ 
b/projects/gradio_app/app.py @@ -183,6 +183,7 @@ def to_pdf(file_path): return tmp_file_path + if __name__ == '__main__': with gr.Blocks() as demo: gr.HTML(header) From 9951a17026b22f6ddb1f54fbf72d5e9f15481806 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 5 Jan 2025 18:16:55 +0800 Subject: [PATCH 11/16] style(pdf_parse_union_core_v2): remove unnecessary spaces and improve code formatting- Remove extra space in conditional statement for character spacing logic - Adjust spacing in trigonometric checks for line direction- Improve overall code readability and consistency --- magic_pdf/pdf_parse_union_core_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index ef069487..20077b5d 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -94,7 +94,7 @@ def chars_to_content(span): # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格 char1 = char char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None - if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ': + if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ': content += f"{char['c']} " else: content += char['c'] @@ -182,7 +182,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang for block in text_blocks_raw: for line in block['lines']: cosine, sine = line['dir'] - if abs (cosine) < 0.9 or abs(sine) > 0.1: + if abs(cosine) < 0.9 or abs(sine) > 0.1: continue for span in line['spans']: all_pymu_chars.extend(span['chars']) From f911a102ab199589981e9caf6a89b9fb287cb45f Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 5 Jan 2025 20:45:19 +0800 Subject: [PATCH 12/16] feat(tools): add character bounding box drawing functionality - Add `draw_char_bbox` function to `draw_bbox.py` for drawing character bounding boxes - Integrate `draw_char_bbox` into `common.py` for use in PDF processing pipeline - Include option to draw character bounding boxes in debug mode --- magic_pdf/libs/draw_bbox.py | 20 ++++++++------------ magic_pdf/tools/common.py | 6 ++++++ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py index 5cf04ac3..6d70c913 100644 --- a/magic_pdf/libs/draw_bbox.py +++ b/magic_pdf/libs/draw_bbox.py @@ -394,17 +394,13 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename): pdf_docs.save(f'{out_path}/{filename}') -def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename): - layout_bbox_list = [] - - for page in pdf_info: - page_block_list = [] - for block in page['para_blocks']: - bbox = block['bbox'] - page_block_list.append(bbox) - layout_bbox_list.append(page_block_list) +def draw_char_bbox(pdf_bytes, out_path, filename): pdf_docs = fitz.open('pdf', pdf_bytes) for i, page in enumerate(pdf_docs): - draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False) - - pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf') + for block in page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']: + for line in block['lines']: + for span in line['spans']: + for char in span['chars']: + char_bbox = char['bbox'] + page.draw_rect(char_bbox, color=[1, 0, 0], fill=None, fill_opacity=1, width=0.3, overlay=True,) + 
pdf_docs.save(f'{out_path}/{filename}') diff --git a/magic_pdf/tools/common.py b/magic_pdf/tools/common.py index 565425aa..e5007b2a 100644 --- a/magic_pdf/tools/common.py +++ b/magic_pdf/tools/common.py @@ -9,6 +9,7 @@ from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.data.data_reader_writer import FileBasedDataWriter from magic_pdf.data.dataset import PymuDocDataset +from magic_pdf.libs.draw_bbox import draw_char_bbox from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.operators.models import InferenceResult @@ -83,6 +84,7 @@ def do_parse( f_make_md_mode=MakeMode.MM_MD, f_draw_model_bbox=False, f_draw_line_sort_bbox=False, + f_draw_char_bbox=False, start_page_id=0, end_page_id=None, lang=None, @@ -94,6 +96,7 @@ def do_parse( logger.warning('debug mode is on') f_draw_model_bbox = True f_draw_line_sort_bbox = True + # f_draw_char_bbox = True pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf( pdf_bytes, start_page_id, end_page_id @@ -205,6 +208,9 @@ def do_parse( os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf') ) + if f_draw_char_bbox: + draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf') + if f_dump_md: pipe_result.dump_md( md_writer, From 2e8601ab93168973530b33673201afc8d81afe84 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 5 Jan 2025 21:17:00 +0800 Subject: [PATCH 13/16] docs(README): update documentation for NPU support - Add section for using NPU acceleration in both English and Chinese README files - Update system requirements to include CANN environment for NPU support - Enhance the "Quick Start" guide with NPU-related information- Modify hardware requirements to specify "Ascend 910b" for NPU acceleration --- README.md | 25 +++- README_ja-JP.md | 327 ------------------------------------------------ README_zh-CN.md | 23 +++- 3 files changed, 39 insertions(+), 336 deletions(-) delete mode 100644 README_ja-JP.md diff --git a/README.md b/README.md index a085c7d0..0c3583c9 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@
  • Online Demo
  • Quick CPU Demo
  • Using GPU
  • +
  • Using NPU
  • Usage @@ -129,7 +130,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c - OCR supports detection and recognition of 84 languages. - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats. - Supports various visualization results, including layout visualization and span visualization, for efficient confirmation of output quality. -- Supports both CPU and GPU environments. +- Supports running in a pure CPU environment, and also supports GPU/NPU acceleration - Compatible with Windows, Linux, and Mac platforms. ## Quick Start @@ -141,6 +142,7 @@ There are three different ways to experience MinerU: - [Online Demo (No Installation Required)](#online-demo) - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo) - [Linux/Windows + CUDA](#Using-GPU) +- [Linux + CANN](#using-npu) > [!WARNING] > **Pre-installation Notice—Hardware and Software Environment Support** @@ -156,20 +158,24 @@ There are three different ways to experience MinerU: Operating System - Ubuntu 22.04 LTS + Linux after 2019 Windows 10 / 11 macOS 11+ CPU - x86_64(unsupported ARM Linux) + x86_64 / arm64 x86_64(unsupported ARM Windows) x86_64 / arm64 - Memory + Memory Requirements 16GB or more, recommended 32GB+ + + Storage Requirements + 20GB or more, with a preference for SSD + Python Version 3.10(Please make sure to create a Python 3.10 virtual environment using conda) @@ -186,6 +192,12 @@ There are three different ways to experience MinerU: 11.8 (manual installation) + cuDNN v8.7.0 (manual installation) None + + CANN Environment(NPU support) + 8.0+(Ascend 910b) + None + None + GPU Hardware Support List GPU VRAM 8GB or more @@ -273,6 +285,11 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi magic-pdf --help ``` +### Using NPU + +If your device has NPU acceleration hardware, you can follow the tutorial below to use NPU acceleration: + + ## Usage ### Command Line diff --git a/README_ja-JP.md b/README_ja-JP.md deleted file mode 100644 index 08b1195f..00000000 --- a/README_ja-JP.md +++ /dev/null @@ -1,327 +0,0 @@ -> [!Warning] -> このドキュメントはすでに古くなっています。最新版のドキュメントを参照してください:[ENGLISH](README.md)。 -
    - -

    - -

    - -
    -
    - -[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) -[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) -[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) -[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) -[![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf) -[![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf) -[![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf) - -opendatalab%2FMinerU | Trendshift - - - - - -[English](README.md) | [简体中文](README_zh-CN.md) | [日本語](README_ja-JP.md) - -
    - - - -# MinerU - - -## 紹介 - -MinerUは、ワンストップのオープンソースで高品質なデータ抽出ツールであり、以下の主要な機能を含みます: - -- [Magic-PDF](#Magic-PDF) PDFドキュメント抽出 -- [Magic-Doc](#Magic-Doc) ウェブページと電子書籍の抽出 - - -# Magic-PDF - - -## 紹介 - -Magic-PDFは、PDFドキュメントをMarkdown形式に変換するためのツールであり、ローカルに保存されたファイルやS3プロトコルをサポートするオブジェクトストレージ上のファイルを処理することができます。 - -主な機能は以下の通りです: - -- 複数のフロントエンドモデル入力をサポート -- ヘッダー、フッター、脚注、ページ番号の削除 -- 人間が読みやすいレイアウトフォーマット -- 見出し、段落、リストなど、元のドキュメントの構造とフォーマットを保持 -- 画像や表を抽出してmarkdown内に表示 -- 数式をLaTeX形式に変換 -- 文字化けしたPDFの自動検出と変換 -- CPUおよびGPU環境に対応 -- Windows、Linux、macOSプラットフォームに対応 - - -https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c - - - -## プロジェクト全景 - -![プロジェクト全景](docs/images/project_panorama_en.png) - - -## フローチャート - -![フローチャート](docs/images/flowchart_en.png) - -### 依存リポジトリ - -- [PDF-Extract-Kit : 高品質なPDFコンテンツ抽出のための包括的なツールキット](https://github.com/opendatalab/PDF-Extract-Kit) 🚀🚀🚀 - -## 入門ガイド - -### 要件 - -- Python >= 3.9 - -依存関係の競合を避けるために、仮想環境の使用をお勧めします。venvとcondaの両方が適しています。 -例: -```bash -conda create -n MinerU python=3.10 -conda activate MinerU -``` - -### インストールと設定 - -#### 1. Magic-PDFのインストール - -**1.依存パッケージのインストール** - -フル機能パッケージはdetectron2に依存しており、コンパイルインストールが必要です。 -自分でコンパイルする必要がある場合は、https://github.com/facebookresearch/detectron2/issues/5114 を参照してください。 -または、私たちの事前コンパイルされたwhlパッケージを直接使用できます(Python 3.10に限定): - -```bash -pip install detectron2 --extra-index-url https://wheels.myhloli.com -``` - -**2.pipを使用してフル機能パッケージをインストールします** ->注意:pipでインストールされたパッケージはCPUのみをサポートし、クイックテストに最適です。 -> ->CUDA/MPSによる加速については、[CUDAまたはMPSによる加速](#4-CUDAまたはMPSによる加速)を参照してください。 - -```bash -pip install -U magic-pdf[full] -``` - -> ❗️❗️❗️ -> 私たちは0.6.2 ベータ版を事前にリリースし、私たちのログに記載されている多くの問題に対処しました。しかし、このビルドはまだ完全なQAテストを経ておらず、最終的なリリース品質を表していません。問題に遭遇した場合は、問題を通じて速やかに報告するか、0.6.1バージョンに戻ることをお願いします。 -> ```bash -> pip install -U magic-pdf[full] -> ``` - - -#### 2. モデルの重みファイルのダウンロード - -詳細については、[how_to_download_models](docs/how_to_download_models_en.md)を参照してください。 - -モデルの重みをダウンロードした後、'models'ディレクトリを大きなディスクスペースのあるディレクトリに移動します。できればSSDに移動してください。 - - -#### 3. 設定ファイルのコピーと設定 -リポジトリのルートディレクトリに[magic-pdf.template.json](magic-pdf.template.json)ファイルがあります。 -```bash -cp magic-pdf.template.json ~/magic-pdf.json -``` -magic-pdf.jsonで、"models-dir"をモデルの重みファイルがあるディレクトリに設定します。 - -```json -{ - "models-dir": "/tmp/models" -} -``` - - -#### 4. CUDAまたはMPSによる加速 -利用可能なNvidia GPUを持っている場合や、Apple Siliconを搭載したMacを使用している場合は、それぞれCUDAまたはMPSによる加速を利用できます。 -##### CUDA - -CUDAバージョンに応じたPyTorchバージョンをインストールする必要があります。 -この例では、CUDA 11.8バージョンをインストールします。詳細はhttps://pytorch.org/get-started/locally/ を参照してください。 -```bash -pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118 -``` -また、設定ファイルmagic-pdf.jsonの"device-mode"の値を変更する必要があります。 -```json -{ - "device-mode":"cuda" -} -``` - -##### MPS - -Mシリーズチップデバイスを搭載したmacOSユーザーは、推論加速のためにMPSを使用できます。 -設定ファイルmagic-pdf.jsonの"device-mode"の値を変更する必要があります。 -```json -{ - "device-mode":"mps" -} -``` - - -### 使用方法 - -#### 1. コマンドラインでの使用 - -###### シンプル - -```bash -magic-pdf pdf-command --pdf "pdf_path" --inside_model true -``` -プログラムが終了した後、"/tmp/magic-pdf"ディレクトリに生成されたmarkdownファイルが見つかります。 -markdownディレクトリには対応するxxx_model.jsonファイルがあります。 -ポストプロセッシングパイプラインの二次開発を行う場合は、次のコマンドを使用できます: -```bash -magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path" -``` -この方法では、モデルデータを再実行する必要がなくなり、デバッグが便利になります。 - - -###### 詳細 - -```bash -magic-pdf --help -``` - - -#### 2. 
APIを使用した利用 - -###### ローカル -```python -image_writer = DiskReaderWriter(local_image_dir) -image_dir = str(os.path.basename(local_image_dir)) -jso_useful_key = {"_pdf_type": "", "model_list": []} -pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) -pipe.pipe_classify() -pipe.pipe_parse() -md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") -``` - -###### オブジェクトストレージ -```python -s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint) -image_dir = "s3://img_bucket/" -s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir) -pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN) -jso_useful_key = {"_pdf_type": "", "model_list": []} -pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli) -pipe.pipe_classify() -pipe.pipe_parse() -md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") -``` - -デモは[demo.py](demo/demo.py)を参照してください - - -# Magic-Doc - - -## 紹介 - -Magic-Docは、ウェブページや多形式の電子書籍をmarkdown形式に変換するためのツールです。 - -主な機能は以下の通りです: - -- ウェブページ抽出 - - テキスト、画像、表、数式情報のクロスモーダルな正確な解析。 - -- 電子書籍ドキュメント抽出 - - epub、mobiなどのさまざまなドキュメント形式をサポートし、テキストと画像に完全対応。 - -- 言語タイプの識別 - - 176の言語を正確に認識。 - -https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca - - - -https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d - - - -https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2 - - - - -## プロジェクトリポジトリ - -- [Magic-Doc](https://github.com/InternLM/magic-doc) - 優れたウェブページと電子書籍の抽出ツール - - -# 貢献者の皆様に感謝 - - - - - - -# ライセンス情報 - -[LICENSE.md](LICENSE.md) - -このプロジェクトは現在、PyMuPDFを利用して高度な機能を提供していますが、AGPLライセンスに準拠しているため、特定の使用ケースに制限を課す可能性があります。今後のバージョンでは、より寛容なライセンスのPDF処理ライブラリへの移行を検討し、ユーザーフレンドリーさと柔軟性を向上させる予定です。 - - -# 謝辞 - -- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) -- [PyMuPDF](https://github.com/pymupdf/PyMuPDF) -- [fast-langdetect](https://github.com/LlmKira/fast-langdetect) -- [pdfminer.six](https://github.com/pdfminer/pdfminer.six) - - -# 引用 - -```bibtex -@misc{wang2024mineruopensourcesolutionprecise, - title={MinerU: An Open-Source Solution for Precise Document Content Extraction}, - author={Bin Wang and Chao Xu and Xiaomeng Zhao and Linke Ouyang and Fan Wu and Zhiyuan Zhao and Rui Xu and Kaiwen Liu and Yuan Qu and Fukai Shang and Bo Zhang and Liqun Wei and Zhihao Sui and Wei Li and Botian Shi and Yu Qiao and Dahua Lin and Conghui He}, - year={2024}, - eprint={2409.18839}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/2409.18839}, -} - -@article{he2024opendatalab, - title={Opendatalab: Empowering general artificial intelligence with open datasets}, - author={He, Conghui and Li, Wei and Jin, Zhenjiang and Xu, Chao and Wang, Bin and Lin, Dahua}, - journal={arXiv preprint arXiv:2407.13773}, - year={2024} -} -``` - -# スター履歴 - - - - - - Star History Chart - - - -# リンク -- [LabelU (軽量なマルチモーダルデータアノテーションツール)](https://github.com/opendatalab/labelU) -- [LabelLLM (オープンソースのLLM対話アノテーションプラットフォーム)](https://github.com/opendatalab/LabelLLM) -- [PDF-Extract-Kit (高品質なPDFコンテンツ抽出のための包括的なツールキット)](https://github.com/opendatalab/PDF-Extract-Kit) diff --git a/README_zh-CN.md b/README_zh-CN.md index 52b6c5b6..f662d28c 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -81,6 +81,7 @@
  • 在线体验
  • 使用CPU快速体验
  • 使用GPU
  • +
  • 使用NPU
  • 使用方式 @@ -129,7 +130,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c - OCR支持84种语言的检测与识别 - 支持多种输出格式,如多模态与NLP的Markdown、按阅读顺序排序的JSON、含有丰富信息的中间格式等 - 支持多种可视化结果,包括layout可视化、span可视化等,便于高效确认输出效果与质检 -- 支持CPU和GPU环境 +- 支持纯CPU环境运行,并支持GPU/NPU加速 - 兼容Windows、Linux和Mac平台 ## 快速开始 @@ -141,6 +142,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c - [在线体验(无需任何安装)](#在线体验) - [使用CPU快速体验(Windows,Linux,Mac)](#使用cpu快速体验) - [Linux/Windows + CUDA](#使用gpu) +- [Linux + CANN](#使用NPU) > [!WARNING] @@ -157,13 +159,13 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c 操作系统 - Ubuntu 22.04 LTS + Linux after 2019 Windows 10 / 11 macOS 11+ CPU - x86_64(暂不支持ARM Linux) + x86_64 / arm64 x86_64(暂不支持ARM Windows) x86_64 / arm64 @@ -171,6 +173,10 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c 内存 大于等于16GB,推荐32G以上 + + 存储空间 + 大于等于20GB,推荐使用SSD以获得最佳性能 + python版本 3.10 (请务必通过conda创建3.10虚拟环境) @@ -187,6 +193,12 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c 11.8(手动安装)+cuDNN v8.7.0(手动安装) None + + CANN环境(NPU支持) + 8.0+(Ascend 910b) + None + None + GPU硬件支持列表 显存8G以上 @@ -195,7 +207,6 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c 8G显存及以上可开启全部加速功能 None - ### 在线体验 @@ -278,7 +289,9 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h docker run --rm -it --gpus=all mineru:latest /bin/bash magic-pdf --help ``` - +### 使用NPU + +如果您的设备存在NPU加速硬件,则可以通过以下教程使用NPU加速: ## 使用 From ad09980807be8f9fd6d02af6fc06731ad8a70ba8 Mon Sep 17 00:00:00 2001 From: myhloli Date: Mon, 6 Jan 2025 10:38:14 +0800 Subject: [PATCH 14/16] build(docker): add Dockerfiles for global and Huawei NPU setups - Add Dockerfile for global setup with Ubuntu base image - Add Dockerfile for Huawei NPU setup with Ascend base image - Update requirements file structure: - Rename requirements-docker.txt to docker/china/requirements.txt - Add new requirements files for global and Huawei NPU setups - Install necessary packages and dependencies in both Dockerfiles- Set up virtual environment and install Python packages - Download models and configure magic-pdf for both setups --- Dockerfile => docker/china/Dockerfile | 4 +- docker/china/requirements.txt | 25 ++++++++++ docker/global/Dockerfile | 50 +++++++++++++++++++ docker/global/requirements.txt | 25 ++++++++++ docker/huawei_npu/Dockerfile | 49 ++++++++++++++++++ .../huawei_npu/requirements.txt | 1 + 6 files changed, 152 insertions(+), 2 deletions(-) rename Dockerfile => docker/china/Dockerfile (89%) create mode 100644 docker/china/requirements.txt create mode 100644 docker/global/Dockerfile create mode 100644 docker/global/requirements.txt create mode 100644 docker/huawei_npu/Dockerfile rename requirements-docker.txt => docker/huawei_npu/requirements.txt (95%) diff --git a/Dockerfile b/docker/china/Dockerfile similarity index 89% rename from Dockerfile rename to docker/china/Dockerfile index 870432d7..3d8a8050 100644 --- a/Dockerfile +++ b/docker/china/Dockerfile @@ -30,8 +30,8 @@ RUN python3 -m venv /opt/mineru_venv # Activate the virtual environment and install necessary Python packages RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \ pip3 install --upgrade pip && \ - wget https://gitee.com/myhloli/MinerU/raw/master/requirements-docker.txt && \ - pip3 install -r requirements-docker.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \ + wget 
https://gitee.com/myhloli/MinerU/raw/master/docker/china/requirements.txt && \ + pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \ pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/" # Copy the configuration file template and install magic-pdf latest diff --git a/docker/china/requirements.txt b/docker/china/requirements.txt new file mode 100644 index 00000000..4ebd459a --- /dev/null +++ b/docker/china/requirements.txt @@ -0,0 +1,25 @@ +boto3>=1.28.43 +Brotli>=1.1.0 +click>=8.1.7 +PyMuPDF>=1.24.9 +loguru>=0.6.0 +numpy>=1.21.6,<2.0.0 +fast-langdetect==0.2.0 +scikit-learn>=1.0.2 +pdfminer.six==20231228 +unimernet==0.2.3 +torch>=2.2.2,<=2.3.1 +torchvision>=0.17.2,<=0.18.1 +matplotlib +ultralytics>=8.3.48 +paddleocr==2.7.3 +struct-eqtable==0.3.2 +einops +accelerate +doclayout_yolo==0.0.2 +rapidocr-paddle +rapidocr-onnxruntime +rapid_table +doclayout-yolo==0.0.2 +openai +detectron2 diff --git a/docker/global/Dockerfile b/docker/global/Dockerfile new file mode 100644 index 00000000..15db7d30 --- /dev/null +++ b/docker/global/Dockerfile @@ -0,0 +1,50 @@ +# Use the official Ubuntu base image +FROM ubuntu:22.04 + +# Set environment variables to non-interactive to avoid prompts during installation +ENV DEBIAN_FRONTEND=noninteractive + +# Update the package list and install necessary packages +RUN apt-get update && \ + apt-get install -y \ + software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y \ + python3.10 \ + python3.10-venv \ + python3.10-distutils \ + python3-pip \ + wget \ + git \ + libgl1 \ + libglib2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# Set Python 3.10 as the default python3 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 + +# Create a virtual environment for MinerU +RUN python3 -m venv /opt/mineru_venv + +# Activate the virtual environment and install necessary Python packages +RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \ + pip3 install --upgrade pip && \ + wget https://github.com/opendatalab/MinerU/raw/master/docker/global/requirements.txt && \ + pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com && \ + pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/" + +# Copy the configuration file template and install magic-pdf latest +RUN /bin/bash -c "wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json && \ + cp magic-pdf.template.json /root/magic-pdf.json && \ + source /opt/mineru_venv/bin/activate && \ + pip3 install -U magic-pdf" + +# Download models and update the configuration file +RUN /bin/bash -c "pip3 install huggingface_hub && \ + wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models.py && \ + python3 download_models.py && \ + sed -i 's|cpu|cuda|g' /root/magic-pdf.json" + +# Set the entry point to activate the virtual environment and run the command line tool +ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"] diff --git a/docker/global/requirements.txt b/docker/global/requirements.txt new file mode 100644 index 00000000..4ebd459a --- /dev/null +++ b/docker/global/requirements.txt @@ -0,0 +1,25 @@ +boto3>=1.28.43 +Brotli>=1.1.0 +click>=8.1.7 +PyMuPDF>=1.24.9 +loguru>=0.6.0 +numpy>=1.21.6,<2.0.0 +fast-langdetect==0.2.0 +scikit-learn>=1.0.2 +pdfminer.six==20231228 
+unimernet==0.2.3
+torch>=2.2.2,<=2.3.1
+torchvision>=0.17.2,<=0.18.1
+matplotlib
+ultralytics>=8.3.48
+paddleocr==2.7.3
+struct-eqtable==0.3.2
+einops
+accelerate
+doclayout_yolo==0.0.2
+rapidocr-paddle
+rapidocr-onnxruntime
+rapid_table
+doclayout-yolo==0.0.2
+openai
+detectron2
diff --git a/docker/huawei_npu/Dockerfile b/docker/huawei_npu/Dockerfile
new file mode 100644
index 00000000..93ea7d54
--- /dev/null
+++ b/docker/huawei_npu/Dockerfile
@@ -0,0 +1,49 @@
+# Use the official Ubuntu base image
+FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-infer:24.0.RC3-ubuntu20.04
+
+# Set environment variables to non-interactive to avoid prompts during installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Update the package list and install necessary packages
+RUN apt-get update && \
+    apt-get install -y \
+    software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y \
+    python3.10 \
+    python3.10-venv \
+    python3.10-distutils \
+    python3-pip \
+    wget \
+    git \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set Python 3.10 as the default python3
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
+
+# Create a virtual environment for MinerU
+RUN python3 -m venv /opt/mineru_venv
+
+# Activate the virtual environment and install necessary Python packages
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip3 install --upgrade pip && \
+    wget https://gitee.com/myhloli/MinerU/raw/dev/docker/huawei_npu/requirements.txt && \
+    pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple"
+
+# Copy the configuration file template and install magic-pdf latest
+RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json && \
+    cp magic-pdf.template.json /root/magic-pdf.json && \
+    source /opt/mineru_venv/bin/activate && \
+    pip3 install git+https://gitee.com/myhloli/MinerU.git@dev"
+
+# Download models and update the configuration file
+RUN /bin/bash -c "pip3 install modelscope && \
+    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py && \
+    python3 download_models.py && \
+    sed -i 's|cpu|npu|g' /root/magic-pdf.json"
+
+# Set the entry point to activate the virtual environment and run the command line tool
+ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
diff --git a/requirements-docker.txt b/docker/huawei_npu/requirements.txt
similarity index 95%
rename from requirements-docker.txt
rename to docker/huawei_npu/requirements.txt
index 665d2b62..1efa80e2 100644
--- a/requirements-docker.txt
+++ b/docker/huawei_npu/requirements.txt
@@ -19,6 +19,7 @@ einops
 accelerate
 doclayout_yolo==0.0.2
 rapidocr-paddle
+rapidocr-onnxruntime
 rapid_table
 doclayout-yolo==0.0.2
 openai

From 36c3ad6f8ca527b6d68741d005d2fcb0b02f2b31 Mon Sep 17 00:00:00 2001
From: myhloli
Date: Mon, 6 Jan 2025 10:55:51 +0800
Subject: [PATCH 15/16] build(docker): update Dockerfiles and download scripts

- Update Dockerfiles in china, global, and huawei_npu directories
- Improve wget commands by specifying output file names
- Update READMEs to reflect new Dockerfile locations
---
 README.md                    | 2 +-
 README_zh-CN.md              | 2 +-
 docker/china/Dockerfile      | 4 ++--
 docker/global/Dockerfile     | 2 +-
 docker/huawei_npu/Dockerfile | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 0c3583c9..b7eb02a2 100644
--- a/README.md
+++ b/README.md
@@ -279,7 +279,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
 >   docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
 >   ```
   ```bash
-  wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+  wget https://github.com/opendatalab/MinerU/raw/master/docker/global/Dockerfile -O Dockerfile
   docker build -t mineru:latest .
   docker run --rm -it --gpus=all mineru:latest /bin/bash
   magic-pdf --help
diff --git a/README_zh-CN.md b/README_zh-CN.md
index f662d28c..dcb8d2c3 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -284,7 +284,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
 >   docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
 >   ```
   ```bash
-  wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+  wget https://github.com/opendatalab/MinerU/raw/master/docker/china/Dockerfile -O Dockerfile
   docker build -t mineru:latest .
   docker run --rm -it --gpus=all mineru:latest /bin/bash
   magic-pdf --help
diff --git a/docker/china/Dockerfile b/docker/china/Dockerfile
index 3d8a8050..ff83041c 100644
--- a/docker/china/Dockerfile
+++ b/docker/china/Dockerfile
@@ -30,7 +30,7 @@ RUN python3 -m venv /opt/mineru_venv
 # Activate the virtual environment and install necessary Python packages
 RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
     pip3 install --upgrade pip && \
-    wget https://gitee.com/myhloli/MinerU/raw/master/docker/china/requirements.txt && \
+    wget https://gitee.com/myhloli/MinerU/raw/master/docker/china/requirements.txt -O requirements.txt && \
     pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \
     pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
 
@@ -42,7 +42,7 @@ RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.tem
 
 # Download models and update the configuration file
 RUN /bin/bash -c "pip3 install modelscope && \
-    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py && \
+    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py && \
     python3 download_models.py && \
     sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
 
diff --git a/docker/global/Dockerfile b/docker/global/Dockerfile
index 15db7d30..fae0a845 100644
--- a/docker/global/Dockerfile
+++ b/docker/global/Dockerfile
@@ -30,7 +30,7 @@ RUN python3 -m venv /opt/mineru_venv
 # Activate the virtual environment and install necessary Python packages
 RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
     pip3 install --upgrade pip && \
-    wget https://github.com/opendatalab/MinerU/raw/master/docker/global/requirements.txt && \
+    wget https://github.com/opendatalab/MinerU/raw/master/docker/global/requirements.txt -O requirements.txt && \
     pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com && \
     pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
 
diff --git a/docker/huawei_npu/Dockerfile b/docker/huawei_npu/Dockerfile
index 93ea7d54..8589c9c0 100644
--- a/docker/huawei_npu/Dockerfile
+++ b/docker/huawei_npu/Dockerfile
@@ -30,7 +30,7 @@ RUN python3 -m venv /opt/mineru_venv
 # Activate the virtual environment and install necessary Python packages
 RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
     pip3 install --upgrade pip && \
-    wget https://gitee.com/myhloli/MinerU/raw/dev/docker/huawei_npu/requirements.txt && \
+    wget https://gitee.com/myhloli/MinerU/raw/dev/docker/huawei_npu/requirements.txt -O requirements.txt && \
     pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple"
 
 # Copy the configuration file template and install magic-pdf latest
@@ -41,7 +41,7 @@ RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.tem
 
 # Download models and update the configuration file
 RUN /bin/bash -c "pip3 install modelscope && \
-    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py && \
+    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py && \
     python3 download_models.py && \
     sed -i 's|cpu|npu|g' /root/magic-pdf.json"
 

From 2e1bf88174d421440867c7cf48cdd897dd10930d Mon Sep 17 00:00:00 2001
From: myhloli
Date: Mon, 6 Jan 2025 18:07:40 +0800
Subject: [PATCH 16/16] build(docker): update Dockerfiles for China and Huawei NPU versions

- Update package sources to use Aliyun mirrors for faster downloads
- Upgrade pip and install Python packages in virtual environment
- Add python3.10-dev package to Huawei NPU Dockerfile
- Update requirements file URLs to master branch
- Install specific version of torch_npu in Huawei NPU Dockerfile
- Update magic-pdf installation method
- Improve modelscope installation process
- Optimize model download and configuration update steps
---
 docker/china/Dockerfile      |  2 +-
 docker/huawei_npu/Dockerfile | 22 ++++++++++++++--------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/docker/china/Dockerfile b/docker/china/Dockerfile
index ff83041c..0a198c90 100644
--- a/docker/china/Dockerfile
+++ b/docker/china/Dockerfile
@@ -29,7 +29,7 @@ RUN python3 -m venv /opt/mineru_venv
 
 # Activate the virtual environment and install necessary Python packages
 RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip && \
+    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
     wget https://gitee.com/myhloli/MinerU/raw/master/docker/china/requirements.txt -O requirements.txt && \
     pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \
     pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
 
@@ -42,7 +42,7 @@ RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.tem
 
 # Download models and update the configuration file
 RUN /bin/bash -c "pip3 install modelscope && \
-    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py && \
+    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py && \
     python3 download_models.py && \
     sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
 
diff --git a/docker/huawei_npu/Dockerfile b/docker/huawei_npu/Dockerfile
index 8589c9c0..980cfa4e 100644
--- a/docker/huawei_npu/Dockerfile
+++ b/docker/huawei_npu/Dockerfile
@@ -1,5 +1,7 @@
 # Use the official Ubuntu base image
-FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-infer:24.0.RC3-ubuntu20.04
+FROM swr.cn-central-221.ovaijisuan.com/mindformers/mindformers1.2_mindspore2.3:20240722
+
+USER root
 
 # Set environment variables to non-interactive to avoid prompts during installation
 ENV DEBIAN_FRONTEND=noninteractive
@@ -14,7 +16,8 @@ RUN apt-get update && \
     python3.10 \
     python3.10-venv \
     python3.10-distutils \
-    python3-pip \
+    python3.10-dev \
+    python3-pip \
     wget \
     git \
     libgl1 \
@@ -29,21 +32,24 @@ RUN python3 -m venv /opt/mineru_venv
 
 # Activate the virtual environment and install necessary Python packages
 RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip && \
-    wget https://gitee.com/myhloli/MinerU/raw/dev/docker/huawei_npu/requirements.txt -O requirements.txt && \
-    pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple"
+    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
+    wget https://gitee.com/myhloli/MinerU/raw/master/docker/huawei_npu/requirements.txt -O requirements.txt && \
+    pip3 install -r requirements.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \
+    wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
+    pip install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
 
 # Copy the configuration file template and install magic-pdf latest
 RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json && \
     cp magic-pdf.template.json /root/magic-pdf.json && \
     source /opt/mineru_venv/bin/activate && \
-    pip3 install git+https://gitee.com/myhloli/MinerU.git@dev"
+    pip3 install -U magic-pdf"
 
 # Download models and update the configuration file
-RUN /bin/bash -c "pip3 install modelscope && \
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
     wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py && \
     python3 download_models.py && \
     sed -i 's|cpu|npu|g' /root/magic-pdf.json"
 
 # Set the entry point to activate the virtual environment and run the command line tool
-ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
+ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
\ No newline at end of file
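Usage note (not part of the patches above): the README hunk earlier in this series adds a "### 使用NPU" section that points to an NPU tutorial, but no build/run commands appear in this series itself. The sketch below shows one way the docker/huawei_npu image added here might be built and run. The image tag `mineru-npu:latest` is arbitrary, and the `--device`/`-v` mappings follow the usual Ascend container conventions rather than anything specified in these Dockerfiles — they assume the Ascend driver, firmware, and npu-smi are already installed on the host and may need to be adjusted to the local CANN setup.

```bash
# Build the Huawei NPU image from the repository root.
# Assumes docker/huawei_npu/Dockerfile from this patch series is present.
docker build -t mineru-npu:latest -f docker/huawei_npu/Dockerfile .

# Run it with the Ascend NPU devices and host driver passed through.
# Device nodes and mount paths below are conventional for Ascend 910b hosts (assumed, not from this series);
# adjust them to match the local driver installation.
docker run --rm -it \
  --device /dev/davinci0 \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  mineru-npu:latest /bin/bash

# Inside the container, magic-pdf should pick up the npu device-mode that the
# Dockerfile writes into /root/magic-pdf.json via sed.
magic-pdf --help
```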