From 994b974f07c9ca1ff4ef584dfcaa49aeb792d21e Mon Sep 17 00:00:00 2001 From: "Mingde (Matthew) Zeng" Date: Sun, 29 Dec 2024 23:42:48 -0500 Subject: [PATCH] Fix classify: there is no more pdf_bytes in UNIPipe Signed-off-by: Mingde (Matthew) Zeng --- magic_pdf/pipe/AbsPipe.py | 4 +++- magic_pdf/pipe/UNIPipe.py | 16 +++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/magic_pdf/pipe/AbsPipe.py b/magic_pdf/pipe/AbsPipe.py index 93ac20c3..6511ae64 100644 --- a/magic_pdf/pipe/AbsPipe.py +++ b/magic_pdf/pipe/AbsPipe.py @@ -56,8 +56,10 @@ def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, m return md_content @staticmethod - def classify(pdf_bytes: bytes) -> str: + def classify(self) -> str: """根据pdf的元数据,判断是文本pdf,还是ocr pdf.""" + pdf_bytes = self.dataset.data_bits() + pdf_meta = pdf_meta_scan(pdf_bytes) if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常 raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}") diff --git a/magic_pdf/pipe/UNIPipe.py b/magic_pdf/pipe/UNIPipe.py index 1809492e..a9e10f6d 100644 --- a/magic_pdf/pipe/UNIPipe.py +++ b/magic_pdf/pipe/UNIPipe.py @@ -26,7 +26,6 @@ def __init__( formula_enable=None, table_enable=None, ): - self.pdf_type = jso_useful_key['_pdf_type'] super().__init__( dataset, jso_useful_key['model_list'], @@ -39,13 +38,14 @@ def __init__( formula_enable, table_enable, ) + self.pdf_type = jso_useful_key['_pdf_type'] if len(self.model_list) == 0: self.input_model_is_empty = True else: self.input_model_is_empty = False def pipe_classify(self): - self.pdf_type = AbsPipe.classify(self.pdf_bytes) + self.pdf_type = AbsPipe.classify(self) def pipe_analyze(self): if self.pdf_type == self.PIP_TXT: @@ -115,8 +115,9 @@ def pipe_mk_markdown( if __name__ == '__main__': - # 测试 + # Testing from magic_pdf.data.data_reader_writer import DataReader + from magic_pdf.data.dataset import PymuDocDataset # Import the concrete dataset class drw = DataReader(r'D:/project/20231108code-clean') @@ -129,14 +130,11 @@ def pipe_mk_markdown( img_bucket_path = 'imgs' img_writer = DataWriter(join_path(write_path, img_bucket_path)) - # pdf_type = UNIPipe.classify(pdf_bytes) - # jso_useful_key = { - # "_pdf_type": pdf_type, - # "model_list": model_list - # } + # Create dataset instance instead of using raw bytes + dataset = PymuDocDataset(pdf_bytes) jso_useful_key = {'_pdf_type': '', 'model_list': model_list} - pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer) + pipe = UNIPipe(dataset, jso_useful_key, img_writer) pipe.pipe_classify() pipe.pipe_parse() md_content = pipe.pipe_mk_markdown(img_bucket_path)