opendatalab · MatthewZMD · Dec 30, 2024
diff --git a/magic_pdf/pipe/AbsPipe.py b/magic_pdf/pipe/AbsPipe.py
@@ -56,8 +56,10 @@ def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, m
         return md_content
 
     @staticmethod
-    def classify(pdf_bytes: bytes) -> str:
+    def classify(self) -> str:
         """根据pdf的元数据，判断是文本pdf，还是ocr pdf."""
+        pdf_bytes = self.dataset.data_bits()
+
         pdf_meta = pdf_meta_scan(pdf_bytes)
         if pdf_meta.get('_need_drop', False):  # 如果返回了需要丢弃的标志，则抛出异常
             raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")

diff --git a/magic_pdf/pipe/UNIPipe.py b/magic_pdf/pipe/UNIPipe.py
@@ -26,7 +26,6 @@ def __init__(
         formula_enable=None,
         table_enable=None,
     ):
-        self.pdf_type = jso_useful_key['_pdf_type']
         super().__init__(
             dataset,
             jso_useful_key['model_list'],
@@ -39,13 +38,14 @@ def __init__(
             formula_enable,
             table_enable,
         )
+        self.pdf_type = jso_useful_key['_pdf_type']
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
             self.input_model_is_empty = False
 
     def pipe_classify(self):
-        self.pdf_type = AbsPipe.classify(self.pdf_bytes)
+        self.pdf_type = AbsPipe.classify(self)  # classify is a @staticmethod, so the pipe is passed explicitly
 
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
@@ -115,8 +115,9 @@ def pipe_mk_markdown(
 
 
 if __name__ == '__main__':
-    # 测试
+    # Testing
     from magic_pdf.data.data_reader_writer import DataReader
+    from magic_pdf.data.dataset import PymuDocDataset  # Import the concrete dataset class
 
     drw = DataReader(r'D:/project/20231108code-clean')
 
@@ -129,14 +130,11 @@ def pipe_mk_markdown(
     img_bucket_path = 'imgs'
     img_writer = DataWriter(join_path(write_path, img_bucket_path))
 
-    # pdf_type = UNIPipe.classify(pdf_bytes)
-    # jso_useful_key = {
-    #     "_pdf_type": pdf_type,
-    #     "model_list": model_list
-    # }
+    # Create dataset instance instead of using raw bytes
+    dataset = PymuDocDataset(pdf_bytes)
 
     jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
-    pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
+    pipe = UNIPipe(dataset, jso_useful_key, img_writer)
     pipe.pipe_classify()
     pipe.pipe_parse()
     md_content = pipe.pipe_mk_markdown(img_bucket_path)