Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix classify: there is no more pdf_bytes in UNIPipe #1379

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion magic_pdf/pipe/AbsPipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,10 @@ def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, m
return md_content

@staticmethod
def classify(pdf_bytes: bytes) -> str:
def classify(self) -> str:
"""根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
pdf_bytes = self.dataset.data_bits()

pdf_meta = pdf_meta_scan(pdf_bytes)
if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
Expand Down
16 changes: 7 additions & 9 deletions magic_pdf/pipe/UNIPipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def __init__(
formula_enable=None,
table_enable=None,
):
self.pdf_type = jso_useful_key['_pdf_type']
super().__init__(
dataset,
jso_useful_key['model_list'],
Expand All @@ -39,13 +38,14 @@ def __init__(
formula_enable,
table_enable,
)
self.pdf_type = jso_useful_key['_pdf_type']
if len(self.model_list) == 0:
self.input_model_is_empty = True
else:
self.input_model_is_empty = False

def pipe_classify(self):
self.pdf_type = AbsPipe.classify(self.pdf_bytes)
self.pdf_type = AbsPipe.classify(self)

def pipe_analyze(self):
if self.pdf_type == self.PIP_TXT:
Expand Down Expand Up @@ -115,8 +115,9 @@ def pipe_mk_markdown(


if __name__ == '__main__':
# 测试
# Testing
from magic_pdf.data.data_reader_writer import DataReader
from magic_pdf.data.dataset import PymuDocDataset # Import the concrete dataset class

drw = DataReader(r'D:/project/20231108code-clean')

Expand All @@ -129,14 +130,11 @@ def pipe_mk_markdown(
img_bucket_path = 'imgs'
img_writer = DataWriter(join_path(write_path, img_bucket_path))

# pdf_type = UNIPipe.classify(pdf_bytes)
# jso_useful_key = {
# "_pdf_type": pdf_type,
# "model_list": model_list
# }
# Create dataset instance instead of using raw bytes
dataset = PymuDocDataset(pdf_bytes)

jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
pipe = UNIPipe(dataset, jso_useful_key, img_writer)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(img_bucket_path)
Expand Down
Loading