Skip to content

Commit

Permalink
fix: some text chars were wrongly removed when they overlapped interline equations
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Jun 6, 2024
1 parent 999b698 commit 3c145ba
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 9 deletions.
9 changes: 5 additions & 4 deletions magic_pdf/pre_proc/equations_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
or y0_1 > y1_2
) # box1在box2的下边


def remove_text_block_overlap_interline_equation_bbox(
interline_eq_bboxes, pymu_block_list
):
Expand All @@ -122,10 +123,10 @@ def remove_text_block_overlap_interline_equation_bbox(
deleted_chars = []
for char in span["chars"]:
if any(
[
_is_in_or_part_overlap(char["bbox"], eq_bbox["bbox"])
for eq_bbox in interline_eq_bboxes
]
[
(calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5)
for eq_bbox in interline_eq_bboxes
]
):
deleted_chars.append(char)
# 检查span里没有char则删除这个span
Expand Down
24 changes: 19 additions & 5 deletions magic_pdf/pre_proc/ocr_dict_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,12 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans.append(span)

'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
displayed_list = []
text_inline_lines = []
modify_y_axis(block_spans, displayed_list, text_inline_lines)
# displayed_list = []
# text_inline_lines = []
# modify_y_axis(block_spans, displayed_list, text_inline_lines)

'''模型识别错误的行间公式, type类型转换成行内公式'''
block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
# block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)

'''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox_for_span(block_spans)
Expand Down Expand Up @@ -196,8 +196,10 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
block = fix_image_block(block, img_blocks)
elif block_type == BlockType.Table:
block = fix_table_block(block, table_blocks)
elif block_type in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
elif block_type in [BlockType.Text, BlockType.Title]:
block = fix_text_block(block)
elif block_type == BlockType.InterlineEquation:
block = fix_interline_block(block)
else:
continue
fix_blocks.append(block)
Expand Down Expand Up @@ -315,6 +317,18 @@ def fix_table_block(block, table_blocks):


def fix_text_block(block):
    """Replace a text block's ``'spans'`` entry with a ``'lines'`` entry.

    Every span whose type is ``ContentType.InterlineEquation`` is retyped to
    ``ContentType.InlineEquation`` in place. The spans are then passed through
    ``merge_spans_to_line`` and ``line_sort_spans_by_left_to_right``; the
    result is stored under ``block['lines']`` and the ``'spans'`` key is
    deleted.

    Returns the same (mutated) ``block`` dict.
    """
    spans = block['spans']
    # Formula spans inside a text block should all be converted to the inline type.
    for item in spans:
        if item['type'] == ContentType.InterlineEquation:
            item['type'] = ContentType.InlineEquation
    block['lines'] = line_sort_spans_by_left_to_right(merge_spans_to_line(spans))
    del block['spans']
    return block


def fix_interline_block(block):
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines
Expand Down

0 comments on commit 3c145ba

Please sign in to comment.