Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(pre_proc): improve character overlap handling in spans #1338

Merged
merged 2 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions magic_pdf/pdf_parse_union_core_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
remove_overlaps_min_spans, remove_overlaps_chars
remove_overlaps_min_spans, check_chars_is_overlap_in_span

os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新

Expand Down Expand Up @@ -78,6 +78,8 @@ def chars_to_content(span):
if len(span['chars']) == 0:
pass
# span['content'] = ''
elif check_chars_is_overlap_in_span(span['chars']):
pass
else:
# 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
Expand Down Expand Up @@ -121,10 +123,6 @@ def fill_char_in_spans(spans, all_chars):
empty_spans = []

for span in spans:

# 移除同一个span中重叠的char
span['chars'] = remove_overlaps_chars(span['chars'])

chars_to_content(span)
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
if len(span['content']) * span['height'] < span['width'] * 0.5:
Expand Down
29 changes: 6 additions & 23 deletions magic_pdf/pre_proc/ocr_span_list_modify.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,12 @@ def remove_overlaps_low_confidence_spans(spans):
return spans, dropped_spans


def remove_overlaps_chars(chars):
    """Drop characters whose bounding box almost coincides with another one.

    Every ordered pair of characters still being kept is scored with
    ``calculate_iou`` (defined outside this block; presumably the
    intersection-over-union of two bboxes -- confirm against its definition).
    When the score exceeds 0.95, the first character of the pair is dropped
    and the second one is kept.

    Characters are tracked by their position in ``chars`` rather than by
    dict equality, so two entries with identical contents are still treated
    as two separate characters and are de-duplicated as well.

    Args:
        chars: list of char dicts, each providing a ``'bbox'`` entry.

    Returns:
        The same list object, with the dropped characters removed in place.
    """
    dropped_indices = set()

    # Remove overlapping chars.
    for i, char1 in enumerate(chars):
        for j, char2 in enumerate(chars):
            if i == j:
                continue
            # Neither char1 nor char2 may already be marked as dropped.
            if j in dropped_indices:
                continue
            if calculate_iou(char1['bbox'], char2['bbox']) > 0.95:
                dropped_indices.add(i)
                # char1 is gone now; no further pair involving it matters.
                break

    if dropped_indices:
        # Slice assignment keeps the caller's list object (in-place contract).
        chars[:] = [
            char for index, char in enumerate(chars)
            if index not in dropped_indices
        ]

    return chars
def check_chars_is_overlap_in_span(chars):
    """Tell whether any two characters in ``chars`` nearly coincide.

    Each unordered pair of characters is scored once with ``calculate_iou``
    (defined outside this block; presumably intersection-over-union of the
    two bboxes -- confirm against its definition). The search stops at the
    first pair whose score is above 0.9.

    Args:
        chars: sequence of char dicts, each providing a ``'bbox'`` entry.

    Returns:
        True if at least one pair scores above 0.9, otherwise False
        (including when ``chars`` has fewer than two entries).
    """
    return any(
        calculate_iou(first['bbox'], second['bbox']) > 0.9
        for position, first in enumerate(chars)
        # Pair each char only with the ones after it: every pair tested once.
        for second in chars[position + 1:]
    )


def remove_overlaps_min_spans(spans):
Expand Down
Loading