Skip to content

Commit

Permalink
fix: DLNv1 create: Introduce hardcoded list of blacklisted doc_ids
Browse files Browse the repository at this point in the history
Signed-off-by: Nikos Livathinos <[email protected]>
  • Loading branch information
nikos-livathinos committed Mar 3, 2025
1 parent bb318a2 commit 49c1677
Showing 1 changed file with 34 additions and 0 deletions.
34 changes: 34 additions & 0 deletions docling_eval/benchmarks/doclaynet_v1/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,35 @@
# Get logger
_log = logging.getLogger(__name__)

###########################################################################################
# Hard-coded identifiers of dataset entries to exclude when creating the
# dataset. In create_dlnv1_e2e_dataset each entry's
# doc["metadata"]["page_hash"] is checked against these values; a match is
# logged ("Skip blacklisted doc id") and the entry is skipped.
# NOTE(review): the reason each id is excluded is not recorded here --
# consider documenting it next to the list.
BLACKLISTED_DOC_IDS = [
"f556167ac3284665652050b1b0bc1e6f5af27f54f17f27566c60c80f6f134a92",
"dbc51622cbe9b8766f44db3b3fda8d0a745da06b9bfec9935bd003d2bdd494c8",
"d4c0401fffc04d24e629a9fada23266a3b492ea63e889641b3c33adf815d44e3",
"cc93b556f49af1f2e366719ec98a131186c16385545d8062d21e4d38b6bf7686",
"c9755e6972e3150a1c02565ec8070bfc26503d0fe09d056e418d6dcd6ea43cd9",
"c90d298ac9493e3804baf1b62c9321cdabf388c29eb504c5ad12106b3cdf530b",
"c2b513a5611d3138726e679c6e2e9e5383e4d3d82a2c588bbe3d5802797e2765",
"b72bb61059b06ff9859ae023aa66cdb3ff706c354ac72ca5d3c837e107d0a384",
"b4f5d430d89499474a31f39fe8eb615fdcd7aa682eb0b959a0384206d5c8174c",
"ab9315a0610ec0e5446a7062cd99a9e137efe3d7da9a7bffa2523894ac68751a",
"99723d3d3c61db030dbd813faec67579ceb50c6b5dd8c2f500c6e073849e9784",
"87c7dc9ca13016fafa4a7539efa1bf00401ba27323a473094b4184bc42cb36c0",
"7c1fa2e7c81a81888c18eb95cfe37edb82a91dd340e75c8123618a6774081f2e",
"7a231e9b7d841935a142d972ea1c7546d613cba18e301b0e07415f9eb44e3382",
"5793282eaaa089d0dc71e67c951c68b4157a212cc43edbc3106323e96b385190",
"55f9167173149b0b4c8d8951baca190ee756450d6565a91655ec04967a08c798",
"5003688e1ae61558cbeda741d246804b59fe89dac29cf508b4b6ce56d1a4342b",
"4f6e20223b7bc8436c623b9e6282db6ebd5f221aeb880a8db9b4544326d5a8a6",
"4232e47097e6ecfdf53d4097cb90bdd56cc63c31508a2f91a6d3908770a4d1ed",
"3361796dba75fe2c641c43db12ab31a0eb9dbcbbc7c99721288d36c41d759bcd",
"1fadb433bffa31c43817d1f6bafbb10dff53422ad046d391ed560ebef13d9f83",
"1a8f46903dbe89dc5b6df43389b4895a376e00ab3b90c7c37f1a1b561d3d51a1",
"1763e54be635759ccb66ebb462548f8a40d44567f62cecc5ca26f22acd28e823",
"048a570b2e415b653a62313ef82504adfda480c99f69826fcbeb67758ea3c7a4",
"0261791e343389682847c913a16789776d0ba41a584901571846c7ddab3cbaa6",
]
###########################################################################################

TRUE_HTML_EXPORT_LABELS = {
DocItemLabel.TITLE,
Expand Down Expand Up @@ -224,9 +253,14 @@ def create_dlnv1_e2e_dataset(
skipped_rows = 0
saved_shards = 0

black_page_hashes = set(BLACKLISTED_DOC_IDS)

for doc in tqdm(ds, total=selected_ds_len):
try:
page_hash = doc["metadata"]["page_hash"]
if page_hash in black_page_hashes:
_log.info("Skip blacklisted doc id: %s", page_hash)
continue

if do_debug:
_log.info("Converting: %s", page_hash)
Expand Down

0 comments on commit 49c1677

Please sign in to comment.