Skip to content

Commit

Permalink
Warn which line(s) are buggy when converting a WikiNER dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Nov 30, 2024
1 parent 0ddd705 commit 0614f16
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions stanza/utils/datasets/ner/split_wikiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

 import os
 import random
+import warnings
 from collections import Counter
-random.seed(1234)

 def read_sentences(filename, encoding):
 sents = []
Expand All @@ -29,8 +29,8 @@ def read_sentences(filename, encoding):
 array = line.split()
 if len(array) != 2:
 skip = True
+warnings.warn("Format error at line {}: {}".format(i+1, line))
 continue
-#assert len(array) == 2, "Format error at line {}: {}".format(i+1, line)
 w, t = array
 cache.append([w, t])
 if len(cache) > 0:
Expand Down Expand Up @@ -60,6 +60,8 @@ def remap_labels(sents, remap):
 return new_sentences

 def split_wikiner(directory, *in_filenames, encoding="utf-8", prefix="", suffix="bio", remap=None, shuffle=True, train_fraction=0.7, dev_fraction=0.15, test_section=True):
+random.seed(1234)
+
 sents = []
 for filename in in_filenames:
 new_sents = read_sentences(filename, encoding)
Expand Down

0 comments on commit 0614f16

Please sign in to comment.