Skip to content

Commit

Permalink
Rename entity frequency: `prob` → `freq`
Browse files Browse the repository at this point in the history
  • Loading branch information
svlandeg committed Jul 19, 2019
1 parent f75d129 commit dae8a21
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 49 deletions.
2 changes: 1 addition & 1 deletion bin/wiki_entity_linking/kb_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,

print()
print(" * adding", len(entity_list), "entities", datetime.datetime.now())
kb.set_entities(entity_list=entity_list, prob_list=frequency_list, vector_list=embeddings)
kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings)

print()
print(" * adding aliases", datetime.datetime.now())
Expand Down
6 changes: 3 additions & 3 deletions examples/pipeline/dummy_entity_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@ def create_kb(vocab):
# adding entities
entity_0 = "Q1004791_Douglas"
print("adding entity", entity_0)
kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0])
kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0])

entity_1 = "Q42_Douglas_Adams"
print("adding entity", entity_1)
kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1])
kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1])

entity_2 = "Q5301561_Douglas_Haig"
print("adding entity", entity_2)
kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2])
kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2])

# adding aliases
print()
Expand Down
12 changes: 6 additions & 6 deletions spacy/kb.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ cdef class KnowledgeBase:
return new_index


cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
int32_t vector_index, int feats_row) nogil:
"""Add an entry to the vector of entries.
After calling this method, make sure to update also the _entry_index using the return value"""
Expand All @@ -92,7 +92,7 @@ cdef class KnowledgeBase:
entry.entity_hash = entity_hash
entry.vector_index = vector_index
entry.feats_row = feats_row
entry.prob = prob
entry.freq = freq

self._entries.push_back(entry)
return new_index
Expand Down Expand Up @@ -125,7 +125,7 @@ cdef class KnowledgeBase:
entry.entity_hash = dummy_hash
entry.vector_index = dummy_value
entry.feats_row = dummy_value
entry.prob = dummy_value
entry.freq = dummy_value

# Avoid struct initializer to enable nogil
cdef vector[int64_t] dummy_entry_indices
Expand All @@ -141,15 +141,15 @@ cdef class KnowledgeBase:
self._aliases_table.push_back(alias)

cpdef load_bulk(self, loc)
cpdef set_entities(self, entity_list, prob_list, vector_list)
cpdef set_entities(self, entity_list, freq_list, vector_list)


cdef class Writer:
cdef FILE* _fp

cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
cdef int write_vector_element(self, float element) except -1
cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1

cdef int write_alias_length(self, int64_t alias_length) except -1
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
Expand All @@ -162,7 +162,7 @@ cdef class Reader:

cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
cdef int read_vector_element(self, float* element) except -1
cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1

cdef int read_alias_length(self, int64_t* alias_length) except -1
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
Expand Down
30 changes: 15 additions & 15 deletions spacy/kb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ cdef class KnowledgeBase:
def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index]

def add_entity(self, unicode entity, float prob, vector[float] entity_vector):
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
"""
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
Return the hash of the entity ID/name at the end.
Expand All @@ -113,15 +113,15 @@ cdef class KnowledgeBase:
vector_index = self.c_add_vector(entity_vector=entity_vector)

new_index = self.c_add_entity(entity_hash=entity_hash,
prob=prob,
freq=freq,
vector_index=vector_index,
feats_row=-1) # Features table currently not implemented
self._entry_index[entity_hash] = new_index

return entity_hash

cpdef set_entities(self, entity_list, prob_list, vector_list):
if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list):
cpdef set_entities(self, entity_list, freq_list, vector_list):
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140)

nr_entities = len(entity_list)
Expand All @@ -137,7 +137,7 @@ cdef class KnowledgeBase:

entity_hash = self.vocab.strings.add(entity_list[i])
entry.entity_hash = entity_hash
entry.prob = prob_list[i]
entry.freq = freq_list[i]

vector_index = self.c_add_vector(entity_vector=vector_list[i])
entry.vector_index = vector_index
Expand Down Expand Up @@ -196,7 +196,7 @@ cdef class KnowledgeBase:

return [Candidate(kb=self,
entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].prob,
entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash,
prior_prob=prior_prob)
Expand Down Expand Up @@ -252,7 +252,7 @@ cdef class KnowledgeBase:
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index)
writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index)
i = i+1

writer.write_alias_length(self.get_size_aliases())
Expand All @@ -278,7 +278,7 @@ cdef class KnowledgeBase:
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
cdef float prob
cdef float freq
cdef int32_t vector_index
cdef KBEntryC entry
cdef AliasC alias
Expand Down Expand Up @@ -314,10 +314,10 @@ cdef class KnowledgeBase:
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
i = 1
while i <= nr_entities:
reader.read_entry(&entity_hash, &prob, &vector_index)
reader.read_entry(&entity_hash, &freq, &vector_index)

entry.entity_hash = entity_hash
entry.prob = prob
entry.freq = freq
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented

Expand Down Expand Up @@ -387,9 +387,9 @@ cdef class Writer:
cdef int write_vector_element(self, float element) except -1:
self._write(&element, sizeof(element))

cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1:
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
self._write(&entry_hash, sizeof(entry_hash))
self._write(&entry_prob, sizeof(entry_prob))
self._write(&entry_freq, sizeof(entry_freq))
self._write(&vector_index, sizeof(vector_index))
# Features table currently not implemented and not written to file

Expand Down Expand Up @@ -444,18 +444,18 @@ cdef class Reader:
return 0 # end of file
raise IOError("error reading entity vector from input file")

cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1:
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
status = self._read(entity_hash, sizeof(hash_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError("error reading entity hash from input file")

status = self._read(prob, sizeof(float))
status = self._read(freq, sizeof(float))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError("error reading entity prob from input file")
raise IOError("error reading entity freq from input file")

status = self._read(vector_index, sizeof(int32_t))
if status < 1:
Expand Down
2 changes: 1 addition & 1 deletion spacy/structs.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ cdef struct KBEntryC:
int32_t feats_row

# log probability of entity, based on corpus frequency
float prob
float freq


# Each alias struct stores a list of Entry pointers with their prior probabilities
Expand Down
38 changes: 19 additions & 19 deletions spacy/tests/pipeline/test_entity_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ def test_kb_valid_entities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[8, 4, 3])
mykb.add_entity(entity="Q2", prob=0.5, entity_vector=[2, 1, 0])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[-1, -6, 5])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3])
mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5])

# adding aliases
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
Expand All @@ -50,9 +50,9 @@ def test_kb_invalid_entities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])

# adding aliases - should fail because one of the given IDs is not valid
with pytest.raises(ValueError):
Expand All @@ -66,9 +66,9 @@ def test_kb_invalid_probabilities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])

# adding aliases - should fail because the sum of the probabilities exceeds 1
with pytest.raises(ValueError):
Expand All @@ -80,9 +80,9 @@ def test_kb_invalid_combination(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])

# adding aliases - should fail because the entities and probabilities vectors are not of equal length
with pytest.raises(ValueError):
Expand All @@ -96,21 +96,21 @@ def test_kb_invalid_entity_vector(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1, 2, 3])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3])

# this should fail because the kb's expected entity vector length is 3
with pytest.raises(ValueError):
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])


def test_candidate_generation(nlp):
"""Test correct candidate generation"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", prob=0.7, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
mykb.add_entity(entity="Q1", freq=0.7, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])

# adding aliases
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
Expand All @@ -133,8 +133,8 @@ def test_preserving_links_asdoc(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.8, entity_vector=[1])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.8, entity_vector=[1])

# adding aliases
mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
Expand Down
8 changes: 4 additions & 4 deletions spacy/tests/serialize/test_serialize_kb.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ def test_serialize_kb_disk(en_vocab):
def _get_dummy_kb(vocab):
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)

kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3])
kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0])
kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7])
kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4])
kb.add_entity(entity='Q53', freq=0.33, entity_vector=[0, 5, 3])
kb.add_entity(entity='Q17', freq=0.2, entity_vector=[7, 1, 0])
kb.add_entity(entity='Q007', freq=0.7, entity_vector=[0, 0, 7])
kb.add_entity(entity='Q44', freq=0.4, entity_vector=[4, 4, 4])

kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9])
kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])
Expand Down

0 comments on commit dae8a21

Please sign in to comment.