Skip to content

Commit

Permalink
Merge pull request #44 from aldro61/blacklist_bug
Browse files: browse the repository at this point in the history
Blacklist bug
  • Loading branch information
aldro61 authored Mar 19, 2018
2 parents (9aedb97 + 482b261), commit 4ae02ef
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 39 deletions.
51 changes: 26 additions & 25 deletions core/kover/learning/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
"""
Kover: Learn interpretable computational phenotyping models from k-merized genomic data
Copyright (C) 2015 Alexandre Drouin
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Expand Down Expand Up @@ -419,21 +419,21 @@ def _bound_selection(dataset_file, split_name, model_types, p_values, max_rules,
def _find_rule_blacklist(dataset_file, kmer_blacklist_file, warning_callback):
"""
Finds the index of the rules that must be blacklisted.
"""
dataset = KoverDataset(dataset_file)

# Find all rules to blacklist
rule_blacklist = []
if kmer_blacklist_file is not None:
kmers_to_blacklist = _parse_kmer_blacklist(kmer_blacklist_file, dataset.kmer_length)

if kmers_to_blacklist:
# XXX: the k-mers are upper-cased to avoid not finding a match because of the character case
kmer_sequences = np.array([x.upper() for x in dataset.kmer_sequences]).tolist()
kmer_by_matrix_column = np.array(dataset.kmer_by_matrix_column).tolist() # XXX: each k-mer is there only once (see wiki)
n_kmers = len(kmer_sequences)
kmer_sequences = np.array([x.upper() for x in dataset.kmer_sequences[...]]).tolist()
kmer_by_matrix_column = dataset.kmer_by_matrix_column[...].tolist() # XXX: each k-mer is there only once (see wiki)
n_kmers = len(kmer_sequences)

kmers_not_found = []
rule_blacklist = []
for k in kmers_to_blacklist:
Expand All @@ -444,12 +444,12 @@ def _find_rule_blacklist(dataset_file, kmer_blacklist_file, warning_callback):
rule_blacklist += [presence_rule_idx, absence_rule_idx]
except ValueError:
kmers_not_found.append(k)

if(len(kmers_not_found) > 0):
warning_callback("The following kmers could not be found in the dataset: " + ", ".join(kmers_not_found))

return rule_blacklist


def learn(dataset_file, split_name, model_type, p, kmer_blacklist_file, max_rules, max_equiv_rules, parameter_selection, n_cpu, random_seed,
bound_delta=None, bound_max_genome_size=None, progress_callback=None, warning_callback=None, error_callback=None):
Expand All @@ -474,12 +474,13 @@ def normal_raise(exception):
model_type = np.unique(model_type)
p = np.unique(p)

rule_blacklist = _find_rule_blacklist(dataset_file=dataset_file,
logging.debug("Searching for blacklisted rules.")
rule_blacklist = _find_rule_blacklist(dataset_file=dataset_file,
kmer_blacklist_file=kmer_blacklist_file,
warning_callback=warning_callback)

dataset = KoverDataset(dataset_file)

# Score the hyperparameter combinations
# ------------------------------------------------------------------------------------------------------------------
if parameter_selection == "bound":
Expand All @@ -492,19 +493,19 @@ def normal_raise(exception):
best_hp, \
best_model, \
best_rule_importances, \
best_predictor_equiv_rules = _bound_selection(dataset_file=dataset_file, split_name=split_name, model_type=model_type,
p_values=p, max_rules=max_rules, max_equiv_rules=max_equiv_rules,
rule_blacklist=rule_blacklist, bound_delta=bound_delta,
bound_max_genome_size=bound_max_genome_size, n_cpu=n_cpu,
random_generator=random_generator, progress_callback=progress_callback,
best_predictor_equiv_rules = _bound_selection(dataset_file=dataset_file, split_name=split_name, model_type=model_type,
p_values=p, max_rules=max_rules, max_equiv_rules=max_equiv_rules,
rule_blacklist=rule_blacklist, bound_delta=bound_delta,
bound_max_genome_size=bound_max_genome_size, n_cpu=n_cpu,
random_generator=random_generator, progress_callback=progress_callback,
warning_callback=warning_callback, error_callback=error_callback)
elif parameter_selection == "cv":
n_folds = len(dataset.get_split(split_name).folds)
if n_folds < 1:
error_callback(Exception("Cross-validation cannot be performed on a split with no folds."))
best_hp_score, best_hp = _cross_validation(dataset_file=dataset_file, split_name=split_name, model_types=model_type,
p_values=p, max_rules=max_rules, rule_blacklist=rule_blacklist, n_cpu=n_cpu,
progress_callback=progress_callback, warning_callback=warning_callback,
p_values=p, max_rules=max_rules, rule_blacklist=rule_blacklist, n_cpu=n_cpu,
progress_callback=progress_callback, warning_callback=warning_callback,
error_callback=error_callback)
else:
# Use the first value provided for each parameter
Expand All @@ -519,8 +520,8 @@ def normal_raise(exception):
rule_importances = best_rule_importances
else:
model, rule_importances, \
equivalent_rules = _full_train(dataset=dataset, split_name=split_name, model_type=best_hp["model_type"], p=best_hp["p"],
max_rules=best_hp["max_rules"], max_equiv_rules=max_equiv_rules, rule_blacklist=rule_blacklist,
equivalent_rules = _full_train(dataset=dataset, split_name=split_name, model_type=best_hp["model_type"], p=best_hp["p"],
max_rules=best_hp["max_rules"], max_equiv_rules=max_equiv_rules, rule_blacklist=rule_blacklist,
random_generator=random_generator, progress_callback=progress_callback)

split = dataset.get_split(split_name)
Expand Down
27 changes: 14 additions & 13 deletions core/kover/learning/set_covering_machine/scm.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,26 +263,27 @@ def _get_best_utility_rules(self, rule_classifications, positive_example_idx, ne
block_utilities = negative_cover_counts[block * UTIL_BLOCK_SIZE : (block + 1) * UTIL_BLOCK_SIZE] - \
float(self.p) * positive_error_counts[block * UTIL_BLOCK_SIZE : (block + 1) * UTIL_BLOCK_SIZE]

# Discard blacklisted rules
block_utilities[rule_is_blacklisted[block * UTIL_BLOCK_SIZE : (block + 1) * UTIL_BLOCK_SIZE]] = -np.infty

# Check if there is a better rule or equal in this block
block_max_utility = np.max(block_utilities)
if block_max_utility > best_utility or np.allclose(best_utility, block_max_utility):

# Find the indices of the better rules that are not blacklisted
block_utility_argmax = np.where(block_utilities == block_max_utility)[0] + block * UTIL_BLOCK_SIZE
block_utility_argmax = block_utility_argmax[~rule_is_blacklisted[block_utility_argmax]]

# Update the best utility value and other infos
if len(block_utility_argmax) > 0:
if np.allclose(block_max_utility, best_utility):
best_utility_idx = np.hstack((best_utility_idx, block_utility_argmax))
best_utility_pos_error_count = np.hstack((best_utility_pos_error_count,
positive_error_counts[block_utility_argmax]))
best_utility_neg_cover_count = np.hstack((best_utility_neg_cover_count,
negative_cover_counts[block_utility_argmax]))
else:
best_utility = block_max_utility
best_utility_idx = block_utility_argmax
best_utility_pos_error_count = positive_error_counts[block_utility_argmax]
best_utility_neg_cover_count = negative_cover_counts[block_utility_argmax]
if np.allclose(block_max_utility, best_utility):
best_utility_idx = np.hstack((best_utility_idx, block_utility_argmax))
best_utility_pos_error_count = np.hstack((best_utility_pos_error_count,
positive_error_counts[block_utility_argmax]))
best_utility_neg_cover_count = np.hstack((best_utility_neg_cover_count,
negative_cover_counts[block_utility_argmax]))
else:
best_utility = block_max_utility
best_utility_idx = block_utility_argmax
best_utility_pos_error_count = positive_error_counts[block_utility_argmax]
best_utility_neg_cover_count = negative_cover_counts[block_utility_argmax]

return best_utility, best_utility_idx, best_utility_pos_error_count, best_utility_neg_cover_count
2 changes: 1 addition & 1 deletion core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def finalize_options(self):

setup(
name = "kover",
version = "1.3.0",
version = "1.3.1",
packages = find_packages(),

cmdclass={'build_ext':build_ext},
Expand Down

0 comments on commit 4ae02ef

Please sign in to comment.