Skip to content

Commit

Permalink
added parameters for specifying the number of labels in nominal attri…
Browse files Browse the repository at this point in the history
…butes/class to avoid "cannot handle unary class attribute" if particular split only contains one label
  • Loading branch information
fracpete committed Apr 25, 2022
1 parent d4ba7a1 commit 9df5897
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 24 deletions.
5 changes: 4 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ Changelog
0.0.6 (????-??-??)
------------------

- ...
- `WekaEstimator` (module `sklweka.classifiers`) and `WekaCluster` (module `sklweka.clusters`)
now allow specifying how many labels a particular nominal attribute or the class attribute has
(to avoid error messages like `Cannot handle unary class attribute!` if only one
label is present in a particular split)


0.0.5 (2022-04-01)
Expand Down
23 changes: 21 additions & 2 deletions src/sklweka/classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ class WekaEstimator(BaseEstimator, OptionHandler, RegressorMixin, ClassifierMixi
"""

def __init__(self, jobject=None, classifier=None, classname=None, options=None,
nominal_input_vars=None, nominal_output_var=None):
nominal_input_vars=None, nominal_output_var=None,
num_nominal_input_labels=None, num_nominal_output_labels=None):
"""
Initializes the estimator. Can be either instantiated via the following priority of parameters:
1. JB_Object representing a Java Classifier object
Expand All @@ -35,6 +36,10 @@ def __init__(self, jobject=None, classifier=None, classname=None, options=None,
:type nominal_input_vars: list or str
:param nominal_output_var: whether to convert the output variable to a nominal one
:type nominal_output_var: bool
:param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
:type num_nominal_input_labels: dict
:param num_nominal_output_labels: the number of labels for the output variable
:type num_nominal_output_labels: int
"""
if jobject is not None:
_jobject = jobject
Expand All @@ -60,6 +65,8 @@ def __init__(self, jobject=None, classifier=None, classname=None, options=None,
self._options = options
self._nominal_input_vars = nominal_input_vars
self._nominal_output_var = nominal_output_var
self._num_nominal_input_labels = num_nominal_input_labels
self._num_nominal_output_labels = num_nominal_output_labels

@property
def classifier(self):
Expand Down Expand Up @@ -97,7 +104,9 @@ def fit(self, data, targets):
data = to_nominal_attributes(data, self._nominal_input_vars)
if self._nominal_output_var is not None:
targets = to_nominal_labels(targets)
d = to_instances(data, targets)
d = to_instances(data, targets,
num_nominal_labels=self._num_nominal_input_labels,
num_class_labels=self._num_nominal_output_labels)
self._classifier.build_classifier(d)
self.header_ = d.template_instances(d, 0)
if d.class_attribute.is_nominal:
Expand Down Expand Up @@ -162,6 +171,10 @@ def get_params(self, deep=True):
result["nominal_input_vars"] = self._nominal_input_vars
if self._nominal_output_var is not None:
result["nominal_output_var"] = self._nominal_output_var
if self._num_nominal_input_labels is not None:
result["num_nominal_input_labels"] = self._num_nominal_input_labels
if self._num_nominal_output_labels is not None:
result["num_nominal_output_labels"] = self._num_nominal_output_labels
return result

def set_params(self, **params):
Expand All @@ -186,6 +199,12 @@ def set_params(self, **params):
self._nominal_output_var = None
if "nominal_output_var" in params:
self._nominal_output_var = params["nominal_output_var"]
self._num_nominal_input_labels = None
if "num_nominal_input_labels" in params:
self._num_nominal_input_labels = params["num_nominal_input_labels"]
self._num_nominal_output_labels = None
if "num_nominal_output_labels" in params:
self._num_nominal_output_labels = params["num_nominal_output_labels"]

def __str__(self):
"""
Expand Down
22 changes: 13 additions & 9 deletions src/sklweka/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class WekaCluster(BaseEstimator, OptionHandler, ClusterMixin):
Wraps a Weka cluster within the scikit-learn framework.
"""

def __init__(self, jobject=None, cluster=None, classname=None, options=None, nominal_input_vars=None):
def __init__(self, jobject=None, cluster=None, classname=None, options=None, nominal_input_vars=None,
num_nominal_input_labels=None):
"""
Initializes the estimator. Can be either instantiated via the following priority of parameters:
1. JB_Object representing a Java Clusterer object
Expand All @@ -29,8 +30,8 @@ def __init__(self, jobject=None, cluster=None, classname=None, options=None, nom
:type classname: str
:param options: the command-line options of the Weka cluster to instantiate
:type options: list
:param nominal_input_vars: the list of 0-based indices of attributes to convert to nominal or range string with 1-based indices
:type nominal_input_vars: list or str
:param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
:type num_nominal_input_labels: dict
"""
if jobject is not None:
_jobject = jobject
Expand All @@ -54,6 +55,7 @@ def __init__(self, jobject=None, cluster=None, classname=None, options=None, nom
self._classname = classname
self._options = options
self._nominal_input_vars = nominal_input_vars
self._num_nominal_input_labels = num_nominal_input_labels

@property
def cluster(self):
Expand Down Expand Up @@ -88,7 +90,7 @@ def fit(self, data, targets=None):
"""
if self._nominal_input_vars is not None:
data = to_nominal_attributes(data, self._nominal_input_vars)
d = to_instances(data)
d = to_instances(data, num_nominal_labels=self._num_nominal_input_labels)
self._cluster.build_clusterer(d)
self.header_ = d.template_instances(d, 0)
return self
Expand Down Expand Up @@ -141,8 +143,10 @@ def get_params(self, deep=True):
result["options"] = self._options
if self._nominal_input_vars is not None:
result["nominal_input_vars"] = self._nominal_input_vars
if self._nominal_output_var is not None:
result["nominal_output_var"] = self._nominal_output_var
if self._num_nominal_input_labels is not None:
result["num_nominal_input_labels"] = self._num_nominal_input_labels
if self._num_nominal_input_labels is not None:
result["num_nominal_input_labels"] = self._num_nominal_input_labels
return result

def set_params(self, **params):
Expand All @@ -164,9 +168,9 @@ def set_params(self, **params):
self._nominal_input_vars = None
if "nominal_input_vars" in params:
self._nominal_input_vars = params["nominal_input_vars"]
self._nominal_output_var = None
if "nominal_output_var" in params:
self._nominal_output_var = params["nominal_output_var"]
self._num_nominal_input_labels = None
if "num_nominal_input_labels" in params:
self._num_nominal_input_labels = params["num_nominal_input_labels"]

def __str__(self):
"""
Expand Down
31 changes: 23 additions & 8 deletions src/sklweka/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,8 @@ def determine_attribute_type(y):
return result


def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, class_type=None, relation_name=None):
def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, class_type=None, relation_name=None,
num_nominal_labels=None, num_class_labels=None):
"""
Turns the 2D matrix and the optional 1D class vector into an Instances object.
Expand All @@ -282,6 +283,10 @@ def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, cla
:type class_type: str
:param relation_name: the name for the dataset
:type relation_name: str
:param num_nominal_labels: the dictionary with the number of labels (key is 0-based attribute index)
:type num_nominal_labels: dict
:param num_class_labels: the number of labels in the class attribute
:type num_class_labels: int
:return: the generated Instances object
:rtype: Instances
"""
Expand Down Expand Up @@ -317,12 +322,17 @@ def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, cla
if att_type == "N":
atts.append(Attribute.create_numeric(att_name))
elif att_type == "C":
labels = set()
for n in range(len(X)):
r = X[n]
v = str(r[i])
labels.add(v)
values = sorted(labels)
if (num_nominal_labels is not None) and (i in num_nominal_labels):
values = []
for l in range(num_nominal_labels[i]):
values.append("_%d" % l)
else:
labels = set()
for n in range(len(X)):
r = X[n]
v = str(r[i])
labels.add(v)
values = sorted(labels)
atts.append(Attribute.create_nominal(att_name, values))
else:
raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), att_type))
Expand All @@ -331,7 +341,12 @@ def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, cla
if class_type == "N":
atts.append(Attribute.create_numeric(class_name))
elif class_type == "C":
values = sorted(set([str(x) for x in y]))
if num_class_labels is not None:
values = []
for l in range(num_class_labels):
values.append("_%d" % l)
else:
values = sorted(set([str(x) for x in y]))
atts.append(Attribute.create_nominal(class_name, values))

result = Instances.create_instances(relation_name, atts, len(X))
Expand Down
45 changes: 41 additions & 4 deletions src/sklweka/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class WekaTransformer(BaseEstimator, OptionHandler, TransformerMixin):
Wraps a Weka filter within the scikit-learn framework.
"""

def __init__(self, jobject=None, filter=None, classname=None, options=None):
def __init__(self, jobject=None, filter=None, classname=None, options=None,
num_nominal_input_labels=None, num_nominal_output_labels=None):
"""
Initializes the estimator. Can be either instantiated via the following priority of parameters:
1. JB_Object representing a Java Filter object
Expand All @@ -29,6 +30,10 @@ def __init__(self, jobject=None, filter=None, classname=None, options=None):
:type classname: str
:param options: the command-line options of the Weka filter to instantiate
:type options: list
:param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
:type num_nominal_input_labels: dict
:param num_nominal_output_labels: the number of labels for the output variable
:type num_nominal_output_labels: int
"""
if jobject is not None:
_jobject = jobject
Expand All @@ -51,6 +56,8 @@ def __init__(self, jobject=None, filter=None, classname=None, options=None):
# the following references are required for get_params/set_params
self._classname = classname
self._options = options
self._num_nominal_input_labels = num_nominal_input_labels
self._num_nominal_output_labels = num_nominal_output_labels

@property
def filter(self):
Expand Down Expand Up @@ -87,7 +94,9 @@ def fit(self, data, targets):
check_array(data)
else:
check_X_y(data, targets)
d = to_instances(data, y=targets)
d = to_instances(data, y=targets,
num_nominal_labels=self._num_nominal_input_labels,
num_class_labels=self._num_nominal_output_labels)
self.header_ = Instances.template_instances(d)
self._filter.inputformat(d)
self._filter.filter(d)
Expand All @@ -114,7 +123,9 @@ def transform(self, data, targets=None):
targets.append(missing_value())
targets = np.array(targets)

d = to_instances(data, y=targets)
d = to_instances(data, y=targets,
num_nominal_labels=self._num_nominal_input_labels,
num_class_labels=self._num_nominal_output_labels)
d_new = self._filter.filter(d)
X, y = to_array(d_new)
if no_targets:
Expand All @@ -134,6 +145,10 @@ def get_params(self, deep=True):
result = dict()
result["classname"] = self._classname
result["options"] = self._options
if self._num_nominal_input_labels is not None:
result["num_nominal_input_labels"] = self._num_nominal_input_labels
if self._num_nominal_output_labels is not None:
result["num_nominal_output_labels"] = self._num_nominal_output_labels
return result

def set_params(self, **params):
Expand All @@ -152,6 +167,12 @@ def set_params(self, **params):
self._classname = params["classname"]
self._options = params["options"]
self._filter = Filter(classname=self._classname, options=self._options)
self._num_nominal_input_labels = None
if "num_nominal_input_labels" in params:
self._num_nominal_input_labels = params["num_nominal_input_labels"]
self._num_nominal_output_labels = None
if "num_nominal_output_labels" in params:
self._num_nominal_output_labels = params["num_nominal_output_labels"]

def __str__(self):
"""
Expand Down Expand Up @@ -194,18 +215,24 @@ class MakeNominal(BaseEstimator, TransformerMixin):
Converts numeric columns to nominal ones (ie string labels).
"""

def __init__(self, input_vars=None, output_var=False):
def __init__(self, input_vars=None, output_var=False, num_nominal_input_labels=None, num_nominal_output_labels=None):
"""
Initializes the estimator.
:param nominal_input_vars: the list of 0-based indices of attributes to convert to nominal or range string with 1-based indices
:type nominal_input_vars: list or str
:param output_var: whether to convert the output variable as well
:type output_var: bool
:param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
:type num_nominal_input_labels: dict
:param num_nominal_output_labels: the number of labels for the output variable
:type num_nominal_output_labels: int
"""
super(MakeNominal, self).__init__()
self._input_vars = None if input_vars is None else input_vars[:]
self._output_var = output_var
self._num_nominal_input_labels = num_nominal_input_labels
self._num_nominal_output_labels = num_nominal_output_labels

@property
def input_vars(self):
Expand Down Expand Up @@ -275,6 +302,10 @@ def get_params(self, deep=True):
result = dict()
result["input_vars"] = self._input_vars
result["output_var"] = self._output_var
if self._num_nominal_input_labels is not None:
result["num_nominal_input_labels"] = self._num_nominal_input_labels
if self._num_nominal_output_labels is not None:
result["num_nominal_output_labels"] = self._num_nominal_output_labels
return result

def set_params(self, **params):
Expand All @@ -292,6 +323,12 @@ def set_params(self, **params):
raise Exception("Cannot find 'output_var' in parameters!")
self._input_vars = params["input_vars"]
self._output_var = params["output_var"]
self._num_nominal_input_labels = None
if "num_nominal_input_labels" in params:
self._num_nominal_input_labels = params["num_nominal_input_labels"]
self._num_nominal_output_labels = None
if "num_nominal_output_labels" in params:
self._num_nominal_output_labels = params["num_nominal_output_labels"]

def __str__(self):
"""
Expand Down

0 comments on commit 9df5897

Please sign in to comment.