From 52c1f8e24f7f4be0d7657deddb302a0ee8fa6036 Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 02:37:21 -0500 Subject: [PATCH 01/10] Add cartesian product functionality --- sktree/tree/_utils.pxd | 5 +++++ sktree/tree/_utils.pyx | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/sktree/tree/_utils.pxd b/sktree/tree/_utils.pxd index c44454914..b135d3de0 100644 --- a/sktree/tree/_utils.pxd +++ b/sktree/tree/_utils.pxd @@ -1,6 +1,7 @@ import numpy as np cimport numpy as cnp +from libcpp.vector cimport vector cnp.import_array() @@ -20,3 +21,7 @@ cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape) cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil + +cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]]& sequences) noexcept nogil + +cpdef cartesian_python(vector[vector[intp_t]]& sequences) \ No newline at end of file diff --git a/sktree/tree/_utils.pyx b/sktree/tree/_utils.pyx index 84dc2de34..7c64ca5f1 100644 --- a/sktree/tree/_utils.pyx +++ b/sktree/tree/_utils.pyx @@ -10,6 +10,7 @@ import numpy as np cimport numpy as cnp cnp.import_array() +from libcpp.vector cimport vector from .._lib.sklearn.tree._utils cimport rand_uniform @@ -145,3 +146,21 @@ cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) no flat_index *= shape[i + 1] return flat_index + + +cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]]& sequences) noexcept nogil: + cdef vector[vector[intp_t]] results = vector[vector[intp_t]](1) + cdef vector[vector[intp_t]] next_results + for new_values in sequences: + for result in results: + for value in new_values: + result_copy = result + result_copy.push_back(value) + next_results.push_back(result_copy) + results = next_results + next_results.clear() + return results + + +cpdef cartesian_python(vector[vector[intp_t]]& 
sequences): + return cartesian_cython(sequences) \ No newline at end of file From 4db2547cb0a9941c5b796fdefa43f281834ff153 Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 03:39:49 -0500 Subject: [PATCH 02/10] Change ravel function to use vectors --- sktree/tree/_utils.pxd | 6 +++--- sktree/tree/_utils.pyx | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sktree/tree/_utils.pxd b/sktree/tree/_utils.pxd index b135d3de0..3446dd4b9 100644 --- a/sktree/tree/_utils.pxd +++ b/sktree/tree/_utils.pxd @@ -16,12 +16,12 @@ cpdef unravel_index( intp_t index, cnp.ndarray[intp_t, ndim=1] shape ) -cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape) +cpdef ravel_multi_index(vector[intp_t] coords, const intp_t[:] shape) cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil -cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil +cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shape) noexcept nogil -cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]]& sequences) noexcept nogil +cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]] sequences) noexcept nogil cpdef cartesian_python(vector[vector[intp_t]]& sequences) \ No newline at end of file diff --git a/sktree/tree/_utils.pyx b/sktree/tree/_utils.pyx index 7c64ca5f1..ac50c02de 100644 --- a/sktree/tree/_utils.pyx +++ b/sktree/tree/_utils.pyx @@ -59,7 +59,7 @@ cpdef unravel_index( return coords -cpdef ravel_multi_index(intp_t[:] coords, const intp_t[:] shape): +cpdef ravel_multi_index(vector[intp_t] coords, const intp_t[:] shape): """Converts a tuple of coordinate arrays into a flat index. Purely used for testing purposes. 
@@ -110,7 +110,7 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co index //= size -cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) noexcept nogil: +cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shape) noexcept nogil: """Converts a tuple of coordinate arrays into a flat index. Parameters @@ -148,7 +148,7 @@ cdef intp_t ravel_multi_index_cython(intp_t[:] coords, const intp_t[:] shape) no return flat_index -cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]]& sequences) noexcept nogil: +cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]] sequences) noexcept nogil: cdef vector[vector[intp_t]] results = vector[vector[intp_t]](1) cdef vector[vector[intp_t]] next_results for new_values in sequences: From f402fe53d82be3ab06d5a5317f2fd38d53ebfd00 Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 03:40:11 -0500 Subject: [PATCH 03/10] Change sampling scheme --- sktree/tree/manifold/_morf_splitter.pyx | 160 +++++++++++------------- 1 file changed, 71 insertions(+), 89 deletions(-) diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index 2081ab852..58adb6396 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -16,7 +16,7 @@ from libcpp.vector cimport vector from ..._lib.sklearn.tree._criterion cimport Criterion from ..._lib.sklearn.tree._utils cimport rand_int -from .._utils cimport ravel_multi_index_cython, unravel_index_cython +from .._utils cimport ravel_multi_index_cython, unravel_index_cython, cartesian_cython cdef class PatchSplitter(BestObliqueSplitter): @@ -156,17 +156,12 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # create a buffer for storing the patch dimensions sampled per projection matrix self.patch_dims_buff = np.zeros(data_dims.shape[0], dtype=np.intp) - self.unraveled_patch_point = np.zeros(data_dims.shape[0], dtype=np.intp) 
# store the min and max patch dimension constraints self.min_patch_dims = min_patch_dims self.max_patch_dims = max_patch_dims self.dim_contiguous = dim_contiguous - # initialize a buffer to allow for Fisher-Yates - self._index_patch_buffer = np.zeros(np.max(self.max_patch_dims), dtype=np.intp) - self._index_data_buffer = np.zeros(np.max(self.data_dims), dtype=np.intp) - # whether or not to perform some discontinuous sampling if not all(self.dim_contiguous): self._discontiguous = True @@ -211,6 +206,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # compute top-left seed for the multi-dimensional patch cdef intp_t top_left_patch_seed cdef intp_t patch_size = 1 + cdef vector[intp_t] unraveled_patch_point = vector[intp_t](self.ndim) cdef UINT32_t* random_state = &self.rand_r_state @@ -232,6 +228,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.max_patch_dims[idx] + 1, random_state ) + # samples patch size for a dimension # sample the top-left index and patch size for this dimension based on boundary effects if self.boundary is None: @@ -262,10 +259,10 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): # Convert the top-left-seed value to it's appropriate index in the full image. 
top_left_patch_seed = max(0, dim - patch_dim + 1) - self.unraveled_patch_point[idx] = top_left_patch_seed + unraveled_patch_point[idx] = top_left_patch_seed top_left_patch_seed = ravel_multi_index_cython( - self.unraveled_patch_point, + unraveled_patch_point, self.data_dims ) return top_left_patch_seed, patch_size @@ -306,6 +303,7 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.patch_dims_buff ) + cdef void sample_proj_vec( self, vector[vector[float32_t]]& proj_mat_weights, @@ -315,96 +313,79 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): intp_t top_left_patch_seed, const intp_t[:] patch_dims, ) noexcept nogil: - cdef UINT32_t* random_state = &self.rand_r_state - # iterates over the size of the patch - cdef intp_t patch_idx + """ + Samples projection vector. - # stores how many patches we have iterated so far - cdef intp_t vectorized_patch_offset - cdef intp_t vectorized_point_offset - cdef intp_t vectorized_point + Parameters + ---------- + proj_mat_weights : vector[vector[float32_t]] + The weights of the projection matrix. + proj_mat_indices : vector[vector[intp_t]] + The indices of the projection matrix. + proj_i : intp_t + The index of feature. + patch_size : intp_t + The size of the patch. + top_left_patch_seed : intp_t + The top-left seed of the patch. + patch_dims : array-like, shape (n_dims,) + The dimensions of the patch. + """ + # initialize a buffer to allow for Fisher-Yates + cdef vector[intp_t] _index_patch_buffer + cdef vector[intp_t] _index_data_buffer - cdef intp_t dim_idx + cdef UINT32_t* random_state = &self.rand_r_state + cdef intp_t num_rows + cdef int ndim = self.ndim + cdef vector[vector[intp_t]] points = vector[vector[intp_t]](ndim) + cdef vector[intp_t] temp = vector[intp_t](ndim) + cdef intp_t patch_dim + cdef intp_t patch_idx + cdef intp_t idx + cdef intp_t i + cdef intp_t[:] unraveled_patch_point # weights are default to 1 cdef float32_t weight = 1. 
- # XXX: still unsure if it works yet - # XXX: THIS ONLY WORKS FOR THE FIRST DIMENSION THAT IS DISCONTIGUOUS. - cdef intp_t other_dims_offset - cdef intp_t row_index - - cdef intp_t i - cdef intp_t num_rows = self.data_dims[0] - if self._discontiguous: - # fill with values 0, 1, ..., dimension - 1 - for i in range(0, self.data_dims[0]): - self._index_data_buffer[i] = i - # then shuffle indices using Fisher-Yates - for i in range(num_rows): - j = rand_int(0, num_rows - i, random_state) - self._index_data_buffer[i], self._index_data_buffer[j] = \ - self._index_data_buffer[j], self._index_data_buffer[i] - # now select the first `patch_dims[0]` indices - for i in range(num_rows): - self._index_patch_buffer[i] = self._index_data_buffer[i] - - for patch_idx in range(patch_size): - # keep track of which dimensions of the patch we have iterated over - vectorized_patch_offset = 1 - - # Once the vectorized top-left-seed is unraveled, you can add the patch - # points in the array structure and compute their vectorized (unraveled) - # points, which are added to the projection vector - unravel_index_cython(top_left_patch_seed, self.data_dims, self.unraveled_patch_point) - - for dim_idx in range(self.ndim): - # compute the offset from the top-left patch seed based on: - # 1. the current patch index - # 2. the patch dimension indexed by `dim_idx` - # 3. 
and the vectorized patch dimensions that we have seen so far - # the `vectorized_point_offset` is the offset from the top-left vectorized seed for this dimension - vectorized_point_offset = (patch_idx // (vectorized_patch_offset)) % patch_dims[dim_idx] - - # then we compute the actual point in the original data shape - self.unraveled_patch_point[dim_idx] = self.unraveled_patch_point[dim_idx] + vectorized_point_offset - vectorized_patch_offset *= patch_dims[dim_idx] - - # if any dimensions are discontiguous, we want to migrate the entire axis a fixed amount - # based on the shuffling - if self._discontiguous is True: - for dim_idx in range(self.ndim): - if self.dim_contiguous[dim_idx] is True: - continue - - # determine the "row" we are currently on - # other_dims_offset = 1 - # for idx in range(dim_idx + 1, self.ndim): - # other_dims_offset *= self.data_dims[idx] - # row_index = self.unraveled_patch_point[dim_idx] % other_dims_offset - # determine the "row" we are currently on - other_dims_offset = 1 - for idx in range(dim_idx + 1, self.ndim): - if not self.dim_contiguous[idx]: - other_dims_offset *= self.data_dims[idx] - - row_index = 0 - for idx in range(dim_idx + 1, self.ndim): - if not self.dim_contiguous[idx]: - row_index += ( - (self.unraveled_patch_point[idx] // other_dims_offset) % - self.patch_dims_buff[idx] - ) * other_dims_offset - other_dims_offset //= self.data_dims[idx] - - # assign random row index now - self.unraveled_patch_point[dim_idx] = self._index_patch_buffer[row_index] - - # ravel the patch point into the original data dimensions - vectorized_point = ravel_multi_index_cython(self.unraveled_patch_point, self.data_dims) + unravel_index_cython(top_left_patch_seed, self.data_dims, unraveled_patch_point) + + # # TODO + for dim_idx in range(ndim): + if self.dim_contiguous[dim_idx]: + patch_dim = patch_dims[dim_idx] + patch_idx = unraveled_patch_point[dim_idx] + idx = patch_dim + patch_idx + for i in range(patch_idx, idx): + 
points[dim_idx].push_back(i) + else: + num_rows = self.data_dims[dim_idx] + for i in range(0, num_rows): + _index_data_buffer.push_back(i) + # then shuffle indices using Fisher-Yates + for i in range(num_rows): + j = rand_int(0, num_rows - i, random_state) + _index_data_buffer[i], _index_data_buffer[j] = \ + _index_data_buffer[j], _index_data_buffer[i] + for i in range(num_rows): + _index_patch_buffer.push_back(_index_data_buffer[i]) + + for i in range(0, patch_dims[dim_idx]): #populate + points[dim_idx].push_back(_index_patch_buffer[i]) + _index_data_buffer.clear() + _index_patch_buffer.clear() + + # make cartesian product of the points, ravel then to proj_mat + + cdef intp_t[:] tmp + cdef vector[vector[intp_t]] products = cartesian_cython(points) + for point in products: + vectorized_point = ravel_multi_index_cython(point, self.data_dims) proj_mat_indices[proj_i].push_back(vectorized_point) proj_mat_weights[proj_i].push_back(weight) + cdef void compute_features_over_samples( self, intp_t start, @@ -476,6 +457,7 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): # convert the projection matrix to something that can be used in Python proj_vecs = np.zeros((1, self.n_features), dtype=np.float64) for i in range(0, 1): + print(i) for j in range(0, proj_mat_weights[i].size()): weight = proj_mat_weights[i][j] feat = proj_mat_indices[i][j] From fb7c2653b1ac79844c36553e52ede206fc6f94e5 Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 04:53:00 -0500 Subject: [PATCH 04/10] Update to vectors --- sktree/tree/_utils.pxd | 2 +- sktree/tree/_utils.pyx | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sktree/tree/_utils.pxd b/sktree/tree/_utils.pxd index 3446dd4b9..bd698cd95 100644 --- a/sktree/tree/_utils.pxd +++ b/sktree/tree/_utils.pxd @@ -18,7 +18,7 @@ cpdef unravel_index( cpdef ravel_multi_index(vector[intp_t] coords, const intp_t[:] shape) -cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept 
nogil +cdef vector[intp_t] unravel_index_cython(intp_t index, const intp_t[:] shape) noexcept nogil cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shape) noexcept nogil diff --git a/sktree/tree/_utils.pyx b/sktree/tree/_utils.pyx index ac50c02de..8211f884e 100644 --- a/sktree/tree/_utils.pyx +++ b/sktree/tree/_utils.pyx @@ -54,8 +54,7 @@ cpdef unravel_index( """ index = np.intp(index) shape = np.array(shape) - coords = np.empty(shape.shape[0], dtype=np.intp) - unravel_index_cython(index, shape, coords) + coords = unravel_index_cython(index, shape) return coords @@ -84,7 +83,7 @@ cpdef ravel_multi_index(vector[intp_t] coords, const intp_t[:] shape): return ravel_multi_index_cython(coords, shape) -cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] coords) noexcept nogil: +cdef vector[intp_t] unravel_index_cython(intp_t index, const intp_t[:] shape) noexcept nogil: """Converts a flat index into a tuple of coordinate arrays. Parameters @@ -103,12 +102,15 @@ cdef void unravel_index_cython(intp_t index, const intp_t[:] shape, intp_t[:] co """ cdef intp_t ndim = shape.shape[0] cdef intp_t j, size + cdef vector[intp_t] coords = vector[intp_t](ndim) for j in range(ndim - 1, -1, -1): size = shape[j] coords[j] = index % size index //= size + return coords + cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shape) noexcept nogil: """Converts a tuple of coordinate arrays into a flat index. 
From 3f77cb80a20292201794b30687961647aeb2eb11 Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 15:40:30 -0500 Subject: [PATCH 05/10] add tests --- sktree/tree/tests/test_utils.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sktree/tree/tests/test_utils.py b/sktree/tree/tests/test_utils.py index 3cf6705bf..4a9708d9d 100644 --- a/sktree/tree/tests/test_utils.py +++ b/sktree/tree/tests/test_utils.py @@ -7,7 +7,7 @@ from sktree._lib.sklearn.tree._criterion import Gini from sktree._lib.sklearn.tree._utils import _any_isnan_axis0 -from .._utils import ravel_multi_index, unravel_index +from .._utils import ravel_multi_index, unravel_index, cartesian_python from ..manifold._morf_splitter import BestPatchSplitterTester @@ -142,7 +142,7 @@ def test_unravel_index(): shape = np.asarray((5,)) expected_output = [(0,), (1,), (2,), (3,), (4,)] for idx, index in enumerate(indices): - assert unravel_index(index, shape) == expected_output[idx] + assert_equal(unravel_index(index, shape), expected_output[idx]) # Test with 2D array indices = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8]) @@ -202,3 +202,12 @@ def test_ravel_multi_index(): # assert str(e) == "Invalid index" # else: # assert False, "Expected ValueError" + + +def test_cartesian_prod(): + sequences = [[1, 2], [3, 4, 5]] + + from_itertools = list(product(*sequences)) + from_cython = cartesian_python(sequences) + + assert_equal(from_itertools, from_cython) \ No newline at end of file From 074ee50ded3b15025fb5d755ad7b351517a7f13d Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 16:00:49 -0500 Subject: [PATCH 06/10] Fix GIL and cython errors --- sktree/tree/manifold/_morf_splitter.pyx | 28 ++++++++----------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index 58adb6396..0b0dd205c 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ 
-325,14 +325,13 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): proj_i : intp_t The index of feature. patch_size : intp_t - The size of the patch. + The total size of the patch. top_left_patch_seed : intp_t - The top-left seed of the patch. + The top-left seed of the patch raveled. patch_dims : array-like, shape (n_dims,) The dimensions of the patch. """ # initialize a buffer to allow for Fisher-Yates - cdef vector[intp_t] _index_patch_buffer cdef vector[intp_t] _index_data_buffer cdef UINT32_t* random_state = &self.rand_r_state @@ -341,23 +340,20 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef vector[vector[intp_t]] points = vector[vector[intp_t]](ndim) cdef vector[intp_t] temp = vector[intp_t](ndim) cdef intp_t patch_dim - cdef intp_t patch_idx cdef intp_t idx cdef intp_t i - cdef intp_t[:] unraveled_patch_point # weights are default to 1 cdef float32_t weight = 1. - unravel_index_cython(top_left_patch_seed, self.data_dims, unraveled_patch_point) + cdef vector[intp_t] unraveled_patch_point = vector[intp_t](self.ndim) + unraveled_patch_point = unravel_index_cython(top_left_patch_seed, self.data_dims) - # # TODO for dim_idx in range(ndim): if self.dim_contiguous[dim_idx]: patch_dim = patch_dims[dim_idx] - patch_idx = unraveled_patch_point[dim_idx] - idx = patch_dim + patch_idx - for i in range(patch_idx, idx): + idx = patch_dim + unraveled_patch_point[dim_idx] + for i in range(unraveled_patch_point[dim_idx], idx): points[dim_idx].push_back(i) else: num_rows = self.data_dims[dim_idx] @@ -368,17 +364,12 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): j = rand_int(0, num_rows - i, random_state) _index_data_buffer[i], _index_data_buffer[j] = \ _index_data_buffer[j], _index_data_buffer[i] - for i in range(num_rows): - _index_patch_buffer.push_back(_index_data_buffer[i]) - for i in range(0, patch_dims[dim_idx]): #populate - points[dim_idx].push_back(_index_patch_buffer[i]) + for i in range(0, patch_dims[dim_idx]): # populate + 
points[dim_idx].push_back(_index_data_buffer[i]) _index_data_buffer.clear() - _index_patch_buffer.clear() - - # make cartesian product of the points, ravel then to proj_mat - cdef intp_t[:] tmp + # make cartesian product of the points, ravel, then add to proj_mat_indices cdef vector[vector[intp_t]] products = cartesian_cython(points) for point in products: vectorized_point = ravel_multi_index_cython(point, self.data_dims) @@ -457,7 +448,6 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): # convert the projection matrix to something that can be used in Python proj_vecs = np.zeros((1, self.n_features), dtype=np.float64) for i in range(0, 1): - print(i) for j in range(0, proj_mat_weights[i].size()): weight = proj_mat_weights[i][j] feat = proj_mat_indices[i][j] From 43cee1eee18bc74d40897c9520801caa962613ea Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 16:23:23 -0500 Subject: [PATCH 07/10] Remove unnecessary line --- sktree/tree/manifold/_morf_splitter.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index 0b0dd205c..8ac880244 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -228,8 +228,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.max_patch_dims[idx] + 1, random_state ) - # samples patch size for a dimension - # sample the top-left index and patch size for this dimension based on boundary effects if self.boundary is None: # compute the difference between the image dimensions and the current From a823556b3adfbe52792e7719e1850498909aa93d Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 16:47:02 -0500 Subject: [PATCH 08/10] Add to changelog --- doc/whats_new/v0.7.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index 717faee65..125574a60 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -25,6 +25,9 @@ Changelog by 
`Adam Li`_ (:pr:`#211`) - |Feature| Introduce a new set of simulations based on Marron and Wand 1992. by `Sambit Panda`_ (:pr:`#203`) +- |Fix| :class:`sktree.tree.PatchObliqueDecisionTreeClassifier` now correctly + handles the case where one or more features are discontiguous. + by `Jaewon Chung`_ (:pr:`#219`). Code and Documentation Contributors ----------------------------------- @@ -34,3 +37,4 @@ the project since version inception, including: * `Adam Li`_ * `Sambit Panda`_ +* `Jaewon Chung`_ From b7887d35ecfac1f9694800c5ce0aee829e2fa88f Mon Sep 17 00:00:00 2001 From: j1c Date: Tue, 13 Feb 2024 16:49:43 -0500 Subject: [PATCH 09/10] isort --- sktree/tree/manifold/_morf_splitter.pyx | 2 +- sktree/tree/tests/test_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index 8ac880244..dfaec589a 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -16,7 +16,7 @@ from libcpp.vector cimport vector from ..._lib.sklearn.tree._criterion cimport Criterion from ..._lib.sklearn.tree._utils cimport rand_int -from .._utils cimport ravel_multi_index_cython, unravel_index_cython, cartesian_cython +from .._utils cimport cartesian_cython, ravel_multi_index_cython, unravel_index_cython cdef class PatchSplitter(BestObliqueSplitter): diff --git a/sktree/tree/tests/test_utils.py b/sktree/tree/tests/test_utils.py index 4a9708d9d..a0c4e1c07 100644 --- a/sktree/tree/tests/test_utils.py +++ b/sktree/tree/tests/test_utils.py @@ -7,7 +7,7 @@ from sktree._lib.sklearn.tree._criterion import Gini from sktree._lib.sklearn.tree._utils import _any_isnan_axis0 -from .._utils import ravel_multi_index, unravel_index, cartesian_python +from .._utils import cartesian_python, ravel_multi_index, unravel_index from ..manifold._morf_splitter import BestPatchSplitterTester From 896ec475c3fc13b659bbb6e5c12358b4fbb805b9 Mon Sep 17 00:00:00 2001 From: j1c 
Date: Tue, 13 Feb 2024 16:52:15 -0500 Subject: [PATCH 10/10] Black/fix lint --- sktree/tree/_utils.pxd | 2 +- sktree/tree/_utils.pyx | 2 +- sktree/tree/manifold/_morf_splitter.pyx | 7 ++----- sktree/tree/tests/test_utils.py | 2 +- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/sktree/tree/_utils.pxd b/sktree/tree/_utils.pxd index bd698cd95..ef4418033 100644 --- a/sktree/tree/_utils.pxd +++ b/sktree/tree/_utils.pxd @@ -24,4 +24,4 @@ cdef intp_t ravel_multi_index_cython(vector[intp_t] coords, const intp_t[:] shap cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]] sequences) noexcept nogil -cpdef cartesian_python(vector[vector[intp_t]]& sequences) \ No newline at end of file +cpdef cartesian_python(vector[vector[intp_t]]& sequences) diff --git a/sktree/tree/_utils.pyx b/sktree/tree/_utils.pyx index 8211f884e..dab38e989 100644 --- a/sktree/tree/_utils.pyx +++ b/sktree/tree/_utils.pyx @@ -165,4 +165,4 @@ cdef vector[vector[intp_t]] cartesian_cython(vector[vector[intp_t]] sequences) n cpdef cartesian_python(vector[vector[intp_t]]& sequences): - return cartesian_cython(sequences) \ No newline at end of file + return cartesian_cython(sequences) diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index dfaec589a..437aeff19 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -301,7 +301,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): self.patch_dims_buff ) - cdef void sample_proj_vec( self, vector[vector[float32_t]]& proj_mat_weights, @@ -336,7 +335,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): cdef intp_t num_rows cdef int ndim = self.ndim cdef vector[vector[intp_t]] points = vector[vector[intp_t]](ndim) - cdef vector[intp_t] temp = vector[intp_t](ndim) cdef intp_t patch_dim cdef intp_t idx cdef intp_t i @@ -363,18 +361,17 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): _index_data_buffer[i], _index_data_buffer[j] = \ 
_index_data_buffer[j], _index_data_buffer[i] - for i in range(0, patch_dims[dim_idx]): # populate + for i in range(0, patch_dims[dim_idx]): # populate points[dim_idx].push_back(_index_data_buffer[i]) _index_data_buffer.clear() # make cartesian product of the points, ravel, then add to proj_mat_indices - cdef vector[vector[intp_t]] products = cartesian_cython(points) + cdef vector[vector[intp_t]] products = cartesian_cython(points) for point in products: vectorized_point = ravel_multi_index_cython(point, self.data_dims) proj_mat_indices[proj_i].push_back(vectorized_point) proj_mat_weights[proj_i].push_back(weight) - cdef void compute_features_over_samples( self, intp_t start, diff --git a/sktree/tree/tests/test_utils.py b/sktree/tree/tests/test_utils.py index a0c4e1c07..1cb112045 100644 --- a/sktree/tree/tests/test_utils.py +++ b/sktree/tree/tests/test_utils.py @@ -210,4 +210,4 @@ def test_cartesian_prod(): from_itertools = list(product(*sequences)) from_cython = cartesian_python(sequences) - assert_equal(from_itertools, from_cython) \ No newline at end of file + assert_equal(from_itertools, from_cython)