Merge pull request #165 from rapidsai/branch-0.5

[gpuCI] Auto-merge branch-0.5 to branch-0.6 [skip ci]
rapidsai · Jan 30, 2019 · c651e6d · c651e6d
2 parents 6277f1e + 7fc5e86
commit c651e6d
Show file tree

Hide file tree

Showing 8 changed files with 96 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@
 - PR #94: Add cmake flag to set ABI compatibility
 - PR #139: Move thirdparty submodules to root and add symlinks to new locations
 - PR #151: Replace TravisCI testing and conda pkg builds with gpuCI
+- PR #164: Add numba kernel for faster column to row major transform
 
 ## Bug Fixes
 

diff --git a/python/cuML/__init__.py b/python/cuML/__init__.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2018, NVIDIA CORPORATION.
 # Versioneer
+from cuML import numba_utils
+
 from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
diff --git a/python/cuML/_version.py b/python/cuML/_version.py
@@ -43,7 +43,7 @@ def get_config():
     cfg.style = "pep440"
     cfg.tag_prefix = "v"
     cfg.parentdir_prefix = "cuml-"
-    cfg.versionfile_source = "python/cuML/_version.py"
+    cfg.versionfile_source = "cuML/_version.py"
     cfg.verbose = False
     return cfg
 

diff --git a/python/cuML/dbscan/dbscan_wrapper.pyx b/python/cuML/dbscan/dbscan_wrapper.pyx
@@ -22,6 +22,8 @@ from libcpp cimport bool
 import ctypes
 from libc.stdint cimport uintptr_t
 from c_dbscan cimport *
+# temporary import for numba_utils
+from cuML import numba_utils
 
 
 class DBSCAN:
@@ -82,7 +84,7 @@ class DBSCAN:
         cdef uintptr_t input_ptr
         if (isinstance(X, cudf.DataFrame)):
             self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
-            X_m = X.as_gpu_matrix(order = "C")
+            X_m = numba_utils.row_matrix(X)
             self.n_rows = len(X)
             self.n_cols = len(X._cols)
 

diff --git a/python/cuML/kalman/kalman_filter.pyx b/python/cuML/kalman/kalman_filter.pyx
@@ -15,6 +15,8 @@
 
 import numpy as np
 from numba import cuda
+# temporary import for numba_utils
+from cuML import numba_utils
 
 
 cdef extern from "kalman_filter/kf_variables.h" namespace "kf::linear":
@@ -520,7 +522,7 @@ class KalmanFilter:
     def __setattr__(self, name, value):
         if name in ["F", "x_up", "x", "P_up", "P", "Q", "H", "R", "z"]:
             if (isinstance(value, cudf.DataFrame)):
-                val = value.as_gpu_matrix(order='C')
+                val = numba_utils.row_matrix(value)
 
             elif (isinstance(value, cudf.Series)):
                 val = value.to_gpu_array()

diff --git a/python/cuML/numba_utils.py b/python/cuML/numba_utils.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2018, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import pandas as pd
+import cudf
+import numba
+from librmm_cffi import librmm as rmm
+from numba.cuda.cudadrv.driver import driver
+import math
+from numba import cuda
+
+
+def row_matrix(df):
+    """Return the contents of ``df`` as a C (row major) GPU matrix.
+
+    ``df.as_gpu_matrix(order='F')`` produces a column major device matrix; a
+    numba CUDA kernel then copies it, tile by tile through shared memory,
+    into a newly allocated row major device array.
+
+    :param df: a ``cudf.DataFrame`` with at least one column.
+    :return: device array of shape ``(len(df), number of columns)``, C order.
+
+    Adapted from numba (tiling scheme from the blog post below):
+    https://github.com/numba/numba/blob/master/numba/cuda/kernels/transpose.py
+    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/
+
+    To be replaced by CUDA ml-prim in upcoming version
+    """
+
+    cols = [df._cols[k] for k in df._cols]
+    ncol = len(cols)
+    nrow = len(df)
+
+    a = df.as_gpu_matrix(order='F')
+    b = rmm.device_array((nrow, ncol), dtype=cols[0].dtype, order='C')
+    # The shared tile must use the matrix's own element type: a hard-coded
+    # float32 tile silently truncates float64 data on its way through.
+    tile_dtype = numba.from_dtype(a.dtype)
+
+    # Square tile: the kernel swaps threadIdx.x and threadIdx.y between its
+    # load and store phases, which is only correct for equal block dims.
+    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
+    tile_dim = int(math.sqrt(tpb))
+    tile_shape = (tile_dim, tile_dim + 1)  # +1 padding column kept from numba
+
+    @cuda.jit
+    def kernel(src, dst):
+        tile = cuda.shared.array(shape=tile_shape, dtype=tile_dtype)
+
+        tx = cuda.threadIdx.x
+        ty = cuda.threadIdx.y
+        bx = cuda.blockIdx.x * cuda.blockDim.x
+        by = cuda.blockIdx.y * cuda.blockDim.y
+        y = by + tx
+        x = bx + ty
+
+        if by + ty < src.shape[0] and bx + tx < src.shape[1]:
+            tile[ty, tx] = src[by + ty, bx + tx]
+        cuda.syncthreads()  # tile must be fully loaded before it is read
+        if y < dst.shape[0] and x < dst.shape[1]:
+            dst[y, x] = tile[tx, ty]
+
+    # one block per tile, plus one for remainders
+    blocks = int(ncol / tile_dim + 1), int(nrow / tile_dim + 1)
+    # one thread per tile element
+    threads = tile_dim, tile_dim
+    kernel[blocks, threads](a, b)
+
+    return b
diff --git a/python/setup.cfg b/python/setup.cfg
@@ -9,7 +9,7 @@ exclude = cuML,ml-prims,__init__.py,versioneer.py
 [versioneer]
 VCS = git
 style = pep440
-versionfile_source = python/cuML/_version.py
-versionfile_build = python/cuML/_version.py
+versionfile_source = cuML/_version.py
+versionfile_build = cuML/_version.py
 tag_prefix = v
 parentdir_prefix = cuml-
diff --git a/python/setup.py b/python/setup.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 
-from setuptools import setup
+from setuptools import setup, find_packages
 from setuptools.extension import Extension
 from Cython.Build import cythonize
 import numpy
@@ -64,6 +64,7 @@
       author="NVIDIA Corporation",
       setup_requires=['cython'],
       ext_modules=cythonize(extensions),
+      packages=find_packages(include=['cuML', 'cuML.*']),
       install_requires=install_requires,
       license="Apache",
       cmdclass=versioneer.get_cmdclass(),