diff --git a/.gitignore b/.gitignore
index 6926523a6..43bfd39e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
build
+buildcuda
+compile
# Dependency folders
depends/*/
@@ -123,6 +125,11 @@ ClientBin/
*.pfx
*.publishsettings
+# Nsight Nvidia Eclipse
+.cproject
+.project
+.settings/
+
# RIA/Silverlight projects
Generated_Code/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d8ef04736..39bba8717 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,3 +1,4 @@
+#CMAKE_MINIMUM_REQUIRED(VERSION 3.28.3)
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12.1)
SET(PROJECT_VER_MAJOR 0)
@@ -45,7 +46,6 @@ OPTION(ENABLE_PROFILING "Collect profiling stats (memory consuming)" OFF)
IF(ENABLE_PROFILING)
SET(LIBFREENECT2_WITH_PROFILING 1)
ENDIF()
-
IF(MSVC)
# suppress several "possible loss of data" warnings, and
# "zero-length array in struct" from libusb.h
@@ -266,6 +266,7 @@ ENDIF()
SET(HAVE_OpenGL disabled)
IF(ENABLE_OPENGL)
FIND_PACKAGE(GLFW3)
+ set(OpenGL_GL_PREFERENCE "GLVND")
FIND_PACKAGE(OpenGL)
SET(HAVE_OpenGL no)
IF(GLFW3_FOUND AND OPENGL_FOUND)
@@ -277,7 +278,8 @@ IF(ENABLE_OPENGL)
LIST(APPEND LIBFREENECT2_DLLS ${GLFW3_DLL})
LIST(APPEND LIBRARIES
${GLFW3_LIBRARIES}
- ${OPENGL_gl_LIBRARY}
+ ${OPENGL_GL_LIBRARY}
+ ${OPENGL_LIBRARIES}
)
LIST(APPEND SOURCES
src/flextGL.cpp
@@ -337,19 +339,27 @@ IF(ENABLE_OPENCL)
ENDIF(OpenCL_FOUND)
ENDIF(ENABLE_OPENCL)
+
SET(HAVE_CUDA disabled)
IF(ENABLE_CUDA)
- FIND_PACKAGE(CUDA)
+ INCLUDE(CheckLanguage)
+ CHECK_LANGUAGE(CUDA)
+# FIND_PACKAGE(CUDA)
SET(HAVE_CUDA no)
- IF(CUDA_FOUND AND MSVC14 AND CUDA_VERSION VERSION_LESS 8.0)
+ #IF(CUDA_FOUND AND MSVC14 AND CUDA_VERSION VERSION_LESS 8.0)
+ IF(CMAKE_CUDA_COMPILER AND MSVC14 AND CUDA_VERSION VERSION_LESS 8.0)
SET(HAVE_CUDA "no (VS2015 not supported)")
- ELSEIF(CUDA_FOUND)
+ #ELSEIF(CUDA_FOUND)
+ ELSEIF(CMAKE_CUDA_COMPILER)
SET(LIBFREENECT2_WITH_CUDA_SUPPORT 1)
SET(HAVE_CUDA yes)
+ ENABLE_LANGUAGE(CUDA)
+ #FIND_PACKAGE(CUDAToolkit)
STRING(REPLACE "\\" "/" NVCUDASAMPLES_ROOT "$ENV{NVCUDASAMPLES_ROOT}")
STRING(REPLACE "\\" "/" NVCUDASAMPLES8_0_ROOT "$ENV{NVCUDASAMPLES8_0_ROOT}")
- CUDA_INCLUDE_DIRECTORIES(
+ #CUDA_INCLUDE_DIRECTORIES(
+ INCLUDE_DIRECTORIES(
"${MY_DIR}/include/"
"${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc"
"${NVCUDASAMPLES_ROOT}/common/inc"
@@ -357,7 +367,7 @@ IF(ENABLE_CUDA)
)
SET(CUDA_FLAGS -use_fast_math)
IF(NOT MSVC)
- SET(CUDA_FLAGS "${CUDA_FLAGS} -Xcompiler -fPIC")
+ SET(CUDA_FLAGS "${CUDA_FLAGS} -Xcompiler -fPIC -D_FORCE_INLINES")
ENDIF()
IF(HAVE_CXX11 STREQUAL yes AND CUDA_VERSION VERSION_GREATER 7.0)
SET(CUDA_FLAGS "${CUDA_FLAGS} -std=c++11")
@@ -365,25 +375,42 @@ IF(ENABLE_CUDA)
SET(OLD_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
STRING(REGEX REPLACE "-std=c\\+\\+.." "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
- CUDA_COMPILE(CUDA_OBJECTS
+ # Thrust requires exceptions. If OpenCL from NVidia is used we don't need this flag.
+ STRING(REGEX REPLACE "-fno-exceptions" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+ MESSAGE(STATUS "*************")
+ MESSAGE(STATUS ${CMAKE_CXX_FLAGS})
+ MESSAGE(STATUS ${CUDA_FLAGS})
+ MESSAGE(STATUS ${CUDA_INCLUDE_DIRS})
+ MESSAGE(STATUS ${CUDA_LIBRARIES})
+ MESSAGE(STATUS ${CUDA_OBJECTS})
+ MESSAGE(STATUS "*************")
+ #CUDA_COMPILE(CUDA_OBJECTS
+ LIST(APPEND SOURCES
+ #include/libfreenect2/cuda_registration.h
src/cuda_depth_packet_processor.cu
src/cuda_kde_depth_packet_processor.cu
- OPTIONS ${CUDA_FLAGS}
+ src/cuda_registration.cu
+ #OPTIONS ${CUDA_FLAGS}
)
SET(CMAKE_CXX_FLAGS "${OLD_CMAKE_CXX_FLAGS}")
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
- LIST(APPEND SOURCES
- ${CUDA_OBJECTS}
- )
+ #LIST(APPEND SOURCES
+ # ${CUDA_OBJECTS}
+ #)
- LIST(APPEND LIBRARIES
- ${CUDA_LIBRARIES}
- )
+ #LIST(APPEND LIBRARIES
+ # ${CUDA_LIBRARIES}
+ #)
+ #set_target_properties(freenect2 PROPERTIES CUDA_ARCHITECTURES "35;50;75")
+ #SET_PROPERTY(TARGET freenect2 PROPERTY CUDA_ARCHITECTURES OFF)
ENDIF()
+ #ENDIF(CMAKE_CUDA_COMPILER)
ENDIF(ENABLE_CUDA)
+
+
# RPATH handling for CUDA 8.0 libOpenCL.so conflict. See #804.
IF(HAVE_OpenCL STREQUAL yes AND UNIX AND NOT APPLE)
FILE(GLOB CUDA_ld_so_conf /etc/ld.so.conf.d/cuda*.conf)
diff --git a/doc/sequence.svg b/doc/sequence.svg
new file mode 100644
index 000000000..351c09b09
--- /dev/null
+++ b/doc/sequence.svg
@@ -0,0 +1,433 @@
+
+
+
+
diff --git a/examples/Protonect.cpp b/examples/Protonect.cpp
index 2fbf4a67b..c34ec8c3a 100644
--- a/examples/Protonect.cpp
+++ b/examples/Protonect.cpp
@@ -101,9 +101,13 @@ class MyFileLogger: public libfreenect2::Logger
* Main application entry point.
*
* Accepted argumemnts:
- * - cpu Perform depth processing with the CPU.
- * - gl Perform depth processing with OpenGL.
- * - cl Perform depth processing with OpenCL.
+ * - cpu Perform depth processing with the CPU.
+ * - gl Perform depth processing with OpenGL.
+ * - cl Perform depth processing with OpenCL.
+ * - clkde
+ * - cuda
+ * - cudakde
+ * - cudaccess Does not send data to CPU.
* - Serial number of the device to open.
* - -noviewer Disable viewer window.
*/
@@ -113,7 +117,7 @@ int main(int argc, char *argv[])
std::string program_path(argv[0]);
std::cerr << "Version: " << LIBFREENECT2_VERSION << std::endl;
std::cerr << "Environment variables: LOGFILE=" << std::endl;
- std::cerr << "Usage: " << program_path << " [-gpu=] [gl | cl | clkde | cuda | cudakde | cpu] []" << std::endl;
+ std::cerr << "Usage: " << program_path << " [-gpu=] [gl | cl | clkde | cuda | cudakde | cudaccess | cpu] []" << std::endl;
std::cerr << " [-noviewer] [-norgb | -nodepth] [-help] [-version]" << std::endl;
std::cerr << " [-frames ]" << std::endl;
std::cerr << "To pause and unpause: pkill -USR1 Protonect" << std::endl;
@@ -225,6 +229,15 @@ int main(int argc, char *argv[])
pipeline = new libfreenect2::CudaKdePacketPipeline(deviceId);
#else
std::cout << "CUDA pipeline is not supported!" << std::endl;
+#endif
+ }
+ else if(arg == "cudaccess")
+ {
+#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT
+ if(!pipeline)
+ pipeline = new libfreenect2::CudaAccessPacketPipeline(deviceId);
+#else
+ std::cout << "CUDA pipeline is not supported!" << std::endl;
#endif
}
else if(arg.find_first_not_of("0123456789") == std::string::npos) //check if parameter could be a serial number
diff --git a/include/libfreenect2/cuda_registration.h b/include/libfreenect2/cuda_registration.h
new file mode 100644
index 000000000..76c995e0d
--- /dev/null
+++ b/include/libfreenect2/cuda_registration.h
@@ -0,0 +1,162 @@
+/*
+ * This file is part of the OpenKinect Project. http://www.openkinect.org
+ *
+ * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file
+ * for details.
+ *
+ * This code is licensed to you under the terms of the Apache License, version
+ * 2.0, or, at your option, the terms of the GNU General Public License,
+ * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses,
+ * or the following URLs:
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * If you redistribute this file in source form, modified or unmodified, you
+ * may:
+ * 1) Leave this header intact and distribute it under the same terms,
+ * accompanying it with the APACHE20 and GPL20 files, or
+ * 2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or
+ * 3) Delete the GPL v2 clause and accompany it with the APACHE20 file
+ * In all cases you must keep the copyright notice intact and include a copy
+ * of the CONTRIB file.
+ *
+ * Binary distributions must follow the binary distribution requirements of
+ * either License.
+ */
+
+/** @file cuda_registration.h Class for merging depth and color frames using cuda. */
+
+#ifndef CUDA_REGISTRATION_H_
+#define CUDA_REGISTRATION_H_
+
+#include
+#include
+#include
+#include
+
+#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT
+#include
+#include
+#include
+#include
+#include
+
+namespace libfreenect2
+{
+
+typedef thrust::tuple TupleXYZRGB;
+
+/**
+ * Frame whose data is allocated on device.
+ */
+class LIBFREENECT2_API CudaDeviceFrame: public Frame
+{
+public:
+ /** Construct a new frame.
+ * @param width Width in pixel
+ * @param height Height in pixel
+ * @param bytes_per_pixel Bytes per pixel
+ */
+ CudaDeviceFrame(size_t width, size_t height, size_t bytes_per_pixel);
+ virtual ~CudaDeviceFrame();
+private:
+ bool allocateMemory();
+};
+
+class CudaRegistrationImpl;
+
+/** @defgroup registration Registration and Geometry
+ * Register depth to color, create point clouds. */
+
+/** Combine frames of depth and color camera using gpus. @ingroup registration
+ * Right now this class uses a reverse engineered formula that uses factory
+ * preset extrinsic parameters the same way the Registration class does.
+ */
+class LIBFREENECT2_API CudaRegistration
+{
+public:
+ /**
+ * @param depth_p Depth camera parameters. You can use the factory values, or use your own.
+ * @param rgb_p Color camera parameters. Probably use the factory values for now.
+ */
+ CudaRegistration(Freenect2Device::IrCameraParams depth_p, Freenect2Device::ColorCameraParams rgb_p);
+ ~CudaRegistration();
+
+ /** Undistort and register a single depth point to color camera.
+ * @param dx Distorted depth coordinate x (pixel)
+ * @param dy Distorted depth coordinate y (pixel)
+ * @param dz Depth value (millimeter)
+ * @param[out] cx Undistorted color coordinate x (normalized)
+ * @param[out] cy Undistorted color coordinate y (normalized)
+ */
+ void apply(int dx, int dy, float dz, float& cx, float &cy) const;
+
+ /** Map color images onto depth images
+ * @param rgb Color image (1920x1080 BGRX)
+ * @param depth Depth image (512x424 float)
+ * @param[out] undistorted Undistorted depth image
+ * @param[out] registered Color image for the depth image (512x424)
+ * @param enable_filter Filter out pixels not visible to both cameras.
+ * @param[out] bigdepth If not `NULL`, return mapping of depth onto colors (1920x1082 float). **1082** not 1080, with a blank top and bottom row.
+ * @param[out] color_depth_map Index of mapped color pixel for each depth pixel (512x424).
+ */
+ bool apply(const Frame* rgb, const Frame* depth, CudaDeviceFrame* undistorted, CudaDeviceFrame* registered, const bool enable_filter = true, CudaDeviceFrame* bigdepth = 0, int* color_depth_map = 0) const;
+
+ /** Undistort depth
+ * @param depth Depth image (512x424 float)
+ * @param[out] undistorted Undistorted depth image
+ */
+ void undistortDepth(const Frame* depth, Frame* undistorted) const;
+
+ /** Construct a 3-D point with color in a point cloud.
+ * @param undistorted Undistorted depth frame from apply().
+ * @param registered Registered color frame from apply().
+ * @param r Row (y) index in depth image.
+ * @param c Column (x) index in depth image.
+ * @param[out] x X coordinate of the 3-D point (meter).
+ * @param[out] y Y coordinate of the 3-D point (meter).
+ * @param[out] z Z coordinate of the 3-D point (meter).
+ * @param[out] rgb Color of the 3-D point (BGRX). To unpack the data, use
+ *
+ * const uint8_t *p = reinterpret_cast(&rgb);
+ * uint8_t b = p[0];
+ * uint8_t g = p[1];
+ * uint8_t r = p[2];
+ */
+ void getPointXYZRGB (const Frame* undistorted, const Frame* registered, int r, int c, float& x, float& y, float& z, float& rgb) const;
+
+ /** Construct a 3-D point in a point cloud.
+ * @param undistorted Undistorted depth frame from apply().
+ * @param r Row (y) index in depth image.
+ * @param c Column (x) index in depth image.
+ * @param[out] x X coordinate of the 3-D point (meter).
+ * @param[out] y Y coordinate of the 3-D point (meter).
+ * @param[out] z Z coordinate of the 3-D point (meter).
+ */
+ void getPointXYZ (const Frame* undistorted, int r, int c, float& x, float& y, float& z) const;
+
+ /**
+ * Construct a point cloud as thrust vector of XYZRGB data as tuples of in device memory, which can be used
+ * for further processing with CUDA.
+ * @param undistorted Undistorted depth frame from apply().
+ * @param registered Registered color frame from apply().
+ * @param[out] cloud_data coordinates of the 3-D point (meter) and color (BGRX).
+ * To unpack the color data, use
+ * const uint8_t *p = reinterpret_cast(&rgb);
+ * uint8_t b = p[0];
+ * uint8_t g = p[1];
+ * uint8_t r = p[2];
+ */
+ void getPointXYZRGB(const Frame* undistorted, const Frame* registered, thrust::device_vector& cloud_data) const;
+
+private:
+ CudaRegistrationImpl *impl_;
+
+ /* Disable copy and assignment constructors */
+ CudaRegistration(const CudaRegistration&);
+ CudaRegistration& operator=(const CudaRegistration&);
+};
+#endif // LIBFREENECT2_WITH_CUDA_SUPPORT
+
+} /* namespace libfreenect2 */
+#endif /* REGISTRATION_H_ */
diff --git a/src/cuda_registration.cu b/src/cuda_registration.cu
new file mode 100644
index 000000000..e0c8872ee
--- /dev/null
+++ b/src/cuda_registration.cu
@@ -0,0 +1,558 @@
+/*
+ * This file is part of the OpenKinect Project. http://www.openkinect.org
+ *
+ * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file
+ * for details.
+ *
+ * This code is licensed to you under the terms of the Apache License, version
+ * 2.0, or, at your option, the terms of the GNU General Public License,
+ * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses,
+ * or the following URLs:
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * If you redistribute this file in source form, modified or unmodified, you
+ * may:
+ * 1) Leave this header intact and distribute it under the same terms,
+ * accompanying it with the APACHE20 and GPL20 files, or
+ * 2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or
+ * 3) Delete the GPL v2 clause and accompany it with the APACHE20 file
+ * In all cases you must keep the copyright notice intact and include a copy
+ * of the CONTRIB file.
+ *
+ * Binary distributions must follow the binary distribution requirements of
+ * either License.
+ */
+
+/** @file Implementation of merging depth and color images using cuda. */
+
+#include
+#include "libfreenect2/logging.h"
+#include
+
+#define MONO_ROWS 424
+#define MONO_COLS 512
+#define COLOR_ROWS 1080
+#define COLOR_COLS 1920
+
+
+typedef unsigned char uchar;
+
+#define CHECK_CUDA(expr) do { cudaError_t err = (expr); if (err != cudaSuccess) { LOG_ERROR << #expr ": " << cudaGetErrorString(err); return false; } } while(0)
+#define CALL_CUDA(expr) do { cudaError_t err = (expr); if (err != cudaSuccess) { LOG_ERROR << #expr ": " << cudaGetErrorString(err); } } while(0)
+
+static __device__
+void distort(int mx, int my, float& d_x, float& d_y, const libfreenect2::Freenect2Device::IrCameraParams& d_depth)
+{
+ float dx = ((float)mx - d_depth.cx) / d_depth.fx;
+ float dy = ((float)my - d_depth.cy) / d_depth.fy;
+ float dx2 = dx * dx;
+ float dy2 = dy * dy;
+ float r2 = dx2 + dy2;
+ float dxdy2 = 2 * dx * dy;
+ float kr = 1 + ((d_depth.k3 * r2 + d_depth.k2) * r2 + d_depth.k1) * r2;
+ d_x = d_depth.fx * (dx * kr + d_depth.p2 * (r2 + 2 * dx2) + d_depth.p1 * dxdy2) + d_depth.cx;
+ d_y = d_depth.fy * (dy * kr + d_depth.p1 * (r2 + 2 * dy2) + d_depth.p2 * dxdy2) + d_depth.cy;
+}
+
+static __device__
+void depth_to_color(float mx, float my, float& d_rx, float& d_ry,
+ const libfreenect2::Freenect2Device::IrCameraParams& d_depth,
+ const libfreenect2::Freenect2Device::ColorCameraParams& d_color,
+ const float depth_q, const float color_q)
+{
+ mx = (mx - d_depth.cx) * depth_q;
+ my = (my - d_depth.cy) * depth_q;
+
+ float wx =
+ (mx * mx * mx * d_color.mx_x3y0) + (my * my * my * d_color.mx_x0y3) +
+ (mx * mx * my * d_color.mx_x2y1) + (my * my * mx * d_color.mx_x1y2) +
+ (mx * mx * d_color.mx_x2y0) + (my * my * d_color.mx_x0y2) + (mx * my * d_color.mx_x1y1) +
+ (mx * d_color.mx_x1y0) + (my * d_color.mx_x0y1) + (d_color.mx_x0y0);
+
+ float wy =
+ (mx * mx * mx * d_color.my_x3y0) + (my * my * my * d_color.my_x0y3) +
+ (mx * mx * my * d_color.my_x2y1) + (my * my * mx * d_color.my_x1y2) +
+ (mx * mx * d_color.my_x2y0) + (my * my * d_color.my_x0y2) + (mx * my * d_color.my_x1y1) +
+ (mx * d_color.my_x1y0) + (my * d_color.my_x0y1) + (d_color.my_x0y0);
+
+ d_rx = (wx / (d_color.fx * color_q)) - (d_color.shift_m / d_color.shift_d);
+ d_ry = (wy / color_q) + d_color.cy;
+}
+
+static __global__
+void dInitMaps(int* d_map_dist, float* d_map_x, float* d_map_y, int* d_map_yi,
+ const libfreenect2::Freenect2Device::IrCameraParams d_depth,
+ const libfreenect2::Freenect2Device::ColorCameraParams d_color,
+ const float depth_q, const float color_q)
+{
+ // Configuration copied from cuda_depth_packet_processor.cu
+ const uint i = blockIdx.x*blockDim.x + threadIdx.x;
+
+ const uint x = i % MONO_COLS;
+ const uint y = i / MONO_COLS;
+
+ float mx, my;
+ int ix, iy, index;
+ float rx, ry;
+
+ // compute the distorted coordinate for current pixel
+ distort(x, y, mx, my, d_depth);
+
+ // rounding the values and check if the pixel is inside the image
+ ix = (int)(mx + 0.5f);
+ iy = (int)(my + 0.5f);
+ if(ix < 0 || ix >= 512 || iy < 0 || iy >= 424)
+ index = -1;
+ else
+ // computing the index from the coordinates for faster access to the data
+ index = iy * 512 + ix;
+ d_map_dist[i] = index;
+
+ // compute the depth to color mapping entries for the current pixel
+ depth_to_color(x, y, rx, ry, d_depth, d_color, depth_q, color_q);
+ d_map_x[i] = rx;
+ d_map_y[i] = ry;
+ // compute the y offset to minimize later computations
+ d_map_yi[i] = (int)(ry + 0.5f);
+}
+
+static __global__
+void setFloat(float* devPtr, float value)
+{
+ // Configuration copied from cuda_depth_packet_processor.cu
+ const uint i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ devPtr[i] = value;
+}
+
+/**
+ * Set all values of array of floats devPtr to value.
+ * This function does not call for synchronization.
+ * @param devPtr pointer to memory in device
+ * @param value value to set
+ * @param size number of float sized elements in array
+ */
+void cudaMemsetFloat(float* devPtr, float value, size_t size)
+{
+ size_t numThreads = 512;
+ size_t numBlocks = size / numThreads;
+ setFloat<<>>(devPtr, value);
+}
+
+/**
+ * Compares value at address with val, if val is smaller it
+ * saves it at address.
+ */
+__device__ float atomicKeepSmaller(float* address, float val)
+{
+ // Implementation addapted from http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
+ int* address_as_ull = (int*)address;
+ int old = *address_as_ull, assumed;
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ull, assumed,
+ __float_as_int(val < __int_as_float(assumed) ? val : __int_as_float(assumed)));
+ // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+ } while (assumed != old);
+
+ return __int_as_float(old);
+}
+
+
+static __global__
+void undistort(int* d_depth_to_c_off,
+ float* d_undistorted_data,
+ float* d_filter_map,
+ const float* d_depth_data, const int* d_map_dist,
+ const float* d_map_x, const int* d_map_yi,
+ const libfreenect2::Freenect2Device::IrCameraParams depth,
+ const libfreenect2::Freenect2Device::ColorCameraParams color,
+ const int filter_width_half,
+ const int filter_height_half,
+ const int offset_filter_map,
+ const bool enable_filter)
+{
+ // getting index of distorted depth pixel
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ const int index = d_map_dist[i];
+
+ // check if distorted depth pixel is outside of the depth image
+ if(index < 0){
+ d_depth_to_c_off[i] = -1;
+ d_undistorted_data[i] = 0;
+ return;
+ }
+
+ // getting depth value for current pixel
+ const float z = d_depth_data[index];
+ d_undistorted_data[i] = z;
+
+ // checking for invalid depth value
+ if(z <= 0.0f){
+ d_depth_to_c_off[i] = -1;
+ return;
+ }
+
+ // calculating x offset for rgb image based on depth value
+ const float color_cx = color.cx + 0.5f; // 0.5f added for later rounding
+ const float rx = (d_map_x[index] + (color.shift_m / z)) * color.fx + color_cx;
+ const int cx = rx; // same as round for positive numbers (0.5f was already added to color_cx)
+ // getting y offset for depth image
+ const int cy = d_map_yi[i];
+ // combining offsets
+ const int c_off = cx + cy * COLOR_COLS;
+
+ // check if c_off is outside of rgb image
+ // checking rx/cx is not needed because the color image is much wider then the depth image
+ if(c_off < 0 || c_off >= COLOR_ROWS * COLOR_COLS){
+ d_depth_to_c_off[i] = -1;
+ return;
+ }
+
+ // saving the offset for later
+ d_depth_to_c_off[i] = c_off;
+
+ // I am not sure if there won't be race conditions here due to overlap, the atomic operation should help.
+ if(enable_filter){
+ // setting a window around the filter map pixel corresponding to the color pixel with the current z value
+ int yi = (cy - filter_height_half) * 1920 + cx - filter_width_half; // index of first pixel to set
+ for(int r = -filter_height_half; r <= filter_height_half; ++r, yi += COLOR_COLS) // index increased by a full row each iteration
+ {
+ float *it = d_filter_map + offset_filter_map + yi;
+ for(int c = -filter_width_half; c <= filter_width_half; ++c, ++it)
+ {
+ // only set if the current z is smaller
+ atomicKeepSmaller(it, z);
+ }
+ }
+ }
+}
+
+/** Construct 'registered' image with filter.
+ * Filter drops duplicate pixels due to aspect of two cameras.
+ */
+static __global__
+void registerImageFiltered(unsigned int *d_registered_data,
+ const unsigned int * d_rgb_data,
+ const int* d_depth_to_c_off,
+ const float* d_undistorted_data,
+ const float *d_p_filter_map,
+ const float filter_tolerance)
+{
+ // getting index of distorted depth pixel
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // run through all registered color pixels and set them based on filter results
+ const int c_off = d_depth_to_c_off[i];
+
+ // check if offset is out of image
+ if(c_off < 0){
+ d_registered_data[i] = 0;
+ return;
+ }
+
+ const float min_z = d_p_filter_map[c_off];
+ const float z = d_undistorted_data[i];
+
+ // check for allowed depth noise
+ d_registered_data[i] = (z - min_z) / z > filter_tolerance ? 0 : d_rgb_data[c_off];
+
+}
+
+/** Construct 'registered' image. */
+static __global__
+void registerImage(unsigned int *d_registered_data,
+ const unsigned int * d_rgb_data,
+ const int* d_depth_to_c_off)
+{
+ // getting index of distorted depth pixel
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // run through all registered color pixels and set them based on c_off
+ const int c_off = d_depth_to_c_off[i];
+
+ // check if offset is out of image
+ d_registered_data[i] = c_off < 0 ? 0 : d_rgb_data[c_off];
+}
+
+namespace libfreenect2
+{
+
+CudaDeviceFrame::CudaDeviceFrame(size_t width, size_t height, size_t bytes_per_pixel):
+ Frame(width, height, bytes_per_pixel, (unsigned char*)-1)
+{
+ allocateMemory();
+}
+
+CudaDeviceFrame::~CudaDeviceFrame()
+{
+ CALL_CUDA(cudaFree(data));
+}
+
+bool CudaDeviceFrame::allocateMemory()
+{
+ CHECK_CUDA(cudaMalloc(&data, width * height * bytes_per_pixel * sizeof(unsigned char)));
+
+ cudaDeviceSynchronize();
+
+ CHECK_CUDA(cudaGetLastError());
+ return true;
+}
+
+/*
+ * The information used here has been taken from libfreenect2::Registration source
+ * code.
+ */
+static const float depth_q = 0.01;
+static const float color_q = 0.002199;
+
+class CudaRegistrationImpl
+{
+public:
+ CudaRegistrationImpl(Freenect2Device::IrCameraParams depth_p, Freenect2Device::ColorCameraParams rgb_p):
+ depth(depth_p), color(rgb_p),
+ filter_width_half(2), filter_height_half(1), filter_tolerance(0.01f),
+ block_size(128), grid_size(MONO_IMAGE_SIZE/block_size)
+ {
+ good = setupDevice();
+ if (!good)
+ return;
+
+ good = initMaps();
+ if (!good)
+ return;
+ }
+
+ ~CudaRegistrationImpl()
+ {
+ if (good)
+ freeDeviceMemory();
+ }
+
+ void apply(int dx, int dy, float dz, float& cx, float &cy) const;
+ bool apply(const Frame* rgb, const Frame* depth, CudaDeviceFrame* undistorted, CudaDeviceFrame* registered, const bool enable_filter, CudaDeviceFrame* bigdepth, int* color_depth_map) const;
+ void undistortDepth(const Frame *depth, Frame *undistorted) const;
+ void getPointXYZRGB (const Frame* undistorted, const Frame* registered, int r, int c, float& x, float& y, float& z, float& rgb) const;
+ void getPointXYZ (const Frame* undistorted, int r, int c, float& x, float& y, float& z) const;
+ void distort(int mx, int my, float& dx, float& dy) const;
+ void depth_to_color(float mx, float my, float& rx, float& ry) const;
+
+private:
+ Freenect2Device::IrCameraParams depth; ///< Depth camera parameters.
+ Freenect2Device::ColorCameraParams color; ///< Color camera parameters.
+
+ const int filter_width_half;
+ const int filter_height_half;
+ const float filter_tolerance;
+
+ static const size_t MONO_IMAGE_SIZE = MONO_COLS * MONO_ROWS;
+
+ size_t block_size;
+ size_t grid_size;
+
+ bool good; // Memory correctly allocated
+
+ // Maps
+ int* d_distort_map;
+ float* d_depth_to_color_map_x;
+ float* d_depth_to_color_map_y;
+ int* d_depth_to_color_map_yi;
+
+ bool allocateDeviceMemory()
+ {
+ CHECK_CUDA(cudaMalloc(&d_distort_map, MONO_IMAGE_SIZE * sizeof(int)));
+ CHECK_CUDA(cudaMalloc(&d_depth_to_color_map_x, MONO_IMAGE_SIZE * sizeof(float)));
+ CHECK_CUDA(cudaMalloc(&d_depth_to_color_map_y, MONO_IMAGE_SIZE * sizeof(float)));
+ CHECK_CUDA(cudaMalloc(&d_depth_to_color_map_yi, MONO_IMAGE_SIZE * sizeof(int)));
+
+ cudaDeviceSynchronize();
+
+ CHECK_CUDA(cudaGetLastError());
+ return true;
+ }
+
+ bool setupDevice()
+ {
+ // Continue to use same device than cuda_depth_packet_processor?
+ if (!allocateDeviceMemory())
+ return false;
+
+ return true;
+ }
+
+ bool initMaps()
+ {
+ dInitMaps<<>>(d_distort_map, d_depth_to_color_map_x,
+ d_depth_to_color_map_y, d_depth_to_color_map_yi,
+ depth, color, depth_q, color_q);
+
+ cudaDeviceSynchronize();
+ CHECK_CUDA(cudaGetLastError());
+
+ return true;
+ }
+
+ void freeDeviceMemory()
+ {
+ CALL_CUDA(cudaFree(d_distort_map));
+ CALL_CUDA(cudaFree(d_depth_to_color_map_x));
+ CALL_CUDA(cudaFree(d_depth_to_color_map_y));
+ CALL_CUDA(cudaFree(d_depth_to_color_map_yi));
+ }
+};
+
+CudaRegistration::CudaRegistration(Freenect2Device::IrCameraParams depth_p, Freenect2Device::ColorCameraParams rgb_p):
+ impl_(new CudaRegistrationImpl(depth_p, rgb_p)) {}
+
+CudaRegistration::~CudaRegistration()
+{
+ delete impl_;
+}
+
+bool CudaRegistration::apply(const Frame* rgb, const Frame* depth, CudaDeviceFrame* undistorted, CudaDeviceFrame* registered, const bool enable_filter, CudaDeviceFrame* bigdepth, int* color_depth_map) const
+{
+ return impl_->apply(rgb, depth, undistorted, registered, enable_filter, bigdepth, color_depth_map);
+}
+
+bool CudaRegistrationImpl::apply(const Frame *rgb, const Frame *depth, CudaDeviceFrame *undistorted, CudaDeviceFrame *registered, const bool enable_filter, CudaDeviceFrame *bigdepth, int *color_depth_map) const
+{
+ // Check if all frames are valid and have the correct size
+ if (!rgb || !depth || !undistorted || !registered ||
+ rgb->width != 1920 || rgb->height != 1080 || rgb->bytes_per_pixel != 4 ||
+ depth->width != 512 || depth->height != 424 || depth->bytes_per_pixel != 4 ||
+ undistorted->width != 512 || undistorted->height != 424 || undistorted->bytes_per_pixel != 4 ||
+ registered->width != 512 || registered->height != 424 || registered->bytes_per_pixel != 4)
+ {
+ LOG_ERROR << "Not applying" << std::endl;
+ return false;
+ }
+
+ // Setup memory
+
+ float *d_depth_data;
+ size_t depth_size = depth->width * depth->height * sizeof(float);
+ unsigned int *d_rgb_data;
+ size_t rgb_size = rgb->width * rgb->height * sizeof(unsigned int);
+
+ CHECK_CUDA(cudaMalloc(&d_depth_data, depth_size));
+ cudaMemcpy((void*)d_depth_data,
+ (const void*)depth->data, depth_size,
+ cudaMemcpyHostToDevice);
+
+ CHECK_CUDA(cudaMalloc(&d_rgb_data, rgb_size));
+ cudaMemcpy((void*)d_rgb_data,
+ (const void*)rgb->data, rgb_size,
+ cudaMemcpyHostToDevice);
+
+ cudaDeviceSynchronize();
+ CHECK_CUDA(cudaGetLastError());
+
+ float *d_undistorted_data = (float*)undistorted->data;
+ unsigned int *d_registered_data = (unsigned int*)registered->data;
+ const int *d_map_dist = d_distort_map;
+ const float *d_map_x = d_depth_to_color_map_x;
+ const int *d_map_yi = d_depth_to_color_map_yi;
+
+
+ // Setup parameters
+
+ const int size_depth = MONO_ROWS * MONO_COLS;
+ const int size_color = COLOR_ROWS * COLOR_COLS;
+ //const float color_cx = color.cx + 0.5f; // 0.5f added for later rounding
+
+ // size of filter map with a border of filter_height_half on top and bottom so that no check for borders is needed.
+ // since the color image is wide angle no border to the sides is needed.
+ const int size_filter_map = size_color + COLOR_COLS * filter_height_half * 2;
+ // offset to the important data
+ const int offset_filter_map = COLOR_COLS * filter_height_half;
+
+
+ // Auxiliary maps
+
+ // map for storing the min z values used for each color pixel
+ float *d_filter_map = NULL;
+ // pointer to the beginning of the important data
+ float *d_p_filter_map = NULL;
+
+ // map for storing the color offset for each depth pixel
+ int *d_depth_to_c_off;
+ CHECK_CUDA(cudaMalloc(&d_depth_to_c_off, size_depth * sizeof(int)));
+ if (color_depth_map)
+ {
+ // I don't know where this other color map could be coming from,
+ // so for the moment I will assume it is in host memory.
+ cudaMemcpy((void*)d_depth_to_c_off,
+ (const void*)color_depth_map, size_depth * sizeof(int),
+ cudaMemcpyHostToDevice);
+ }
+ //int *map_c_off = depth_to_c_off;
+
+ // initializing the depth_map with values outside of the Kinect2 range
+ if(enable_filter){
+ if(bigdepth)
+ {
+ d_filter_map = (float*)bigdepth->data;
+ }
+ else
+ {
+ CHECK_CUDA(cudaMalloc(&d_filter_map, size_filter_map * sizeof(float)));
+ }
+ d_p_filter_map = d_filter_map + offset_filter_map; // works the same even on device
+
+ cudaMemsetFloat(d_filter_map, std::numeric_limits::infinity(), size_filter_map);
+ }
+
+ /* Fix depth distortion, and compute pixel to use from 'rgb' based on depth measurement,
+ * stored as x/y offset in the rgb data.
+ */
+ undistort<<>>(d_depth_to_c_off,
+ d_undistorted_data, d_filter_map,
+ d_depth_data, d_map_dist,
+ d_map_x, d_map_yi,
+ this->depth, this->color, filter_width_half, filter_height_half, offset_filter_map, enable_filter);
+ if (enable_filter)
+ {
+ registerImageFiltered<<>>(d_registered_data,
+ d_rgb_data,
+ d_depth_to_c_off,
+ d_undistorted_data,
+ d_p_filter_map,
+ filter_tolerance);
+ if (!bigdepth)
+ {
+ CALL_CUDA(cudaFree(d_filter_map));
+ }
+ }
+ else
+ {
+ registerImage<<>>(d_registered_data,
+ d_rgb_data,
+ d_depth_to_c_off);
+ }
+
+ // Finish
+
+ // -1 represents Invalid
+ //undistorted->format = undistorted->Float;
+ //registered->format = registered->BGRX;
+
+
+ if (color_depth_map)
+ {
+ // I don't know where this other color map could be coming from,
+ // so for the moment I will assume it is in host memory.
+ // Placing it back to where it came from
+ cudaMemcpy((void*)color_depth_map,
+ (const void*)d_depth_to_c_off, size_depth * sizeof(int),
+ cudaMemcpyDeviceToHost);
+ }
+ CALL_CUDA(cudaFree(d_depth_to_c_off));
+
+ CALL_CUDA(cudaFree(d_depth_data));
+ CALL_CUDA(cudaFree(d_rgb_data));
+
+ return true;
+}
+
+} /* namespace libfreenect2 */
diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp
index d8db14d3c..c7f2a6953 100644
--- a/src/opencl_depth_packet_processor.cpp
+++ b/src/opencl_depth_packet_processor.cpp
@@ -251,7 +251,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging
newIrFrame();
newDepthFrame();
- const int CL_ICDL_VERSION = 2;
+ //const int CL_ICDL_VERSION = 2; // Commented out because it is defined somewhere else
typedef cl_int (*icdloader_func)(int, size_t, void*, size_t*);
#ifdef _MSC_VER
#pragma warning(push)
diff --git a/src/opencl_kde_depth_packet_processor.cpp b/src/opencl_kde_depth_packet_processor.cpp
index 98dbdff54..97765f415 100644
--- a/src/opencl_kde_depth_packet_processor.cpp
+++ b/src/opencl_kde_depth_packet_processor.cpp
@@ -259,7 +259,7 @@ class OpenCLKdeDepthPacketProcessorImpl: public WithPerfLogging
newIrFrame();
newDepthFrame();
- const int CL_ICDL_VERSION = 2;
+ //const int cl_icdl_version = 2; // CL_ICDL_VERSION is defined somewhere else
typedef cl_int (*icdloader_func)(int, size_t, void*, size_t*);
#ifdef _MSC_VER
#pragma warning(push)
@@ -277,6 +277,7 @@ class OpenCLKdeDepthPacketProcessorImpl: public WithPerfLogging
if (clGetICDLoaderInfoOCLICD != NULL)
{
char buf[16];
+ //if (clGetICDLoaderInfoOCLICD(cl_icdl_version, sizeof(buf), buf, NULL) == CL_SUCCESS)
if (clGetICDLoaderInfoOCLICD(CL_ICDL_VERSION, sizeof(buf), buf, NULL) == CL_SUCCESS)
{
if (strcmp(buf, "2.2.4") < 0)
diff --git a/src/registration.cpp b/src/registration.cpp
index 49a3b03e0..b4a9ddd07 100644
--- a/src/registration.cpp
+++ b/src/registration.cpp
@@ -380,7 +380,7 @@ RegistrationImpl::RegistrationImpl(Freenect2Device::IrCameraParams depth_p, Free
for (int y = 0; y < 424; y++) {
for (int x = 0; x < 512; x++) {
- // compute the dirstored coordinate for current pixel
+ // compute the distorted coordinate for current pixel
distort(x,y,mx,my);
// rounding the values and check if the pixel is inside the image
ix = (int)(mx + 0.5f);
@@ -388,7 +388,7 @@ RegistrationImpl::RegistrationImpl(Freenect2Device::IrCameraParams depth_p, Free
if(ix < 0 || ix >= 512 || iy < 0 || iy >= 424)
index = -1;
else
- // computing the index from the coordianted for faster access to the data
+ // computing the index from the coordinates for faster access to the data
index = iy * 512 + ix;
*map_dist++ = index;