diff --git a/.gitignore b/.gitignore index 6926523a6..43bfd39e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ build +buildcuda +compile # Dependency folders depends/*/ @@ -123,6 +125,11 @@ ClientBin/ *.pfx *.publishsettings +# Nsight Nvidia Eclipse +.cproject +.project +.settings/ + # RIA/Silverlight projects Generated_Code/ diff --git a/CMakeLists.txt b/CMakeLists.txt index d8ef04736..39bba8717 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,4 @@ +#CMAKE_MINIMUM_REQUIRED(VERSION 3.28.3) CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12.1) SET(PROJECT_VER_MAJOR 0) @@ -45,7 +46,6 @@ OPTION(ENABLE_PROFILING "Collect profiling stats (memory consuming)" OFF) IF(ENABLE_PROFILING) SET(LIBFREENECT2_WITH_PROFILING 1) ENDIF() - IF(MSVC) # suppress several "possible loss of data" warnings, and # "zero-length array in struct" from libusb.h @@ -266,6 +266,7 @@ ENDIF() SET(HAVE_OpenGL disabled) IF(ENABLE_OPENGL) FIND_PACKAGE(GLFW3) + set(OpenGL_GL_PREFERENCE "GLVND") FIND_PACKAGE(OpenGL) SET(HAVE_OpenGL no) IF(GLFW3_FOUND AND OPENGL_FOUND) @@ -277,7 +278,8 @@ IF(ENABLE_OPENGL) LIST(APPEND LIBFREENECT2_DLLS ${GLFW3_DLL}) LIST(APPEND LIBRARIES ${GLFW3_LIBRARIES} - ${OPENGL_gl_LIBRARY} + ${OPENGL_GL_LIBRARY} + ${OPENGL_LIBRARIES} ) LIST(APPEND SOURCES src/flextGL.cpp @@ -337,19 +339,27 @@ IF(ENABLE_OPENCL) ENDIF(OpenCL_FOUND) ENDIF(ENABLE_OPENCL) + SET(HAVE_CUDA disabled) IF(ENABLE_CUDA) - FIND_PACKAGE(CUDA) + INCLUDE(CheckLanguage) + CHECK_LANGUAGE(CUDA) +# FIND_PACKAGE(CUDA) SET(HAVE_CUDA no) - IF(CUDA_FOUND AND MSVC14 AND CUDA_VERSION VERSION_LESS 8.0) + #IF(CUDA_FOUND AND MSVC14 AND CUDA_VERSION VERSION_LESS 8.0) + IF(CMAKE_CUDA_COMPILER AND MSVC14 AND CUDA_VERSION VERSION_LESS 8.0) SET(HAVE_CUDA "no (VS2015 not supported)") - ELSEIF(CUDA_FOUND) + #ELSEIF(CUDA_FOUND) + ELSEIF(CMAKE_CUDA_COMPILER) SET(LIBFREENECT2_WITH_CUDA_SUPPORT 1) SET(HAVE_CUDA yes) + ENABLE_LANGUAGE(CUDA) + #FIND_PACKAGE(CUDAToolkit) STRING(REPLACE "\\" "/" NVCUDASAMPLES_ROOT "$ENV{NVCUDASAMPLES_ROOT}") STRING(REPLACE "\\" "/" NVCUDASAMPLES8_0_ROOT "$ENV{NVCUDASAMPLES8_0_ROOT}") - CUDA_INCLUDE_DIRECTORIES( + #CUDA_INCLUDE_DIRECTORIES( + INCLUDE_DIRECTORIES( "${MY_DIR}/include/" "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc" "${NVCUDASAMPLES_ROOT}/common/inc" @@ -357,7 +367,7 @@ IF(ENABLE_CUDA) ) SET(CUDA_FLAGS -use_fast_math) IF(NOT MSVC) - SET(CUDA_FLAGS "${CUDA_FLAGS} -Xcompiler -fPIC") + SET(CUDA_FLAGS "${CUDA_FLAGS} -Xcompiler -fPIC -D_FORCE_INLINES") ENDIF() IF(HAVE_CXX11 STREQUAL yes AND CUDA_VERSION VERSION_GREATER 7.0) SET(CUDA_FLAGS "${CUDA_FLAGS} -std=c++11") @@ -365,25 +375,42 @@ IF(ENABLE_CUDA) SET(OLD_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") STRING(REGEX REPLACE "-std=c\\+\\+.." "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - CUDA_COMPILE(CUDA_OBJECTS + # Thrust requires exceptions. If OpenCL from NVidia is used we don't need this flag. + STRING(REGEX REPLACE "-fno-exceptions" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + MESSAGE(STATUS "*************") + MESSAGE(STATUS ${CMAKE_CXX_FLAGS}) + MESSAGE(STATUS ${CUDA_FLAGS}) + MESSAGE(STATUS ${CUDA_INCLUDE_DIRS}) + MESSAGE(STATUS ${CUDA_LIBRARIES}) + MESSAGE(STATUS ${CUDA_OBJECTS}) + MESSAGE(STATUS "*************") + #CUDA_COMPILE(CUDA_OBJECTS + LIST(APPEND SOURCES + #include/libfreenect2/cuda_registration.h src/cuda_depth_packet_processor.cu src/cuda_kde_depth_packet_processor.cu - OPTIONS ${CUDA_FLAGS} + src/cuda_registration.cu + #OPTIONS ${CUDA_FLAGS} ) SET(CMAKE_CXX_FLAGS "${OLD_CMAKE_CXX_FLAGS}") INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) - LIST(APPEND SOURCES - ${CUDA_OBJECTS} - ) + #LIST(APPEND SOURCES + # ${CUDA_OBJECTS} + #) - LIST(APPEND LIBRARIES - ${CUDA_LIBRARIES} - ) + #LIST(APPEND LIBRARIES + # ${CUDA_LIBRARIES} + #) + #set_target_properties(freenect2 PROPERTIES CUDA_ARCHITECTURES "35;50;75") + #SET_PROPERTY(TARGET freenect2 PROPERTY CUDA_ARCHITECTURES OFF) ENDIF() + #ENDIF(CMAKE_CUDA_COMPILER) ENDIF(ENABLE_CUDA) + + # RPATH handling for CUDA 8.0 libOpenCL.so conflict. See #804. IF(HAVE_OpenCL STREQUAL yes AND UNIX AND NOT APPLE) FILE(GLOB CUDA_ld_so_conf /etc/ld.so.conf.d/cuda*.conf) diff --git a/doc/sequence.svg b/doc/sequence.svg new file mode 100644 index 000000000..351c09b09 --- /dev/null +++ b/doc/sequence.svg @@ -0,0 +1,433 @@ + + + + + + + + + + + + + + + + + + int types = 0;libfreenect2::SyncMultiFrameListener listenerlibfreenect2::FrameMap frames; + libfreenect2::Freenect2 freenect2; libfreenect2::Freenect2Device *dev = 0; libfreenect2::PacketPipeline *pipeline = 0; + + + Protonect.cpp + main + libfreenect2::CudaAccessPacketPipeline(deviceId) + + bool protonect_shutdown = false;bool protonect_paused = false;libfreenect2::Freenect2Device *devtopause; + + + + std::string serial = ""; bool viewer_enabled = true; bool enable_rgb = true; bool enable_depth = true; int deviceId = -1; size_t framemax = -1; + + freenect2.getDefaultDeviceSerialNumber(); + + freenect2.openDevice(serial, pipeline); + + + + + libfreenect2::Frame::Color | libfreenect2::Frame::Ir | libfreenect2::Frame::Depth + + ( ) + + dev + setColorFrameListener(&listener) + setIrAndDepthFrameListener(&listener); + + + + + start() + + + diff --git a/examples/Protonect.cpp b/examples/Protonect.cpp index 2fbf4a67b..c34ec8c3a 100644 --- a/examples/Protonect.cpp +++ b/examples/Protonect.cpp @@ -101,9 +101,13 @@ class MyFileLogger: public libfreenect2::Logger * Main application entry point. * * Accepted argumemnts: - * - cpu Perform depth processing with the CPU. - * - gl Perform depth processing with OpenGL. - * - cl Perform depth processing with OpenCL. + * - cpu Perform depth processing with the CPU. + * - gl Perform depth processing with OpenGL. + * - cl Perform depth processing with OpenCL. + * - clkde + * - cuda + * - cudakde + * - cudaccess Does not send data to CPU. * - Serial number of the device to open. * - -noviewer Disable viewer window. */ @@ -113,7 +117,7 @@ int main(int argc, char *argv[]) std::string program_path(argv[0]); std::cerr << "Version: " << LIBFREENECT2_VERSION << std::endl; std::cerr << "Environment variables: LOGFILE=" << std::endl; - std::cerr << "Usage: " << program_path << " [-gpu=] [gl | cl | clkde | cuda | cudakde | cpu] []" << std::endl; + std::cerr << "Usage: " << program_path << " [-gpu=] [gl | cl | clkde | cuda | cudakde | cudaccess | cpu] []" << std::endl; std::cerr << " [-noviewer] [-norgb | -nodepth] [-help] [-version]" << std::endl; std::cerr << " [-frames ]" << std::endl; std::cerr << "To pause and unpause: pkill -USR1 Protonect" << std::endl; @@ -225,6 +229,15 @@ int main(int argc, char *argv[]) pipeline = new libfreenect2::CudaKdePacketPipeline(deviceId); #else std::cout << "CUDA pipeline is not supported!" << std::endl; +#endif + } + else if(arg == "cudaccess") + { +#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT + if(!pipeline) + pipeline = new libfreenect2::CudaAccessPacketPipeline(deviceId); +#else + std::cout << "CUDA pipeline is not supported!" << std::endl; #endif } else if(arg.find_first_not_of("0123456789") == std::string::npos) //check if parameter could be a serial number diff --git a/include/libfreenect2/cuda_registration.h b/include/libfreenect2/cuda_registration.h new file mode 100644 index 000000000..76c995e0d --- /dev/null +++ b/include/libfreenect2/cuda_registration.h @@ -0,0 +1,162 @@ +/* + * This file is part of the OpenKinect Project. http://www.openkinect.org + * + * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file + * for details. + * + * This code is licensed to you under the terms of the Apache License, version + * 2.0, or, at your option, the terms of the GNU General Public License, + * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses, + * or the following URLs: + * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * If you redistribute this file in source form, modified or unmodified, you + * may: + * 1) Leave this header intact and distribute it under the same terms, + * accompanying it with the APACHE20 and GPL20 files, or + * 2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or + * 3) Delete the GPL v2 clause and accompany it with the APACHE20 file + * In all cases you must keep the copyright notice intact and include a copy + * of the CONTRIB file. + * + * Binary distributions must follow the binary distribution requirements of + * either License. + */ + +/** @file cuda_registration.h Class for merging depth and color frames using cuda. */ + +#ifndef CUDA_REGISTRATION_H_ +#define CUDA_REGISTRATION_H_ + +#include +#include +#include +#include + +#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT +#include +#include +#include +#include +#include + +namespace libfreenect2 +{ + +typedef thrust::tuple TupleXYZRGB; + +/** + * Frame whose data is allocated on device. + */ +class LIBFREENECT2_API CudaDeviceFrame: public Frame +{ +public: + /** Construct a new frame. + * @param width Width in pixel + * @param height Height in pixel + * @param bytes_per_pixel Bytes per pixel + */ + CudaDeviceFrame(size_t width, size_t height, size_t bytes_per_pixel); + virtual ~CudaDeviceFrame(); +private: + bool allocateMemory(); +}; + +class CudaRegistrationImpl; + +/** @defgroup registration Registration and Geometry + * Register depth to color, create point clouds. */ + +/** Combine frames of depth and color camera using gpus. @ingroup registration + * Right now this class uses a reverse engineered formula that uses factory + * preset extrinsic parameters the same way the Registration class does. + */ +class LIBFREENECT2_API CudaRegistration +{ +public: + /** + * @param depth_p Depth camera parameters. You can use the factory values, or use your own. + * @param rgb_p Color camera parameters. Probably use the factory values for now. + */ + CudaRegistration(Freenect2Device::IrCameraParams depth_p, Freenect2Device::ColorCameraParams rgb_p); + ~CudaRegistration(); + + /** Undistort and register a single depth point to color camera. + * @param dx Distorted depth coordinate x (pixel) + * @param dy Distorted depth coordinate y (pixel) + * @param dz Depth value (millimeter) + * @param[out] cx Undistorted color coordinate x (normalized) + * @param[out] cy Undistorted color coordinate y (normalized) + */ + void apply(int dx, int dy, float dz, float& cx, float &cy) const; + + /** Map color images onto depth images + * @param rgb Color image (1920x1080 BGRX) + * @param depth Depth image (512x424 float) + * @param[out] undistorted Undistorted depth image + * @param[out] registered Color image for the depth image (512x424) + * @param enable_filter Filter out pixels not visible to both cameras. + * @param[out] bigdepth If not `NULL`, return mapping of depth onto colors (1920x1082 float). **1082** not 1080, with a blank top and bottom row. + * @param[out] color_depth_map Index of mapped color pixel for each depth pixel (512x424). + */ + bool apply(const Frame* rgb, const Frame* depth, CudaDeviceFrame* undistorted, CudaDeviceFrame* registered, const bool enable_filter = true, CudaDeviceFrame* bigdepth = 0, int* color_depth_map = 0) const; + + /** Undistort depth + * @param depth Depth image (512x424 float) + * @param[out] undistorted Undistorted depth image + */ + void undistortDepth(const Frame* depth, Frame* undistorted) const; + + /** Construct a 3-D point with color in a point cloud. + * @param undistorted Undistorted depth frame from apply(). + * @param registered Registered color frame from apply(). + * @param r Row (y) index in depth image. + * @param c Column (x) index in depth image. + * @param[out] x X coordinate of the 3-D point (meter). + * @param[out] y Y coordinate of the 3-D point (meter). + * @param[out] z Z coordinate of the 3-D point (meter). + * @param[out] rgb Color of the 3-D point (BGRX). To unpack the data, use + * + * const uint8_t *p = reinterpret_cast(&rgb); + * uint8_t b = p[0]; + * uint8_t g = p[1]; + * uint8_t r = p[2]; + */ + void getPointXYZRGB (const Frame* undistorted, const Frame* registered, int r, int c, float& x, float& y, float& z, float& rgb) const; + + /** Construct a 3-D point in a point cloud. + * @param undistorted Undistorted depth frame from apply(). + * @param r Row (y) index in depth image. + * @param c Column (x) index in depth image. + * @param[out] x X coordinate of the 3-D point (meter). + * @param[out] y Y coordinate of the 3-D point (meter). + * @param[out] z Z coordinate of the 3-D point (meter). + */ + void getPointXYZ (const Frame* undistorted, int r, int c, float& x, float& y, float& z) const; + + /** + * Construct a point cloud as thrust vector of XYZRGB data as tuples of in device memory, which can be used + * for further processing with CUDA. + * @param undistorted Undistorted depth frame from apply(). + * @param registered Registered color frame from apply(). + * @param[out] cloud_data coordinates of the 3-D point (meter) and color (BGRX). + * To unpack the color data, use + * const uint8_t *p = reinterpret_cast(&rgb); + * uint8_t b = p[0]; + * uint8_t g = p[1]; + * uint8_t r = p[2]; + */ + void getPointXYZRGB(const Frame* undistorted, const Frame* registered, thrust::device_vector& cloud_data) const; + +private: + CudaRegistrationImpl *impl_; + + /* Disable copy and assignment constructors */ + CudaRegistration(const CudaRegistration&); + CudaRegistration& operator=(const CudaRegistration&); +}; +#endif // LIBFREENECT2_WITH_CUDA_SUPPORT + +} /* namespace libfreenect2 */ +#endif /* REGISTRATION_H_ */ diff --git a/src/cuda_registration.cu b/src/cuda_registration.cu new file mode 100644 index 000000000..e0c8872ee --- /dev/null +++ b/src/cuda_registration.cu @@ -0,0 +1,558 @@ +/* + * This file is part of the OpenKinect Project. http://www.openkinect.org + * + * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file + * for details. + * + * This code is licensed to you under the terms of the Apache License, version + * 2.0, or, at your option, the terms of the GNU General Public License, + * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses, + * or the following URLs: + * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * If you redistribute this file in source form, modified or unmodified, you + * may: + * 1) Leave this header intact and distribute it under the same terms, + * accompanying it with the APACHE20 and GPL20 files, or + * 2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or + * 3) Delete the GPL v2 clause and accompany it with the APACHE20 file + * In all cases you must keep the copyright notice intact and include a copy + * of the CONTRIB file. + * + * Binary distributions must follow the binary distribution requirements of + * either License. + */ + +/** @file Implementation of merging depth and color images using cuda. */ + +#include +#include "libfreenect2/logging.h" +#include + +#define MONO_ROWS 424 +#define MONO_COLS 512 +#define COLOR_ROWS 1080 +#define COLOR_COLS 1920 + + +typedef unsigned char uchar; + +#define CHECK_CUDA(expr) do { cudaError_t err = (expr); if (err != cudaSuccess) { LOG_ERROR << #expr ": " << cudaGetErrorString(err); return false; } } while(0) +#define CALL_CUDA(expr) do { cudaError_t err = (expr); if (err != cudaSuccess) { LOG_ERROR << #expr ": " << cudaGetErrorString(err); } } while(0) + +static __device__ +void distort(int mx, int my, float& d_x, float& d_y, const libfreenect2::Freenect2Device::IrCameraParams& d_depth) +{ + float dx = ((float)mx - d_depth.cx) / d_depth.fx; + float dy = ((float)my - d_depth.cy) / d_depth.fy; + float dx2 = dx * dx; + float dy2 = dy * dy; + float r2 = dx2 + dy2; + float dxdy2 = 2 * dx * dy; + float kr = 1 + ((d_depth.k3 * r2 + d_depth.k2) * r2 + d_depth.k1) * r2; + d_x = d_depth.fx * (dx * kr + d_depth.p2 * (r2 + 2 * dx2) + d_depth.p1 * dxdy2) + d_depth.cx; + d_y = d_depth.fy * (dy * kr + d_depth.p1 * (r2 + 2 * dy2) + d_depth.p2 * dxdy2) + d_depth.cy; +} + +static __device__ +void depth_to_color(float mx, float my, float& d_rx, float& d_ry, + const libfreenect2::Freenect2Device::IrCameraParams& d_depth, + const libfreenect2::Freenect2Device::ColorCameraParams& d_color, + const float depth_q, const float color_q) +{ + mx = (mx - d_depth.cx) * depth_q; + my = (my - d_depth.cy) * depth_q; + + float wx = + (mx * mx * mx * d_color.mx_x3y0) + (my * my * my * d_color.mx_x0y3) + + (mx * mx * my * d_color.mx_x2y1) + (my * my * mx * d_color.mx_x1y2) + + (mx * mx * d_color.mx_x2y0) + (my * my * d_color.mx_x0y2) + (mx * my * d_color.mx_x1y1) + + (mx * d_color.mx_x1y0) + (my * d_color.mx_x0y1) + (d_color.mx_x0y0); + + float wy = + (mx * mx * mx * d_color.my_x3y0) + (my * my * my * d_color.my_x0y3) + + (mx * mx * my * d_color.my_x2y1) + (my * my * mx * d_color.my_x1y2) + + (mx * mx * d_color.my_x2y0) + (my * my * d_color.my_x0y2) + (mx * my * d_color.my_x1y1) + + (mx * d_color.my_x1y0) + (my * d_color.my_x0y1) + (d_color.my_x0y0); + + d_rx = (wx / (d_color.fx * color_q)) - (d_color.shift_m / d_color.shift_d); + d_ry = (wy / color_q) + d_color.cy; +} + +static __global__ +void dInitMaps(int* d_map_dist, float* d_map_x, float* d_map_y, int* d_map_yi, + const libfreenect2::Freenect2Device::IrCameraParams d_depth, + const libfreenect2::Freenect2Device::ColorCameraParams d_color, + const float depth_q, const float color_q) +{ + // Configuration copied from cuda_depth_packet_processor.cu + const uint i = blockIdx.x*blockDim.x + threadIdx.x; + + const uint x = i % MONO_COLS; + const uint y = i / MONO_COLS; + + float mx, my; + int ix, iy, index; + float rx, ry; + + // compute the distorted coordinate for current pixel + distort(x, y, mx, my, d_depth); + + // rounding the values and check if the pixel is inside the image + ix = (int)(mx + 0.5f); + iy = (int)(my + 0.5f); + if(ix < 0 || ix >= 512 || iy < 0 || iy >= 424) + index = -1; + else + // computing the index from the coordinates for faster access to the data + index = iy * 512 + ix; + d_map_dist[i] = index; + + // compute the depth to color mapping entries for the current pixel + depth_to_color(x, y, rx, ry, d_depth, d_color, depth_q, color_q); + d_map_x[i] = rx; + d_map_y[i] = ry; + // compute the y offset to minimize later computations + d_map_yi[i] = (int)(ry + 0.5f); +} + +static __global__ +void setFloat(float* devPtr, float value) +{ + // Configuration copied from cuda_depth_packet_processor.cu + const uint i = blockIdx.x * blockDim.x + threadIdx.x; + + devPtr[i] = value; +} + +/** + * Set all values of array of floats devPtr to value. + * This function does not call for synchronization. + * @param devPtr pointer to memory in device + * @param value value to set + * @param size number of float sized elements in array + */ +void cudaMemsetFloat(float* devPtr, float value, size_t size) +{ + size_t numThreads = 512; + size_t numBlocks = size / numThreads; + setFloat<<>>(devPtr, value); +} + +/** + * Compares value at address with val, if val is smaller it + * saves it at address. + */ +__device__ float atomicKeepSmaller(float* address, float val) +{ + // Implementation addapted from http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions + int* address_as_ull = (int*)address; + int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(val < __int_as_float(assumed) ? val : __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} + + +static __global__ +void undistort(int* d_depth_to_c_off, + float* d_undistorted_data, + float* d_filter_map, + const float* d_depth_data, const int* d_map_dist, + const float* d_map_x, const int* d_map_yi, + const libfreenect2::Freenect2Device::IrCameraParams depth, + const libfreenect2::Freenect2Device::ColorCameraParams color, + const int filter_width_half, + const int filter_height_half, + const int offset_filter_map, + const bool enable_filter) +{ + // getting index of distorted depth pixel + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int index = d_map_dist[i]; + + // check if distorted depth pixel is outside of the depth image + if(index < 0){ + d_depth_to_c_off[i] = -1; + d_undistorted_data[i] = 0; + return; + } + + // getting depth value for current pixel + const float z = d_depth_data[index]; + d_undistorted_data[i] = z; + + // checking for invalid depth value + if(z <= 0.0f){ + d_depth_to_c_off[i] = -1; + return; + } + + // calculating x offset for rgb image based on depth value + const float color_cx = color.cx + 0.5f; // 0.5f added for later rounding + const float rx = (d_map_x[index] + (color.shift_m / z)) * color.fx + color_cx; + const int cx = rx; // same as round for positive numbers (0.5f was already added to color_cx) + // getting y offset for depth image + const int cy = d_map_yi[i]; + // combining offsets + const int c_off = cx + cy * COLOR_COLS; + + // check if c_off is outside of rgb image + // checking rx/cx is not needed because the color image is much wider then the depth image + if(c_off < 0 || c_off >= COLOR_ROWS * COLOR_COLS){ + d_depth_to_c_off[i] = -1; + return; + } + + // saving the offset for later + d_depth_to_c_off[i] = c_off; + + // I am not sure if there won't be race conditions here due to overlap, the atomic operation should help. + if(enable_filter){ + // setting a window around the filter map pixel corresponding to the color pixel with the current z value + int yi = (cy - filter_height_half) * 1920 + cx - filter_width_half; // index of first pixel to set + for(int r = -filter_height_half; r <= filter_height_half; ++r, yi += COLOR_COLS) // index increased by a full row each iteration + { + float *it = d_filter_map + offset_filter_map + yi; + for(int c = -filter_width_half; c <= filter_width_half; ++c, ++it) + { + // only set if the current z is smaller + atomicKeepSmaller(it, z); + } + } + } +} + +/** Construct 'registered' image with filter. + * Filter drops duplicate pixels due to aspect of two cameras. + */ +static __global__ +void registerImageFiltered(unsigned int *d_registered_data, + const unsigned int * d_rgb_data, + const int* d_depth_to_c_off, + const float* d_undistorted_data, + const float *d_p_filter_map, + const float filter_tolerance) +{ + // getting index of distorted depth pixel + const int i = blockIdx.x * blockDim.x + threadIdx.x; + + // run through all registered color pixels and set them based on filter results + const int c_off = d_depth_to_c_off[i]; + + // check if offset is out of image + if(c_off < 0){ + d_registered_data[i] = 0; + return; + } + + const float min_z = d_p_filter_map[c_off]; + const float z = d_undistorted_data[i]; + + // check for allowed depth noise + d_registered_data[i] = (z - min_z) / z > filter_tolerance ? 0 : d_rgb_data[c_off]; + +} + +/** Construct 'registered' image. */ +static __global__ +void registerImage(unsigned int *d_registered_data, + const unsigned int * d_rgb_data, + const int* d_depth_to_c_off) +{ + // getting index of distorted depth pixel + const int i = blockIdx.x * blockDim.x + threadIdx.x; + + // run through all registered color pixels and set them based on c_off + const int c_off = d_depth_to_c_off[i]; + + // check if offset is out of image + d_registered_data[i] = c_off < 0 ? 0 : d_rgb_data[c_off]; +} + +namespace libfreenect2 +{ + +CudaDeviceFrame::CudaDeviceFrame(size_t width, size_t height, size_t bytes_per_pixel): + Frame(width, height, bytes_per_pixel, (unsigned char*)-1) +{ + allocateMemory(); +} + +CudaDeviceFrame::~CudaDeviceFrame() +{ + CALL_CUDA(cudaFree(data)); +} + +bool CudaDeviceFrame::allocateMemory() +{ + CHECK_CUDA(cudaMalloc(&data, width * height * bytes_per_pixel * sizeof(unsigned char))); + + cudaDeviceSynchronize(); + + CHECK_CUDA(cudaGetLastError()); + return true; +} + +/* + * The information used here has been taken from libfreenect2::Registration source + * code. + */ +static const float depth_q = 0.01; +static const float color_q = 0.002199; + +class CudaRegistrationImpl +{ +public: + CudaRegistrationImpl(Freenect2Device::IrCameraParams depth_p, Freenect2Device::ColorCameraParams rgb_p): + depth(depth_p), color(rgb_p), + filter_width_half(2), filter_height_half(1), filter_tolerance(0.01f), + block_size(128), grid_size(MONO_IMAGE_SIZE/block_size) + { + good = setupDevice(); + if (!good) + return; + + good = initMaps(); + if (!good) + return; + } + + ~CudaRegistrationImpl() + { + if (good) + freeDeviceMemory(); + } + + void apply(int dx, int dy, float dz, float& cx, float &cy) const; + bool apply(const Frame* rgb, const Frame* depth, CudaDeviceFrame* undistorted, CudaDeviceFrame* registered, const bool enable_filter, CudaDeviceFrame* bigdepth, int* color_depth_map) const; + void undistortDepth(const Frame *depth, Frame *undistorted) const; + void getPointXYZRGB (const Frame* undistorted, const Frame* registered, int r, int c, float& x, float& y, float& z, float& rgb) const; + void getPointXYZ (const Frame* undistorted, int r, int c, float& x, float& y, float& z) const; + void distort(int mx, int my, float& dx, float& dy) const; + void depth_to_color(float mx, float my, float& rx, float& ry) const; + +private: + Freenect2Device::IrCameraParams depth; ///< Depth camera parameters. + Freenect2Device::ColorCameraParams color; ///< Color camera parameters. + + const int filter_width_half; + const int filter_height_half; + const float filter_tolerance; + + static const size_t MONO_IMAGE_SIZE = MONO_COLS * MONO_ROWS; + + size_t block_size; + size_t grid_size; + + bool good; // Memory correctly allocated + + // Maps + int* d_distort_map; + float* d_depth_to_color_map_x; + float* d_depth_to_color_map_y; + int* d_depth_to_color_map_yi; + + bool allocateDeviceMemory() + { + CHECK_CUDA(cudaMalloc(&d_distort_map, MONO_IMAGE_SIZE * sizeof(int))); + CHECK_CUDA(cudaMalloc(&d_depth_to_color_map_x, MONO_IMAGE_SIZE * sizeof(float))); + CHECK_CUDA(cudaMalloc(&d_depth_to_color_map_y, MONO_IMAGE_SIZE * sizeof(float))); + CHECK_CUDA(cudaMalloc(&d_depth_to_color_map_yi, MONO_IMAGE_SIZE * sizeof(int))); + + cudaDeviceSynchronize(); + + CHECK_CUDA(cudaGetLastError()); + return true; + } + + bool setupDevice() + { + // Continue to use same device than cuda_depth_packet_processor? + if (!allocateDeviceMemory()) + return false; + + return true; + } + + bool initMaps() + { + dInitMaps<<>>(d_distort_map, d_depth_to_color_map_x, + d_depth_to_color_map_y, d_depth_to_color_map_yi, + depth, color, depth_q, color_q); + + cudaDeviceSynchronize(); + CHECK_CUDA(cudaGetLastError()); + + return true; + } + + void freeDeviceMemory() + { + CALL_CUDA(cudaFree(d_distort_map)); + CALL_CUDA(cudaFree(d_depth_to_color_map_x)); + CALL_CUDA(cudaFree(d_depth_to_color_map_y)); + CALL_CUDA(cudaFree(d_depth_to_color_map_yi)); + } +}; + +CudaRegistration::CudaRegistration(Freenect2Device::IrCameraParams depth_p, Freenect2Device::ColorCameraParams rgb_p): + impl_(new CudaRegistrationImpl(depth_p, rgb_p)) {} + +CudaRegistration::~CudaRegistration() +{ + delete impl_; +} + +bool CudaRegistration::apply(const Frame* rgb, const Frame* depth, CudaDeviceFrame* undistorted, CudaDeviceFrame* registered, const bool enable_filter, CudaDeviceFrame* bigdepth, int* color_depth_map) const +{ + return impl_->apply(rgb, depth, undistorted, registered, enable_filter, bigdepth, color_depth_map); +} + +bool CudaRegistrationImpl::apply(const Frame *rgb, const Frame *depth, CudaDeviceFrame *undistorted, CudaDeviceFrame *registered, const bool enable_filter, CudaDeviceFrame *bigdepth, int *color_depth_map) const +{ + // Check if all frames are valid and have the correct size + if (!rgb || !depth || !undistorted || !registered || + rgb->width != 1920 || rgb->height != 1080 || rgb->bytes_per_pixel != 4 || + depth->width != 512 || depth->height != 424 || depth->bytes_per_pixel != 4 || + undistorted->width != 512 || undistorted->height != 424 || undistorted->bytes_per_pixel != 4 || + registered->width != 512 || registered->height != 424 || registered->bytes_per_pixel != 4) + { + LOG_ERROR << "Not applying" << std::endl; + return false; + } + + // Setup memory + + float *d_depth_data; + size_t depth_size = depth->width * depth->height * sizeof(float); + unsigned int *d_rgb_data; + size_t rgb_size = rgb->width * rgb->height * sizeof(unsigned int); + + CHECK_CUDA(cudaMalloc(&d_depth_data, depth_size)); + cudaMemcpy((void*)d_depth_data, + (const void*)depth->data, depth_size, + cudaMemcpyHostToDevice); + + CHECK_CUDA(cudaMalloc(&d_rgb_data, rgb_size)); + cudaMemcpy((void*)d_rgb_data, + (const void*)rgb->data, rgb_size, + cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); + CHECK_CUDA(cudaGetLastError()); + + float *d_undistorted_data = (float*)undistorted->data; + unsigned int *d_registered_data = (unsigned int*)registered->data; + const int *d_map_dist = d_distort_map; + const float *d_map_x = d_depth_to_color_map_x; + const int *d_map_yi = d_depth_to_color_map_yi; + + + // Setup parameters + + const int size_depth = MONO_ROWS * MONO_COLS; + const int size_color = COLOR_ROWS * COLOR_COLS; + //const float color_cx = color.cx + 0.5f; // 0.5f added for later rounding + + // size of filter map with a border of filter_height_half on top and bottom so that no check for borders is needed. + // since the color image is wide angle no border to the sides is needed. + const int size_filter_map = size_color + COLOR_COLS * filter_height_half * 2; + // offset to the important data + const int offset_filter_map = COLOR_COLS * filter_height_half; + + + // Auxiliary maps + + // map for storing the min z values used for each color pixel + float *d_filter_map = NULL; + // pointer to the beginning of the important data + float *d_p_filter_map = NULL; + + // map for storing the color offset for each depth pixel + int *d_depth_to_c_off; + CHECK_CUDA(cudaMalloc(&d_depth_to_c_off, size_depth * sizeof(int))); + if (color_depth_map) + { + // I don't know where this other color map could be coming from, + // so for the moment I will assume it is in host memory. + cudaMemcpy((void*)d_depth_to_c_off, + (const void*)color_depth_map, size_depth * sizeof(int), + cudaMemcpyHostToDevice); + } + //int *map_c_off = depth_to_c_off; + + // initializing the depth_map with values outside of the Kinect2 range + if(enable_filter){ + if(bigdepth) + { + d_filter_map = (float*)bigdepth->data; + } + else + { + CHECK_CUDA(cudaMalloc(&d_filter_map, size_filter_map * sizeof(float))); + } + d_p_filter_map = d_filter_map + offset_filter_map; // works the same even on device + + cudaMemsetFloat(d_filter_map, std::numeric_limits::infinity(), size_filter_map); + } + + /* Fix depth distortion, and compute pixel to use from 'rgb' based on depth measurement, + * stored as x/y offset in the rgb data. + */ + undistort<<>>(d_depth_to_c_off, + d_undistorted_data, d_filter_map, + d_depth_data, d_map_dist, + d_map_x, d_map_yi, + this->depth, this->color, filter_width_half, filter_height_half, offset_filter_map, enable_filter); + if (enable_filter) + { + registerImageFiltered<<>>(d_registered_data, + d_rgb_data, + d_depth_to_c_off, + d_undistorted_data, + d_p_filter_map, + filter_tolerance); + if (!bigdepth) + { + CALL_CUDA(cudaFree(d_filter_map)); + } + } + else + { + registerImage<<>>(d_registered_data, + d_rgb_data, + d_depth_to_c_off); + } + + // Finish + + // -1 represents Invalid + //undistorted->format = undistorted->Float; + //registered->format = registered->BGRX; + + + if (color_depth_map) + { + // I don't know where this other color map could be coming from, + // so for the moment I will assume it is in host memory. + // Placing it back to where it came from + cudaMemcpy((void*)color_depth_map, + (const void*)d_depth_to_c_off, size_depth * sizeof(int), + cudaMemcpyDeviceToHost); + } + CALL_CUDA(cudaFree(d_depth_to_c_off)); + + CALL_CUDA(cudaFree(d_depth_data)); + CALL_CUDA(cudaFree(d_rgb_data)); + + return true; +} + +} /* namespace libfreenect2 */ diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index d8db14d3c..c7f2a6953 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -251,7 +251,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging newIrFrame(); newDepthFrame(); - const int CL_ICDL_VERSION = 2; + //const int CL_ICDL_VERSION = 2; // Commented out because it is defined somewhere else typedef cl_int (*icdloader_func)(int, size_t, void*, size_t*); #ifdef _MSC_VER #pragma warning(push) diff --git a/src/opencl_kde_depth_packet_processor.cpp b/src/opencl_kde_depth_packet_processor.cpp index 98dbdff54..97765f415 100644 --- a/src/opencl_kde_depth_packet_processor.cpp +++ b/src/opencl_kde_depth_packet_processor.cpp @@ -259,7 +259,7 @@ class OpenCLKdeDepthPacketProcessorImpl: public WithPerfLogging newIrFrame(); newDepthFrame(); - const int CL_ICDL_VERSION = 2; + //const int cl_icdl_version = 2; // CL_ICDL_VERSION is defined somewhere else typedef cl_int (*icdloader_func)(int, size_t, void*, size_t*); #ifdef _MSC_VER #pragma warning(push) @@ -277,6 +277,7 @@ class OpenCLKdeDepthPacketProcessorImpl: public WithPerfLogging if (clGetICDLoaderInfoOCLICD != NULL) { char buf[16]; + //if (clGetICDLoaderInfoOCLICD(cl_icdl_version, sizeof(buf), buf, NULL) == CL_SUCCESS) if (clGetICDLoaderInfoOCLICD(CL_ICDL_VERSION, sizeof(buf), buf, NULL) == CL_SUCCESS) { if (strcmp(buf, "2.2.4") < 0) diff --git a/src/registration.cpp b/src/registration.cpp index 49a3b03e0..b4a9ddd07 100644 --- a/src/registration.cpp +++ b/src/registration.cpp @@ -380,7 +380,7 @@ RegistrationImpl::RegistrationImpl(Freenect2Device::IrCameraParams depth_p, Free for (int y = 0; y < 424; y++) { for (int x = 0; x < 512; x++) { - // compute the dirstored coordinate for current pixel + // compute the distorted coordinate for current pixel distort(x,y,mx,my); // rounding the values and check if the pixel is inside the image ix = (int)(mx + 0.5f); @@ -388,7 +388,7 @@ RegistrationImpl::RegistrationImpl(Freenect2Device::IrCameraParams depth_p, Free if(ix < 0 || ix >= 512 || iy < 0 || iy >= 424) index = -1; else - // computing the index from the coordianted for faster access to the data + // computing the index from the coordinates for faster access to the data index = iy * 512 + ix; *map_dist++ = index;