diff --git a/.gitignore b/.gitignore
index 278c78091b..69f928be3c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,7 @@
 *.pyc
 *.o
+*.a
 *.exe
+*.gch
 build/
+build-*/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000..0fdd3ab4f8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "blt"]
+	path = blt
+	url = https://github.com/LLNL/blt.git
diff --git a/.travis.yml b/.travis.yml
index b91fda4cbe..2902989e6f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,60 +13,64 @@ matrix:
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-4.9, libtbb-dev ] } }
     env:
     - COMPILER=g++-4.9
-    - CMAKE_EXTRA_FLAGS="-DRAJA_ENABLE_WARNINGS=On"
+    - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On -DENABLE_TBB=On"
   - compiler: gcc-6
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } }
     env:
     - COMPILER=g++-6
-    - CMAKE_EXTRA_FLAGS="-DRAJA_ENABLE_WARNINGS=On"
+    - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On -DENABLE_TBB=On"
   - compiler: gcc-7
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-7, libtbb-dev ] } }
     env:
     - COMPILER=g++-7
-    - CMAKE_EXTRA_FLAGS="-DRAJA_ENABLE_WARNINGS=On"
+    - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On -DENABLE_TBB=On"
   - compiler: clang-5
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } }
     env:
     - COMPILER=clang++-5.0.0
     - LLVM_VERSION=5.0.0
     - DOWNLOAD_URL=http://releases.llvm.org/5.0.0/clang+llvm-5.0.0-linux-x86_64-ubuntu14.04.tar.xz
+    - CMAKE_EXTRA_FLAGS="-DCMAKE_CXX_FLAGS=-fmodules -DENABLE_TBB=On"
   - compiler: clang-3.9
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } }
     env:
     - COMPILER=clang++-3.9.1
     - LLVM_VERSION=3.9.1
+    - CMAKE_EXTRA_FLAGS="-DENABLE_TBB=On"
   - compiler: clang-4.0
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } }
     env:
     - COMPILER=clang++-4.0.0
     - LLVM_VERSION=4.0.0
+    - CMAKE_EXTRA_FLAGS="-DENABLE_TBB=On"
   - compiler: intel-17
     env:
     - COMPILER=icpc
     - TRAVIS_INSTALL_COMPILER="intel"
+    - CMAKE_EXTRA_FLAGS="-DENABLE_TBB=On"
   - compiler: nvcc
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-4.9, libtbb-dev ] } }
     env:
     - COMPILER=g++-4.9
-    - CMAKE_EXTRA_FLAGS="-DRAJA_ENABLE_CUDA=On"
+    - CMAKE_EXTRA_FLAGS="-DENABLE_CUDA=On -DENABLE_TBB=On"
     - TRAVIS_INSTALL_COMPILER="nvcc"
     - DO_TEST=no
   - compiler: gcc-4.9-debug
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-4.9, libtbb-dev ] } }
     env:
     - COMPILER=g++-4.9
-    - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug  -DRAJA_ENABLE_COVERAGE=On"
+    - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug  -DENABLE_COVERAGE=On -DENABLE_TBB=On"
   - compiler: clang-3.9-debug
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } }
     env:
     - COMPILER=clang++
     - LLVM_VERSION=3.9.1
-    - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug"
+    - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DENABLE_TBB=On"
   - compiler: nvcc-debug
     addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-4.9, libtbb-dev ] } }
     env:
     - COMPILER=g++-4.9
-    - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DRAJA_ENABLE_CUDA=On"
+    - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DENABLE_CUDA=On -DENABLE_TBB=On"
     - TRAVIS_INSTALL_COMPILER="nvcc"
     - DO_TEST=no
 cache:
@@ -97,5 +101,5 @@ before_install:
 script:
 - ./scripts/travis_build_and_test.sh
 after_success:
-- if [[ "${CMAKE_EXTRA_FLAGS}" == *"RAJA_ENABLE_COVERAGE"* ]] ; then bash <(curl -s https://codecov.io/bash) -a "-f"; fi
+- if [[ "${CMAKE_EXTRA_FLAGS}" == *"ENABLE_COVERAGE"* ]] ; then bash <(curl -s https://codecov.io/bash) -a "-f" >& /dev/null; fi
 - if [[ "${TRAVIS_INSTALL_COMPILER}" == "intel" ]] ; then uninstall_intel_software ; fi
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fae38936a..cbd0ef79a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -9,34 +9,7 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
@@ -47,30 +20,44 @@ project(RAJA LANGUAGES CXX C)
 
 # Set version number
 set(RAJA_VERSION_MAJOR 0)
-set(RAJA_VERSION_MINOR 3)
-set(RAJA_VERSION_PATCHLEVEL 1)
+set(RAJA_VERSION_MINOR 4)
+set(RAJA_VERSION_PATCHLEVEL 0)
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PATH})
 
 # Build options
+set(ENABLE_OPENMP On CACHE BOOL "Build OpenMP support")  # cache type must be spelled BOOL; other spellings fall back to STRING
+set(ENABLE_CUDA Off CACHE BOOL "Build CUDA support")
+set(ENABLE_COPY_HEADERS Off CACHE BOOL "")  # NOTE(review): no help text given; presumably consumed by BLT -- confirm
+set(ENABLE_WARNINGS_AS_ERRORS Off CACHE BOOL "")  # NOTE(review): no help text given; presumably consumed by BLT -- confirm
+
 set(RAJA_CXX_STANDARD_FLAG "default" CACHE STRING "Specific c++ standard flag to use, default attempts to autodetect the highest available")
-option(RAJA_ENABLE_OPENMP "Build OpenMP support" On)
-option(RAJA_ENABLE_TBB "Build TBB support" On)
-option(RAJA_ENABLE_TARGET_OPENMP "Build OpenMP on target device support" Off)
-option(RAJA_ENABLE_CUDA "Build CUDA support" Off)
-option(RAJA_ENABLE_CLANG_CUDA "Use Clang's native CUDA support" Off)
-set(RAJA_CUDA_ARCH "sm_35" CACHE STRING "Compute architecture to pass to CUDA builds")
-option(RAJA_ENABLE_CUB "Use cub for scans using CUDA" On)
-option(RAJA_ENABLE_TESTS "Build tests" On)
-option(RAJA_ENABLE_EXAMPLES "Build simple examples" On)
-option(RAJA_ENABLE_NESTED "Enable nested loop support" Off)
-option(RAJA_ENABLE_WARNINGS "Enable warnings as errors for CI" Off)
-option(RAJA_ENABLE_DOCUMENTATION "Build RAJA documentation" Off)
-
-option(RAJA_ENABLE_COVERAGE "Enable coverage (only supported with GCC)" Off)
+
+option(ENABLE_TBB "Build TBB support" Off)
+option(ENABLE_TARGET_OPENMP "Build OpenMP on target device support" Off)
+option(ENABLE_CLANG_CUDA "Use Clang's native CUDA support" Off)
+set(CUDA_ARCH "sm_35" CACHE STRING "Compute architecture to pass to CUDA builds")
+option(ENABLE_CUB "Use cub for scans using CUDA" On)
+option(ENABLE_TESTS "Build tests" On)
+option(ENABLE_EXAMPLES "Build simple examples" On)
+option(ENABLE_MODULES "Enable modules in supporting compilers (clang)" On)
+option(ENABLE_WARNINGS "Enable warnings as errors for CI" Off)
+option(ENABLE_DOCUMENTATION "Build RAJA documentation" Off)
+option(ENABLE_COVERAGE "Enable coverage (only supported with GCC)" Off)
 
 set(TEST_DRIVER "" CACHE STRING "driver used to wrap test commands")
 
+if (NOT BLT_LOADED AND NOT EXISTS ${PROJECT_SOURCE_DIR}/blt/SetupBLT.cmake)
+  message(FATAL_ERROR "\
+  The BLT submodule is not present. \
+  If in a git repo run the following command:\n\
+  git submodule init && git submodule update")
+endif()
+# Skip the include when BLT_LOADED is already set (presumably by a parent project that loaded BLT -- confirm).
+if (NOT BLT_LOADED)
+  include(blt/SetupBLT.cmake)
+endif()
+
 # Setup basic CMake options
 include(cmake/SetupBasics.cmake)
 # Find third-party packages
@@ -81,44 +68,83 @@ include(cmake/SetupCompilers.cmake)
 include(cmake/SetupRajaConfig.cmake)
 # Macros for building executables and libraries
 include (cmake/RAJAMacros.cmake)
-# Sanity check for compiler compatibility
-include (cmake/CompilerCompatibility.cmake)
 
-include_directories(${PROJECT_BINARY_DIR}/include/RAJA)
-include_directories(${PROJECT_BINARY_DIR}/include)
+set (raja_sources
+  src/AlignedRangeIndexSetBuilders.cpp
+  src/DepGraphNode.cpp
+  src/LockFreeIndexSetBuilders.cpp
+  src/MemUtils_CUDA.cpp
+  src/ThreadUtils_CPU.cpp)
+
+set (raja_depends)
 
-include_directories(include)
+# Assemble the DEPENDS_ON list handed to blt_add_library() below: one entry
+# per enabled feature, always in the order openmp, cuda, cub, chai, tbb.
+
+# raja_depends was reset just above, so a plain set() is enough for the
+# first possible entry. ENABLE_OPENMP defaults to On in the build options
+# earlier in this file.
+if (ENABLE_OPENMP)
+  set (raja_depends openmp)
+endif ()
+
+# CUDA back-end. cub is only added when CUDA itself is enabled, hence the
+# nested check.
+if (ENABLE_CUDA)
+  list (APPEND raja_depends cuda)
+  if (ENABLE_CUB)
+    list (APPEND raja_depends cub)
+  endif ()
+endif ()
+
+# NOTE(review): ENABLE_CHAI is not among the options declared in the visible
+# part of this file; presumably set by the user or a parent project -- confirm.
+if (ENABLE_CHAI)
+  list (APPEND raja_depends chai)
+endif ()
+
+# TBB back-end; ENABLE_TBB is declared with a default of Off in the build
+# options earlier in this file.
+if (ENABLE_TBB)
+  list (APPEND raja_depends tbb)
+endif ()
+
+blt_add_library(  # BLT macro; defines the RAJA target used by the install() and target_include_directories() calls below
+  NAME RAJA
+  SOURCES ${raja_sources}
+  DEPENDS_ON ${raja_depends})
+
+install(TARGETS RAJA
+  EXPORT RAJA  # associate the installed target with the export set named "RAJA"
+  ARCHIVE DESTINATION lib
+  LIBRARY DESTINATION lib
+  RUNTIME DESTINATION lib  # NOTE(review): runtime artifacts are also placed in lib rather than bin -- confirm intended
+)
+
+install(EXPORT RAJA DESTINATION share/raja/cmake/)  # install the file describing the "RAJA" export set
+
+target_include_directories(RAJA
+  PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>  # headers from the source tree, used when building in this tree
+  $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>  # build-tree include dir; RAJA/config.hpp is installed from here below
+  $<INSTALL_INTERFACE:include>)
 
 install(DIRECTORY include/ DESTINATION include FILES_MATCHING PATTERN *.hpp)
-install(FILES ${PROJECT_BINARY_DIR}/include/RAJA/config.hpp
-        DESTINATION "include/RAJA")
 
-add_subdirectory(src)
+install(FILES
+  ${PROJECT_BINARY_DIR}/include/RAJA/config.hpp
+  include/RAJA/module.modulemap
+  include/RAJA/module.private.modulemap
+  DESTINATION "include/RAJA/")
 
-if(RAJA_ENABLE_TESTS)
+if(ENABLE_TESTS)
   add_subdirectory(test)
 endif()
 
-if(RAJA_ENABLE_EXAMPLES)
+if(ENABLE_EXAMPLES)
   add_subdirectory(examples)
 endif()
 
-if (RAJA_ENABLE_DOCUMENTATION)
+if (ENABLE_DOCUMENTATION)
   add_subdirectory(docs)
 endif ()
-
-if(RAJA_ENABLE_APPLICATIONS)
-  if (NOT EXISTS ${PROJECT_SOURCE_DIR}/extra/llnl-raja-proxies/CMakeLists.txt)
-    message(STATUS "Cloning RAJA proxy applications...")
-    execute_process(COMMAND git clone https://github.com/LLNL/RAJA-examples.git ${PROJECT_SOURCE_DIR}/extra/llnl-raja-proxies)
-  endif()
-  add_subdirectory(extra/llnl-raja-proxies)
-endif()
-
-if(RAJA_ENABLE_PERFSUITE)
-  if (NOT EXISTS ${PROJECT_SOURCE_DIR}/extra/performance/CMakeLists.txt)
-    message(STATUS "Cannot find performance suite")
-    message(STATUS "Clone it to ./extra/performance")
-  endif()
-  add_subdirectory(extra/performance)
-endif()
diff --git a/Dockerfile b/Dockerfile
index a3a9702514..a1f93dcdb9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,6 +8,6 @@ RUN cd /opt/ && git clone https://github.com/LLNL/RAJA.git
 
 WORKDIR /opt/RAJA
 
-RUN mkdir build && cd build && cmake -DRAJA_ENABLE_CUDA=ON ..
+RUN mkdir build && cd build && cmake -DENABLE_CUDA=ON ..
 
 RUN cd build && make -j && make install
diff --git a/LICENSE b/LICENSE
index 41872267e9..40def96a55 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,32 +1,5 @@
-*******************************************************************************
-
-RAJA: ................................, version 0.3.1
-
-Copyright (c) 2016, Lawrence Livermore National Security, LLC. 
-Produced at the Lawrence Livermore National Laboratory.
-All rights reserved. See details below.
-
-Unlimited Open Source - BSD Distribution
-LLNL-CODE-689114
-OCEC-16-063
-
-The original developers of RAJA are:
-
-Rich Hornung (hornung1@llnl.gov)
-Jeff Keasler (keasler1@llnl.gov)
-
-Contributors include:
-
-David Beckingsale (beckingsale1@llnl.gov)
-Jason Burmark (burmark1@llnl.gov)
-Holger Jones (jones19@llnl.gov)
-Will Killian (killian4@llnl.gov)
-Adam Kunen (kunen1@llnl.gov)
-Olga Pearce (pearce8@llnl.gov)
-David Poliakoff (poliakoff1@llnl.gov)
-Tom Scogland (scogland1@llnl.gov)
-
-*******************************************************************************
+Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+All rights reserved.
 
 Redistribution and use in source and binary forms, with or without 
 modification, are permitted provided that the following conditions are met:
@@ -53,25 +26,3 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 
 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S. 
-Department of Energy (DOE). This work was produced at Lawrence Livermore 
-National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National 
-Security, LLC nor any of their employees, makes any warranty, express or 
-implied, or assumes any liability or responsibility for the accuracy, 
-completeness, or usefulness of any information, apparatus, product, or 
-process disclosed, or represents that its use would not infringe 
-privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, 
-or services by trade name, trademark, manufacturer or otherwise does not 
-necessarily constitute or imply its endorsement, recommendation, or favoring 
-by the United States Government or Lawrence Livermore National Security, LLC. 
-The views and opinions of authors expressed herein do not necessarily state 
-or reflect those of the United States Government or Lawrence Livermore 
-National Security, LLC, and shall not be used for advertising or product 
-endorsement purposes.
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000000..8aea31b91a
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,21 @@
+Additional BSD Notice
+
+1. This notice is required to be provided under our contract with the U.S. 
+Department of Energy (DOE). This work was produced at Lawrence Livermore 
+National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
+
+2. Neither the United States Government nor Lawrence Livermore National 
+Security, LLC nor any of their employees, makes any warranty, express or 
+implied, or assumes any liability or responsibility for the accuracy, 
+completeness, or usefulness of any information, apparatus, product, or 
+process disclosed, or represents that its use would not infringe 
+privately-owned rights.
+
+3. Also, reference herein to any specific commercial products, process, 
+or services by trade name, trademark, manufacturer or otherwise does not 
+necessarily constitute or imply its endorsement, recommendation, or favoring 
+by the United States Government or Lawrence Livermore National Security, LLC. 
+The views and opinions of authors expressed herein do not necessarily state 
+or reflect those of the United States Government or Lawrence Livermore 
+National Security, LLC, and shall not be used for advertising or product 
+endorsement purposes.
diff --git a/README.md b/README.md
index 0fd546a8c6..45eab7e860 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-RAJA v0.3.1
+RAJA v0.4.0
 ============
 
 [![Build Status](https://travis-ci.org/LLNL/RAJA.svg?branch=develop)](https://travis-ci.org/LLNL/RAJA)
@@ -42,7 +42,7 @@ Quick Start
 The RAJA code lives in a GitHub [repository](https://github.com/llnl/raja).
 To clone the repo, use the command:
 
-    git clone https://github.com/llnl/raja.git
+    git clone --recursive https://github.com/llnl/raja.git
 
 Then, you can build RAJA like any other CMake project, provided you have a C++
 compiler that supports the C++11 standard. The simplest way to build the code 
@@ -62,7 +62,7 @@ Example Applications
 
 The [RAJA-examples](https://github.com/LLNL/RAJA-examples) repository contains three proxy applications that use the RAJA
 programming model. These applications can be built along with the rest of the
-RAJA framework by setting `-DRAJA_ENABLE_APPLICATIONS=On` when running CMake.
+RAJA framework by setting `-DENABLE_APPLICATIONS=On` when running CMake.
 
 When this option is passed to CMake, the RAJA-examples repository is cloned using `git` to the directory `extra/llnl-raja-proxies` in the project root. The example applications will be built using the same configuration that the RAJA library uses.
 
@@ -110,31 +110,24 @@ The original developers of RAJA are:
   * Rich Hornung (hornung1@llnl.gov)
   * Jeff Keasler (keasler1@llnl.gov)
 
-Contributors include:
-
-  * David Beckingsale (beckingsale1@llnl.gov)
-  * Jason Burmark (burmark1@llnl.gov)
-  * Holger Jones (jones19@llnl.gov)
-  * Will Killian (killian4@llnl.gov)
-  * Adam Kunen (kunen1@llnl.gov)
-  * Olga Pearce (pearce8@llnl.gov)
-  * David Poliakoff (poliakoff1@llnl.gov)
-  * Tom Scogland (scogland1@llnl.gov)
+Please see the [RAJA Contributors Page](https://github.com/LLNL/RAJA/graphs/contributors) for the full list of contributors to the project.
 
 
 Release
 -----------
 
-Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+Copyright (c) 2016-2017, Lawrence Livermore National Security, LLC.
 
 Produced at the Lawrence Livermore National Laboratory.
 
 All rights reserved.
 
-Unlimited Open Source - BSD Distribution
+`LLNL-CODE-689114`  `OCEC-16-063`
 
-For release details and restrictions, please read the LICENSE.txt file.
-It is also linked here:
-- [LICENSE](./LICENSE.txt)
+Unlimited Open Source - BSD Distribution
 
-`LLNL-CODE-689114`  `OCEC-16-063`
+For release details and restrictions, please read the RELEASE, LICENSE,
+and NOTICE files, also linked here:
+- [RELEASE](./RELEASE)
+- [LICENSE](./LICENSE)
+- [NOTICE](./NOTICE)
diff --git a/RELEASE b/RELEASE
new file mode 100644
index 0000000000..461ec52447
--- /dev/null
+++ b/RELEASE
@@ -0,0 +1,30 @@
+*******************************************************************************
+
+RAJA: ................................, version 0.4.0
+
+Copyright (c) 2016-17, Lawrence Livermore National Security, LLC. 
+Produced at the Lawrence Livermore National Laboratory.
+All rights reserved. See details in RAJA/LICENSE and RAJA/NOTICE files.
+
+Unlimited Open Source - BSD Distribution
+LLNL-CODE-689114
+OCEC-16-063
+
+The original developers of RAJA are:
+
+Rich Hornung (hornung1@llnl.gov)
+Jeff Keasler (keasler1@llnl.gov)
+
+Contributors include:
+
+David Beckingsale (beckingsale1@llnl.gov)
+Jason Burmark (burmark1@llnl.gov)
+Matt Cordery (cordery1@llnl.gov)
+Jeff Hammond (jeff.science@gmail.com)
+Holger Jones (jones19@llnl.gov)
+Will Killian (killian4@llnl.gov)
+Adam Kunen (kunen1@llnl.gov)
+Olga Pearce (pearce8@llnl.gov)
+David Poliakoff (poliakoff1@llnl.gov)
+Tom Scogland (scogland1@llnl.gov)
+Arturo Vargas (vargas45@llnl.gov)
diff --git a/appveyor.yml b/appveyor.yml
index 5de70791d4..947fb05695 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -3,6 +3,10 @@ skip_branch_with_pr: true
 image: Visual Studio 2017
 build_script:
 - cmd: >-
+    git submodule init
+
+    git submodule update
+
     mkdir build
 
     cd build
diff --git a/blt b/blt
new file mode 160000
index 0000000000..949f45ae30
--- /dev/null
+++ b/blt
@@ -0,0 +1 @@
+Subproject commit 949f45ae3041bea0072f0bdfd9d53409f03e7201
diff --git a/cmake/CompilerCompatibility.cmake b/cmake/CompilerCompatibility.cmake
deleted file mode 100644
index 709a7d5bac..0000000000
--- a/cmake/CompilerCompatibility.cmake
+++ /dev/null
@@ -1,114 +0,0 @@
-###############################################################################
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#    
-# Produced at the Lawrence Livermore National Laboratory
-#    
-# LLNL-CODE-689114
-# 
-# All rights reserved.
-#  
-# This file is part of RAJA.
-#
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-include(CheckCXXSourceCompiles)
-
-set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
-if (NOT MSVC)
-  if (CMAKE_CXX_COMPILER_ID MATCHES INTEL)
-    set (CMAKE_REQUIRED_FLAGS "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
-  else ()
-    set (CMAKE_REQUIRED_FLAGS "${COMMON_FLAGS} -std=c++11")
-  endif()
-endif()
-
-CHECK_CXX_SOURCE_COMPILES(
-"#include <type_traits>
-#include <limits>
-
-template <typename T>
-struct signed_limits {
-  static constexpr T min()
-  {
-    return static_cast<T>(1llu << ((8llu * sizeof(T)) - 1llu));
-  }
-  static constexpr T max()
-  {
-    return static_cast<T>(~(1llu << ((8llu * sizeof(T)) - 1llu)));
-  }
-};
-
-template <typename T>
-struct unsigned_limits {
-  static constexpr T min()
-  {
-    return static_cast<T>(0);
-  }
-  static constexpr T max()
-  {
-    return static_cast<T>(0xFFFFFFFFFFFFFFFF);
-  }
-};
-
-template <typename T>
-struct limits : public std::conditional<
-  std::is_signed<T>::value,
-  signed_limits<T>,
-  unsigned_limits<T>>::type {
-};
-
-template <typename T>
-void check() {
-  static_assert(limits<T>::min() == std::numeric_limits<T>::min(), \"min failed\");
-  static_assert(limits<T>::max() == std::numeric_limits<T>::max(), \"max failed\");
-}
-
-int main() {
-  check<char>();
-  check<unsigned char>();
-  check<short>();
-  check<unsigned short>();
-  check<int>();
-  check<unsigned int>();
-  check<long>();
-  check<unsigned long>();
-  check<long int>();
-  check<unsigned long int>();
-  check<long long>();
-  check<unsigned long long>();
-}" check_power_of_two_integral_types)
-
-set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
-
-if(NOT check_power_of_two_integral_types)
-  message(FATAL_ERROR "RAJA fast limits are unsupported for your compiler/architecture")
-endif()
diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake
index 413f6630a9..519a009347 100644
--- a/cmake/RAJAMacros.cmake
+++ b/cmake/RAJAMacros.cmake
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -9,106 +9,51 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
 macro(raja_add_executable)
   set(options )
-  set(singleValueArgs NAME)
+  set(singleValueArgs NAME TEST)
   set(multiValueArgs SOURCES DEPENDS_ON)
 
   cmake_parse_arguments(arg
     "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
 
-  if (RAJA_ENABLE_CHAI)
+  list (APPEND arg_DEPENDS_ON RAJA)
+
+  if (ENABLE_CHAI)
     list (APPEND arg_DEPENDS_ON chai)
   endif ()
 
-  if (RAJA_ENABLE_CUDA)
-    if (RAJA_ENABLE_CLANG_CUDA)
-      add_executable(${arg_NAME} ${arg_SOURCES})
-      target_compile_options(${arg_NAME} PRIVATE
-        -x cuda --cuda-gpu-arch=${RAJA_CUDA_ARCH} --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
-      target_include_directories(${arg_NAME}
-        PUBLIC ${EXPT_CUDA_INCLUDE_LOCATION})
-      target_link_libraries(${arg_NAME} ${CUDA_LIBRARIES} RAJA ${arg_DEPENDS_ON})
-    else ()
-      set_source_files_properties(
-        ${arg_SOURCES}
-        PROPERTIES
-        CUDA_SOURCE_PROPERTY_FORMAT OBJ)
-      cuda_add_executable(${arg_NAME} ${arg_SOURCES})
-      target_link_libraries(${arg_NAME} PUBLIC RAJA ${arg_DEPENDS_ON})
-    endif()
-  else ()
-    add_executable(${arg_NAME} ${arg_SOURCES})
-    target_link_libraries(${arg_NAME} RAJA ${arg_DEPENDS_ON})
-  endif()
-endmacro(raja_add_executable)
-
-macro(raja_add_library)
-  set(options )
-  set(singleValueArgs NAME)
-  set(multiValueArgs SOURCES DEPENDS_ON)
-
-  cmake_parse_arguments(arg
-    "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
-
-  if (RAJA_ENABLE_CHAI)
-    list (APPEND arg_DEPENDS_ON chai)
+  if (ENABLE_OPENMP)
+    list (APPEND arg_DEPENDS_ON openmp)
   endif ()
 
-  if (RAJA_ENABLE_CUDA)
-    if (RAJA_ENABLE_CLANG_CUDA)
+  if (ENABLE_CUDA)
+    list (APPEND arg_DEPENDS_ON cuda)
+  endif ()
 
-      add_library(${arg_NAME} ${arg_SOURCES})
-      target_compile_options(${arg_NAME} PRIVATE
-        -x cuda --cuda-gpu-arch=${RAJA_CUDA_ARCH} --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
-      target_include_directories(${arg_NAME}
-        PUBLIC ${EXPT_CUDA_INCLUDE_LOCATION})
-      target_link_libraries(${arg_NAME} ${CUDA_LIBRARIES})
+  if (ENABLE_TBB)
+    list (APPEND arg_DEPENDS_ON tbb)
+  endif ()
 
-    else ()
-      set_source_files_properties(
-        ${arg_SOURCES}
-        PROPERTIES
-        CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+  message(STATUS "${arg_NAME} building with depends: ${arg_DEPENDS_ON}")
 
-      cuda_add_library(${arg_NAME} ${arg_SOURCES})
-    endif ()
+  if (${arg_TEST})
+    set (_output_dir test)
   else ()
-    add_library(${arg_NAME} ${arg_SOURCES})
-  endif ()
+    set (_output_dir bin)
+  endif()
 
-endmacro(raja_add_library)
+  blt_add_executable(
+    NAME ${arg_NAME}
+    SOURCES ${arg_SOURCES}
+    DEPENDS_ON ${arg_DEPENDS_ON}
+    OUTPUT_DIR ${_output_dir}
+    )
+endmacro(raja_add_executable)
 
 macro(raja_add_test)
   set(options )
@@ -118,13 +63,16 @@ macro(raja_add_test)
   cmake_parse_arguments(arg
     "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
 
-  list (APPEND arg_DEPENDS_ON gtest gtest_main ${CMAKE_THREAD_LIBS_INIT})
+  list (APPEND arg_DEPENDS_ON gtest ${CMAKE_THREAD_LIBS_INIT})
 
   raja_add_executable(
     NAME ${arg_NAME}.exe
     SOURCES ${arg_SOURCES}
-    DEPENDS_ON ${arg_DEPENDS_ON})
+    DEPENDS_ON ${arg_DEPENDS_ON}
+    TEST On)
 
-  add_test(NAME ${arg_NAME}
-    COMMAND ${TEST_DRIVER} $<TARGET_FILE:${arg_NAME}>)
+  blt_add_test(
+    NAME ${arg_NAME}
+    # NOTE(review): raja_add_executable is a macro that re-parses into the same arg_* variables, so arg_NAME presumably already ends in ".exe" here -- confirm.
+    COMMAND ${TEST_DRIVER} ${arg_NAME})
 endmacro(raja_add_test)
diff --git a/cmake/SetupBasics.cmake b/cmake/SetupBasics.cmake
index cd11ee52d9..ef6bec528f 100644
--- a/cmake/SetupBasics.cmake
+++ b/cmake/SetupBasics.cmake
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #    
 # Produced at the Lawrence Livermore National Laboratory
 #    
@@ -9,44 +9,10 @@
 #  
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
-# Don't allow in-source builds
-if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
-     message(FATAL_ERROR "In-source builds are not supported. Please remove \
-     CMakeCache.txt from the 'src' dir and configure an out-of-source build in \
-     another directory.")
- endif()
-
  if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, \
    options are: Debug Release RelWithDebInfo" FORCE)
diff --git a/cmake/SetupCompilers.cmake b/cmake/SetupCompilers.cmake
index 5a719b9bf9..d80d5e2552 100644
--- a/cmake/SetupCompilers.cmake
+++ b/cmake/SetupCompilers.cmake
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -9,34 +9,7 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
@@ -45,6 +18,7 @@ set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC)
 include(CheckCXXCompilerFlag)
 if(RAJA_CXX_STANDARD_FLAG MATCHES default)
   if("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES)
+    #TODO set BLT_CXX_STANDARD
     set(CMAKE_CXX_STANDARD 17)
   elseif("cxx_std_14" IN_LIST CMAKE_CXX_KNOWN_FEATURES)
     set(CMAKE_CXX_STANDARD 14)
@@ -71,16 +45,16 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0" CACHE STRING "")
 
-if (RAJA_ENABLE_WARNINGS)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
-endif ()
+if (RAJA_ENABLE_MODULES AND CMAKE_CXX_COMPILER_ID MATCHES Clang)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmodules")
+endif()
 
 if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
   if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
     message(FATAL_ERROR "RAJA requires GCC 4.9 or greater!")
   endif ()
-  if (RAJA_ENABLE_COVERAGE)
-    if(NOT RAJA_ENABLE_CUDA)
+  if (ENABLE_COVERAGE)
+    if(NOT ENABLE_CUDA)
       message(INFO "Coverage analysis enabled")
       set(CMAKE_CXX_FLAGS "-coverage ${CMAKE_CXX_FLAGS}")
       set(CMAKE_EXE_LINKER_FLAGS "-coverage ${CMAKE_EXE_LINKER_FLAGS}")
@@ -102,8 +76,7 @@ if ( MSVC )
   endif()
 endif()
 
-if (RAJA_ENABLE_CUDA)
-
+if (ENABLE_CUDA)
   if ( NOT DEFINED RAJA_NVCC_STD ) 
     set(RAJA_NVCC_STD "c++11")
     # When we require cmake 3.8+, replace this with setting CUDA_STANDARD
@@ -119,28 +92,29 @@ if (RAJA_ENABLE_CUDA)
   endif()
 
   if (NOT RAJA_HOST_CONFIG_LOADED)
-    if(CMAKE_BUILD_TYPE MATCHES Release)
-        set(RAJA_NVCC_FLAGS -O2; -restrict; -arch ${RAJA_CUDA_ARCH}; -std ${RAJA_NVCC_STD}; --expt-extended-lambda; -ccbin; ${CMAKE_CXX_COMPILER} CACHE LIST "")
-    elseif(CMAKE_BUILD_TYPE MATCHES Debug)
-        set(RAJA_NVCC_FLAGS -g; -G; -O0; -restrict; -arch ${RAJA_CUDA_ARCH}; -std  ${RAJA_NVCC_STD}; --expt-extended-lambda; -ccbin ${CMAKE_CXX_COMPILER} CACHE LIST "")
-    elseif(CMAKE_BUILD_TYPE MATCHES MinSizeRel)
-        set(RAJA_NVCC_FLAGS -Os; -restrict; -arch ${RAJA_CUDA_ARCH}; -std ${RAJA_NVCC_STD}; --expt-extended-lambda; -ccbin; ${CMAKE_CXX_COMPILER} CACHE LIST "")
-    else() # CMAKE_BUILD_TYPE MATCHES RelWithDebInfo)
-        set(RAJA_NVCC_FLAGS -g; -G; -O2; -restrict; -arch ${RAJA_CUDA_ARCH}; -std  ${RAJA_NVCC_STD}; --expt-extended-lambda; -ccbin ${CMAKE_CXX_COMPILER} CACHE LIST "")
-    endif()
+    list(APPEND RAJA_EXTRA_NVCC_FLAGS -restrict; -arch ${CUDA_ARCH}; -std ${RAJA_NVCC_STD}; --expt-extended-lambda; -ccbin; ${CMAKE_CXX_COMPILER})
+
+    set(RAJA_NVCC_FLAGS_RELEASE -O2 CACHE STRING "")
+    set(RAJA_NVCC_FLAGS_DEBUG -g; -G; -O0 CACHE STRING "")
+    set(RAJA_NVCC_FLAGS_MINSIZEREL -Os CACHE STRING "")
+    set(RAJA_NVCC_FLAGS_RELWITHDEBINFO -g; -G; -O2 CACHE STRING "")
 
     if(RAJA_ENABLE_COVERAGE)
       if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
         message(INFO "Coverage analysis enabled")
-        set(RAJA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}; -Xcompiler -coverage; -Xlinker -coverage)
+        set(RAJA_EXTRA_NVCC_FLAGS ${RAJA_EXTRA_NVCC_FLAGS}; -Xcompiler -coverage; -Xlinker -coverage)
         set(CMAKE_EXE_LINKER_FLAGS "-coverage ${CMAKE_EXE_LINKER_FLAGS}")
       else()
         message(WARNING "Code coverage specified but not enabled -- GCC was not detected")
       endif()
     endif()
   endif()
-
+  set(RAJA_NVCC_FLAGS ${RAJA_EXTRA_NVCC_FLAGS} CACHE STRING "")
   set(CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS})
+  set(CUDA_NVCC_FLAGS_RELEASE ${RAJA_NVCC_FLAGS_RELEASE})
+  set(CUDA_NVCC_FLAGS_DEBUG ${RAJA_NVCC_FLAGS_DEBUG})
+  set(CUDA_NVCC_FLAGS_MINSIZEREL ${RAJA_NVCC_FLAGS_MINSIZEREL})
+  set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ${RAJA_NVCC_FLAGS_RELWITHDEBINFO})
 endif()
-# end RAJA_ENABLE_CUDA section
+# end ENABLE_CUDA section
 
@@ -148,4 +122,3 @@ set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
 set(RAJA_DATA_ALIGN 64 CACHE INT "")
 set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "")
-
diff --git a/cmake/SetupPackages.cmake b/cmake/SetupPackages.cmake
index 88b8ee6343..ef5be0aefa 100644
--- a/cmake/SetupPackages.cmake
+++ b/cmake/SetupPackages.cmake
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #    
 # Produced at the Lawrence Livermore National Laboratory
 #    
@@ -9,152 +9,52 @@
 #  
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
-if (RAJA_ENABLE_OPENMP)
-  find_package(OpenMP)
+if (ENABLE_OPENMP)
   if(OPENMP_FOUND)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-    list(APPEND RAJA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
+    list(APPEND RAJA_EXTRA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
     message(STATUS "OpenMP Enabled")
   else()
     message(WARNING "OpenMP NOT FOUND")
-    set(RAJA_ENABLE_OPENMP Off)
+    set(ENABLE_OPENMP Off)
   endif()
 endif()
 
-if (RAJA_ENABLE_CLANG_CUDA)
-  set(RAJA_ENABLE_CUDA On)
-endif ()
-
-if (RAJA_ENABLE_CUDA)
-  find_package(CUDA REQUIRED)
-  set (CUDA_PROPAGATE_HOST_FLAGS OFF)
-  include_directories(${CUDA_INCLUDE_DIRS})
-
-  if (RAJA_ENABLE_CUB)
-
+if (ENABLE_CUDA)
+  if (ENABLE_CUB)
     find_package(CUB)
-
     if (CUB_FOUND)
-      include_directories(${CUB_INCLUDE_DIRS})
+      blt_register_library(
+        NAME cub
+        INCLUDES ${CUB_INCLUDE_DIRS})
     else()
       message(WARNING "Using deprecated Thrust backend for CUDA scans.\n
   Please set CUB_DIR for better scan performance.")
-      set(RAJA_ENABLE_CUB False)
+      set(ENABLE_CUB Off)
     endif()
   endif()
-endif()
-
+endif ()
 
-if (RAJA_ENABLE_TBB)
+if (ENABLE_TBB)
   find_package(TBB)
   if(TBB_FOUND)
-    include_directories(${TBB_INCLUDE_DIRS})
+    blt_register_library(
+      NAME tbb
+      INCLUDES ${TBB_INCLUDE_DIRS}
+      LIBRARIES ${TBB_LIBRARIES})
     message(STATUS "TBB Enabled")
   else()
     message(WARNING "TBB NOT FOUND")
-    set(RAJA_ENABLE_TBB Off)
+    set(ENABLE_TBB Off)
   endif()
 endif ()
 
-if (RAJA_ENABLE_TESTS)
-
-#
-# This conditional prevents build problems resulting from BLT and
-# RAJA each having their own copy of googletest.
-#
-if (RAJA_BUILD_WITH_BLT)
-else()
-
-  include(ExternalProject)
-  # Set default ExternalProject root directory
-  SET_DIRECTORY_PROPERTIES(PROPERTIES EP_PREFIX ${CMAKE_BINARY_DIR}/tpl)
-
-  ExternalProject_Add(
-      googletest
-      GIT_REPOSITORY https://github.com/google/googletest.git
-      GIT_TAG release-1.7.0
-      CMAKE_ARGS                
-          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-          -DCMAKE_CXX_COMPILER_ARG1=${CMAKE_CXX_COMPILER_ARG1}
-      INSTALL_COMMAND ""
-      LOG_DOWNLOAD ON
-      LOG_CONFIGURE ON
-      LOG_BUILD ON)
-
-  ExternalProject_Get_Property(googletest source_dir)
-  include_directories(${source_dir}/include)
-
-  ExternalProject_Get_Property(googletest binary_dir)
-  add_library(gtest      UNKNOWN IMPORTED)
-  add_library(gtest_main UNKNOWN IMPORTED)
-
-  if ( UNIX )
-    set_target_properties(gtest PROPERTIES
-      IMPORTED_LOCATION ${binary_dir}/libgtest.a
-    )
-    set_target_properties(gtest_main PROPERTIES
-      IMPORTED_LOCATION ${binary_dir}/libgtest_main.a
-    )
-  elseif( WIN32 )
-    set_target_properties(gtest PROPERTIES
-      IMPORTED_LOCATION ${binary_dir}/${CMAKE_BUILD_TYPE}/gtest.lib
-    )
-    set_target_properties(gtest_main PROPERTIES
-      IMPORTED_LOCATION ${binary_dir}/${CMAKE_BUILD_TYPE}/gtest_main.lib
-    )
-  endif ()
-  add_dependencies(gtest      googletest)
-  add_dependencies(gtest_main googletest)
-
-  # GoogleTest requires threading
-  find_package(Threads)
-
-  enable_testing()
-endif ()
-
-endif ()
-
-if (RAJA_ENABLE_DOCUMENTATION)
-  find_package(Sphinx)
-  find_package(Doxygen)
-endif ()
-
-if (RAJA_ENABLE_CHAI)
+if (ENABLE_CHAI)
   message(STATUS "CHAI enabled")
-
   find_package(chai)
-
   include_directories(${CHAI_INCLUDE_DIRS})
 endif()
diff --git a/cmake/SetupRajaConfig.cmake b/cmake/SetupRajaConfig.cmake
index efa1bc59a7..ab9e562573 100644
--- a/cmake/SetupRajaConfig.cmake
+++ b/cmake/SetupRajaConfig.cmake
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #    
 # Produced at the Lawrence Livermore National Laboratory
 #    
@@ -9,34 +9,7 @@
 #  
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
@@ -48,7 +21,7 @@ option(RAJA_USE_FLOAT Off)
 option(RAJA_USE_COMPLEX Off)
 
 ## Pointer options
-if (RAJA_ENABLE_CUDA)
+if (ENABLE_CUDA)
   set(RAJA_PTR "RAJA_USE_BARE_PTR")
 else ()
   set(RAJA_PTR "RAJA_USE_RESTRICT_PTR")
@@ -59,7 +32,7 @@ endif()
 #set(RAJA_USE_PTR_CLASS OFF)
 
 ## Fault tolerance options
-option(RAJA_ENABLE_FT "Enable fault-tolerance features" OFF)
+option(ENABLE_FT "Enable fault-tolerance features" OFF)
 option(RAJA_REPORT_FT "Report on use of fault-tolerant features" OFF)
 
 ## Timer options
@@ -88,6 +61,17 @@ check_function_exists(posix_memalign RAJA_HAVE_POSIX_MEMALIGN)
 check_function_exists(aligned_alloc RAJA_HAVE_ALIGNED_ALLOC)
 check_function_exists(_mm_malloc RAJA_HAVE_MM_MALLOC)
 
+# Set up RAJA_ENABLE prefixed options
+set(RAJA_ENABLE_OPENMP ${ENABLE_OPENMP})
+set(RAJA_ENABLE_TARGET_OPENMP ${ENABLE_TARGET_OPENMP})
+set(RAJA_ENABLE_TBB ${ENABLE_TBB})
+set(RAJA_ENABLE_CUDA ${ENABLE_CUDA})
+set(RAJA_ENABLE_CLANG_CUDA ${ENABLE_CLANG_CUDA})
+set(RAJA_ENABLE_CHAI ${ENABLE_CHAI})
+set(RAJA_ENABLE_CUB ${ENABLE_CUB})
+# NOTE(review): the FT option is now ENABLE_FT; presumably config.hpp.in still reads RAJA_ENABLE_FT - confirm.
+set(RAJA_ENABLE_FT ${ENABLE_FT})
+
 # Configure a header file with all the variables we found.
 configure_file(${PROJECT_SOURCE_DIR}/include/RAJA/config.hpp.in
   ${PROJECT_BINARY_DIR}/include/RAJA/config.hpp)
@@ -109,7 +93,7 @@ if(PKG_CONFIG_FOUND)
   foreach(INCDIR ${INCLUDE_DIRECTORIES} ${CUDA_INCLUDE_DIRS})
     set(PC_C_FLAGS "${PC_C_FLAGS} -I${INCDIR}")
   endforeach()
-  if(RAJA_ENABLE_CUDA)
+  if(ENABLE_CUDA)
     foreach(FLAG ${RAJA_NVCC_FLAGS})
       set(PC_C_FLAGS "${PC_C_FLAGS} ${FLAG}")
     endforeach()
diff --git a/cmake/thirdparty/FindCUDA.cmake b/cmake/thirdparty/FindCUDA.cmake
deleted file mode 100644
index ebfd24ab29..0000000000
--- a/cmake/thirdparty/FindCUDA.cmake
+++ /dev/null
@@ -1,1917 +0,0 @@
-#.rst:
-# FindCUDA
-# --------
-#
-# Tools for building CUDA C files: libraries and build dependencies.
-#
-# This script locates the NVIDIA CUDA C tools.  It should work on linux,
-# windows, and mac and should be reasonably up to date with CUDA C
-# releases.
-#
-# This script makes use of the standard find_package arguments of
-# <VERSION>, REQUIRED and QUIET.  CUDA_FOUND will report if an
-# acceptable version of CUDA was found.
-#
-# The script will prompt the user to specify CUDA_TOOLKIT_ROOT_DIR if
-# the prefix cannot be determined by the location of nvcc in the system
-# path and REQUIRED is specified to find_package().  To use a different
-# installed version of the toolkit set the environment variable
-# CUDA_BIN_PATH before running cmake (e.g.
-# CUDA_BIN_PATH=/usr/local/cuda1.0 instead of the default
-# /usr/local/cuda) or set CUDA_TOOLKIT_ROOT_DIR after configuring.  If
-# you change the value of CUDA_TOOLKIT_ROOT_DIR, various components that
-# depend on the path will be relocated.
-#
-# It might be necessary to set CUDA_TOOLKIT_ROOT_DIR manually on certain
-# platforms, or to use a cuda runtime not installed in the default
-# location.  In newer versions of the toolkit the cuda library is
-# included with the graphics driver- be sure that the driver version
-# matches what is needed by the cuda runtime version.
-#
-# The following variables affect the behavior of the macros in the
-# script (in alphebetical order).  Note that any of these flags can be
-# changed multiple times in the same directory before calling
-# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX,
-# CUDA_COMPILE_FATBIN, CUDA_COMPILE_CUBIN or CUDA_WRAP_SRCS::
-#
-#   CUDA_64_BIT_DEVICE_CODE (Default matches host bit size)
-#   -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code.
-#      Note that making this different from the host code when generating object
-#      or C files from CUDA code just won't work, because size_t gets defined by
-#      nvcc in the generated source.  If you compile to PTX and then load the
-#      file yourself, you can mix bit sizes between device and host.
-#
-#   CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON)
-#   -- Set to ON if you want the custom build rule to be attached to the source
-#      file in Visual Studio.  Turn OFF if you add the same cuda file to multiple
-#      targets.
-#
-#      This allows the user to build the target from the CUDA file; however, bad
-#      things can happen if the CUDA source file is added to multiple targets.
-#      When performing parallel builds it is possible for the custom build
-#      command to be run more than once and in parallel causing cryptic build
-#      errors.  VS runs the rules for every source file in the target, and a
-#      source can have only one rule no matter how many projects it is added to.
-#      When the rule is run from multiple targets race conditions can occur on
-#      the generated file.  Eventually everything will get built, but if the user
-#      is unaware of this behavior, there may be confusion.  It would be nice if
-#      this script could detect the reuse of source files across multiple targets
-#      and turn the option off for the user, but no good solution could be found.
-#
-#   CUDA_BUILD_CUBIN (Default OFF)
-#   -- Set to ON to enable and extra compilation pass with the -cubin option in
-#      Device mode. The output is parsed and register, shared memory usage is
-#      printed during build.
-#
-#   CUDA_BUILD_EMULATION (Default OFF for device mode)
-#   -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files
-#      when CUDA_BUILD_EMULATION is TRUE.
-#
-#   CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR)
-#   -- Set to the path you wish to have the generated files placed.  If it is
-#      blank output files will be placed in CMAKE_CURRENT_BINARY_DIR.
-#      Intermediate files will always be placed in
-#      CMAKE_CURRENT_BINARY_DIR/CMakeFiles.
-#
-#   CUDA_HOST_COMPILATION_CPP (Default ON)
-#   -- Set to OFF for C compilation of host code.
-#
-#   CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS)
-#   -- Set the host compiler to be used by nvcc.  Ignored if -ccbin or
-#      --compiler-bindir is already present in the CUDA_NVCC_FLAGS or
-#      CUDA_NVCC_FLAGS_<CONFIG> variables.  For Visual Studio targets
-#      $(VCInstallDir)/bin is a special value that expands out to the path when
-#      the command is run from within VS.
-#
-#   CUDA_NVCC_FLAGS
-#   CUDA_NVCC_FLAGS_<CONFIG>
-#   -- Additional NVCC command line arguments.  NOTE: multiple arguments must be
-#      semi-colon delimited (e.g. --compiler-options;-Wall)
-#
-#   CUDA_PROPAGATE_HOST_FLAGS (Default ON)
-#   -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration
-#      dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the
-#      host compiler through nvcc's -Xcompiler flag.  This helps make the
-#      generated host code match the rest of the system better.  Sometimes
-#      certain flags give nvcc problems, and this will help you turn the flag
-#      propagation off.  This does not affect the flags supplied directly to nvcc
-#      via CUDA_NVCC_FLAGS or through the OPTION flags specified through
-#      CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS.  Flags used for
-#      shared library compilation are not affected by this flag.
-#
-#   CUDA_SEPARABLE_COMPILATION (Default OFF)
-#   -- If set this will enable separable compilation for all CUDA runtime object
-#      files.  If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
-#      (e.g. calling CUDA_WRAP_SRCS directly),
-#      CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
-#      CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
-#
-#   CUDA_SOURCE_PROPERTY_FORMAT
-#   -- If this source file property is set, it can override the format specified
-#      to CUDA_WRAP_SRCS (OBJ, PTX, CUBIN, or FATBIN).  If an input source file
-#      is not a .cu file, setting this file will cause it to be treated as a .cu
-#      file. See documentation for set_source_files_properties on how to set
-#      this property.
-#
-#   CUDA_USE_STATIC_CUDA_RUNTIME (Default ON)
-#   -- When enabled the static version of the CUDA runtime library will be used
-#      in CUDA_LIBRARIES.  If the version of CUDA configured doesn't support
-#      this option, then it will be silently disabled.
-#
-#   CUDA_VERBOSE_BUILD (Default OFF)
-#   -- Set to ON to see all the commands used when building the CUDA file.  When
-#      using a Makefile generator the value defaults to VERBOSE (run make
-#      VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will
-#      always print the output.
-#
-# The script creates the following macros (in alphebetical order)::
-#
-#   CUDA_ADD_CUFFT_TO_TARGET( cuda_target )
-#   -- Adds the cufft library to the target (can be any target).  Handles whether
-#      you are in emulation mode or not.
-#
-#   CUDA_ADD_CUBLAS_TO_TARGET( cuda_target )
-#   -- Adds the cublas library to the target (can be any target).  Handles
-#      whether you are in emulation mode or not.
-#
-#   CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ...
-#                        [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
-#   -- Creates an executable "cuda_target" which is made up of the files
-#      specified.  All of the non CUDA C files are compiled using the standard
-#      build rules specified by CMAKE and the cuda files are compiled to object
-#      files using nvcc and the host compiler.  In addition CUDA_INCLUDE_DIRS is
-#      added automatically to include_directories().  Some standard CMake target
-#      calls can be used on the target after calling this macro
-#      (e.g. set_target_properties and target_link_libraries), but setting
-#      properties that adjust compilation flags will not affect code compiled by
-#      nvcc.  Such flags should be modified before calling CUDA_ADD_EXECUTABLE,
-#      CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS.
-#
-#   CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
-#                     [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
-#   -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
-#
-#   CUDA_BUILD_CLEAN_TARGET()
-#   -- Creates a convience target that deletes all the dependency files
-#      generated.  You should make clean after running this target to ensure the
-#      dependency files get regenerated.
-#
-#   CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
-#                 [OPTIONS ...] )
-#   -- Returns a list of generated files from the input source files to be used
-#      with ADD_LIBRARY or ADD_EXECUTABLE.
-#
-#   CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
-#   -- Returns a list of PTX files generated from the input source files.
-#
-#   CUDA_COMPILE_FATBIN( generated_files file0 file1 ... [OPTIONS ...] )
-#   -- Returns a list of FATBIN files generated from the input source files.
-#
-#   CUDA_COMPILE_CUBIN( generated_files file0 file1 ... [OPTIONS ...] )
-#   -- Returns a list of CUBIN files generated from the input source files.
-#
-#   CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
-#                                                        cuda_target
-#                                                        object_files )
-#   -- Compute the name of the intermediate link file used for separable
-#      compilation.  This file name is typically passed into
-#      CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS.  output_file_var is produced
-#      based on cuda_target the list of objects files that need separable
-#      compilation as specified by object_files.  If the object_files list is
-#      empty, then output_file_var will be empty.  This function is called
-#      automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE.  Note that
-#      this is a function and not a macro.
-#
-#   CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
-#   -- Sets the directories that should be passed to nvcc
-#      (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
-#      files.
-#
-#
-#   CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
-#                                            nvcc_flags object_files)
-#   -- Generates the link object required by separable compilation from the given
-#      object files.  This is called automatically for CUDA_ADD_EXECUTABLE and
-#      CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
-#      directly.  When called from CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE the
-#      nvcc_flags passed in are the same as the flags passed in via the OPTIONS
-#      argument.  The only nvcc flag added automatically is the bitness flag as
-#      specified by CUDA_64_BIT_DEVICE_CODE.  Note that this is a function
-#      instead of a macro.
-#
-#   CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures])
-#   -- Selects GPU arch flags for nvcc based on target_CUDA_architectures
-#      target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...)
-#       - "Auto" detects local machine GPU compute arch at runtime.
-#       - "Common" and "All" cover common and entire subsets of architectures
-#      ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
-#      NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
-#      NUM: Any number. Only those pairs are currently accepted by NVCC though:
-#            2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
-#      Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
-#      Additionally, sets ${out_variable}_readable to the resulting numeric list
-#      Example:
-#       CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell)
-#        LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
-#
-#      More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA
-#      Note that this is a function instead of a macro.
-#
-#   CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
-#                    [STATIC | SHARED | MODULE] [OPTIONS ...] )
-#   -- This is where all the magic happens.  CUDA_ADD_EXECUTABLE,
-#      CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this
-#      function under the hood.
-#
-#      Given the list of files (file0 file1 ... fileN) this macro generates
-#      custom commands that generate either PTX or linkable objects (use "PTX" or
-#      "OBJ" for the format argument to switch).  Files that don't end with .cu
-#      or have the HEADER_FILE_ONLY property are ignored.
-#
-#      The arguments passed in after OPTIONS are extra command line options to
-#      give to nvcc.  You can also specify per configuration options by
-#      specifying the name of the configuration followed by the options.  General
-#      options must precede configuration specific options.  Not all
-#      configurations need to be specified, only the ones provided will be used.
-#
-#         OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag"
-#         DEBUG -g
-#         RELEASE --use_fast_math
-#         RELWITHDEBINFO --use_fast_math;-g
-#         MINSIZEREL --use_fast_math
-#
-#      For certain configurations (namely VS generating object files with
-#      CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will
-#      be produced for the given cuda file.  This is because when you add the
-#      cuda file to Visual Studio it knows that this file produces an object file
-#      and will link in the resulting object file automatically.
-#
-#      This script will also generate a separate cmake script that is used at
-#      build time to invoke nvcc.  This is for several reasons.
-#
-#        1. nvcc can return negative numbers as return values which confuses
-#        Visual Studio into thinking that the command succeeded.  The script now
-#        checks the error codes and produces errors when there was a problem.
-#
-#        2. nvcc has been known to not delete incomplete results when it
-#        encounters problems.  This confuses build systems into thinking the
-#        target was generated when in fact an unusable file exists.  The script
-#        now deletes the output files if there was an error.
-#
-#        3. By putting all the options that affect the build into a file and then
-#        make the build rule dependent on the file, the output files will be
-#        regenerated when the options change.
-#
-#      This script also looks at optional arguments STATIC, SHARED, or MODULE to
-#      determine when to target the object compilation for a shared library.
-#      BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in
-#      CUDA_ADD_LIBRARY.  On some systems special flags are added for building
-#      objects intended for shared libraries.  A preprocessor macro,
-#      <target_name>_EXPORTS is defined when a shared library compilation is
-#      detected.
-#
-#      Flags passed into add_definitions with -D or /D are passed along to nvcc.
-#
-#
-#
-# The script defines the following variables::
-#
-#   CUDA_VERSION_MAJOR    -- The major version of cuda as reported by nvcc.
-#   CUDA_VERSION_MINOR    -- The minor version.
-#   CUDA_VERSION
-#   CUDA_VERSION_STRING   -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR
-#   CUDA_HAS_FP16         -- Whether a short float (float16,fp16) is supported.
-#
-#   CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set).
-#   CUDA_SDK_ROOT_DIR     -- Path to the CUDA SDK.  Use this to find files in the
-#                            SDK.  This script will not directly support finding
-#                            specific libraries or headers, as that isn't
-#                            supported by NVIDIA.  If you want to change
-#                            libraries when the path changes see the
-#                            FindCUDA.cmake script for an example of how to clear
-#                            these variables.  There are also examples of how to
-#                            use the CUDA_SDK_ROOT_DIR to locate headers or
-#                            libraries, if you so choose (at your own risk).
-#   CUDA_INCLUDE_DIRS     -- Include directory for cuda headers.  Added automatically
-#                            for CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY.
-#   CUDA_LIBRARIES        -- Cuda RT library.
-#   CUDA_CUFFT_LIBRARIES  -- Device or emulation library for the Cuda FFT
-#                            implementation (alternative to:
-#                            CUDA_ADD_CUFFT_TO_TARGET macro)
-#   CUDA_CUBLAS_LIBRARIES -- Device or emulation library for the Cuda BLAS
-#                            implementation (alternative to:
-#                            CUDA_ADD_CUBLAS_TO_TARGET macro).
-#   CUDA_cudart_static_LIBRARY -- Statically linkable cuda runtime library.
-#                                 Only available for CUDA version 5.5+
-#   CUDA_cudadevrt_LIBRARY -- Device runtime library.
-#                             Required for separable compilation.
-#   CUDA_cupti_LIBRARY    -- CUDA Profiling Tools Interface library.
-#                            Only available for CUDA version 4.0+.
-#   CUDA_curand_LIBRARY   -- CUDA Random Number Generation library.
-#                            Only available for CUDA version 3.2+.
-#   CUDA_cusolver_LIBRARY -- CUDA Direct Solver library.
-#                            Only available for CUDA version 7.0+.
-#   CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
-#                            Only available for CUDA version 3.2+.
-#   CUDA_npp_LIBRARY      -- NVIDIA Performance Primitives lib.
-#                            Only available for CUDA version 4.0+.
-#   CUDA_nppc_LIBRARY     -- NVIDIA Performance Primitives lib (core).
-#                            Only available for CUDA version 5.5+.
-#   CUDA_nppi_LIBRARY     -- NVIDIA Performance Primitives lib (image processing).
-#                            Only available for CUDA version 5.5+.
-#   CUDA_npps_LIBRARY     -- NVIDIA Performance Primitives lib (signal processing).
-#                            Only available for CUDA version 5.5+.
-#   CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library.
-#                            Only available for CUDA version 3.2+.
-#                            Windows only.
-#   CUDA_nvcuvid_LIBRARY  -- CUDA Video Decoder library.
-#                            Only available for CUDA version 3.2+.
-#                            Windows only.
-#
-
-#   James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#   Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
-#
-#   Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#   Copyright (c) 2007-2009
-#   Scientific Computing and Imaging Institute, University of Utah
-#
-#   This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#   for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-# FindCUDA.cmake
-
-# This macro helps us find the location of helper files we will need the full path to
-macro(CUDA_FIND_HELPER_FILE _name _extension)
-  set(_full_name "${_name}.${_extension}")
-  # CMAKE_CURRENT_LIST_FILE contains the full path to the file currently being
-  # processed.  Using this variable, we can pull out the current path, and
-  # provide a way to get access to the other files we need local to here.
-  get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
-  set(CUDA_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindCUDA/${_full_name}")
-  if(NOT EXISTS "${CUDA_${_name}}")
-    set(error_message "${_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindCUDA")
-    if(CUDA_FIND_REQUIRED)
-      message(FATAL_ERROR "${error_message}")
-    else()
-      if(NOT CUDA_FIND_QUIETLY)
-        message(STATUS "${error_message}")
-      endif()
-    endif()
-  endif()
-  # Set this variable as internal, so the user isn't bugged with it.
-  set(CUDA_${_name} ${CUDA_${_name}} CACHE INTERNAL "Location of ${_full_name}" FORCE)
-endmacro()
-
-#####################################################################
-## CUDA_INCLUDE_NVCC_DEPENDENCIES
-##
-
-# So we want to try and include the dependency file if it exists.  If
-# it doesn't exist then we need to create an empty one, so we can
-# include it.
-
-# If it does exist, then we need to check to see if all the files it
-# depends on exist.  If they don't then we should clear the dependency
-# file and regenerate it later.  This covers the case where a header
-# file has disappeared or moved.
-
-macro(CUDA_INCLUDE_NVCC_DEPENDENCIES dependency_file)
-  set(CUDA_NVCC_DEPEND)
-  set(CUDA_NVCC_DEPEND_REGENERATE FALSE)
-
-
-  # Include the dependency file.  Create it first if it doesn't exist .  The
-  # INCLUDE puts a dependency that will force CMake to rerun and bring in the
-  # new info when it changes.  DO NOT REMOVE THIS (as I did and spent a few
-  # hours figuring out why it didn't work.
-  if(NOT EXISTS ${dependency_file})
-    file(WRITE ${dependency_file} "#FindCUDA.cmake generated file.  Do not edit.\n")
-  endif()
-  # Always include this file to force CMake to run again next
-  # invocation and rebuild the dependencies.
-  #message("including dependency_file = ${dependency_file}")
-  include(${dependency_file})
-
-  # Now we need to verify the existence of all the included files
-  # here.  If they aren't there we need to just blank this variable and
-  # make the file regenerate again.
-#   if(DEFINED CUDA_NVCC_DEPEND)
-#     message("CUDA_NVCC_DEPEND set")
-#   else()
-#     message("CUDA_NVCC_DEPEND NOT set")
-#   endif()
-  if(CUDA_NVCC_DEPEND)
-    #message("CUDA_NVCC_DEPEND found")
-    foreach(f ${CUDA_NVCC_DEPEND})
-      # message("searching for ${f}")
-      if(NOT EXISTS ${f})
-        #message("file ${f} not found")
-        set(CUDA_NVCC_DEPEND_REGENERATE TRUE)
-      endif()
-    endforeach()
-  else()
-    #message("CUDA_NVCC_DEPEND false")
-    # No dependencies, so regenerate the file.
-    set(CUDA_NVCC_DEPEND_REGENERATE TRUE)
-  endif()
-
-  #message("CUDA_NVCC_DEPEND_REGENERATE = ${CUDA_NVCC_DEPEND_REGENERATE}")
-  # No incoming dependencies, so we need to generate them.  Make the
-  # output depend on the dependency file itself, which should cause the
-  # rule to re-run.
-  if(CUDA_NVCC_DEPEND_REGENERATE)
-    set(CUDA_NVCC_DEPEND ${dependency_file})
-    #message("Generating an empty dependency_file: ${dependency_file}")
-    file(WRITE ${dependency_file} "#FindCUDA.cmake generated file.  Do not edit.\n")
-  endif()
-
-endmacro()
-
-###############################################################################
-###############################################################################
-# Setup variables' defaults
-###############################################################################
-###############################################################################
-
-# Allow the user to specify if the device code is supposed to be 32 or 64 bit.
-if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-  set(CUDA_64_BIT_DEVICE_CODE_DEFAULT ON)
-else()
-  set(CUDA_64_BIT_DEVICE_CODE_DEFAULT OFF)
-endif()
-option(CUDA_64_BIT_DEVICE_CODE "Compile device code in 64 bit mode" ${CUDA_64_BIT_DEVICE_CODE_DEFAULT})
-
-# Attach the build rule to the source file in VS.  This option
-option(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE "Attach the build rule to the CUDA source file.  Enable only when the CUDA source file is added to at most one target." ON)
-
-# Prints out extra information about the cuda file during compilation
-option(CUDA_BUILD_CUBIN "Generate and parse .cubin files in Device mode." OFF)
-
-# Set whether we are using emulation or device mode.
-option(CUDA_BUILD_EMULATION "Build in Emulation mode" OFF)
-
-# Where to put the generated output.
-set(CUDA_GENERATED_OUTPUT_DIR "" CACHE PATH "Directory to put all the output files.  If blank it will default to the CMAKE_CURRENT_BINARY_DIR")
-
-# Parse HOST_COMPILATION mode.
-option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON)
-
-# Extra user settable flags
-set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.")
-
-if(CMAKE_GENERATOR MATCHES "Visual Studio")
-  set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC")
-else()
-  if(APPLE
-      AND "${CMAKE_C_COMPILER_ID}" MATCHES "Clang"
-      AND "${CMAKE_C_COMPILER}" MATCHES "/cc$")
-    # Using cc which is symlink to clang may let NVCC think it is GCC and issue
-    # unhandled -dumpspecs option to clang. Also in case neither
-    # CMAKE_C_COMPILER is defined (project does not use C language) nor
-    # CUDA_HOST_COMPILER is specified manually we should skip -ccbin and let
-    # nvcc use its own default C compiler.
-    # Only care about this on APPLE with clang to avoid
-    # following symlinks to things like ccache
-    if(DEFINED CMAKE_C_COMPILER AND NOT DEFINED CUDA_HOST_COMPILER)
-      get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
-      # if the real path does not end up being clang then
-      # go back to using CMAKE_C_COMPILER
-      if(NOT "${c_compiler_realpath}" MATCHES "/clang$")
-        set(c_compiler_realpath "${CMAKE_C_COMPILER}")
-      endif()
-    else()
-      set(c_compiler_realpath "")
-    endif()
-    set(CUDA_HOST_COMPILER "${c_compiler_realpath}" CACHE FILEPATH "Host side compiler used by NVCC")
-  else()
-    set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}"
-      CACHE FILEPATH "Host side compiler used by NVCC")
-  endif()
-endif()
-
-# Propagate the host flags to the host compiler via -Xcompiler
-option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON)
-
-# Enable CUDA_SEPARABLE_COMPILATION
-option(CUDA_SEPARABLE_COMPILATION "Compile CUDA objects with separable compilation enabled.  Requires CUDA 5.0+" OFF)
-
-# Specifies whether the commands used when compiling the .cu file will be printed out.
-option(CUDA_VERBOSE_BUILD "Print out the commands run while compiling the CUDA source file.  With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF)
-
-mark_as_advanced(
-  CUDA_64_BIT_DEVICE_CODE
-  CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE
-  CUDA_GENERATED_OUTPUT_DIR
-  CUDA_HOST_COMPILATION_CPP
-  CUDA_NVCC_FLAGS
-  CUDA_PROPAGATE_HOST_FLAGS
-  CUDA_BUILD_CUBIN
-  CUDA_BUILD_EMULATION
-  CUDA_VERBOSE_BUILD
-  CUDA_SEPARABLE_COMPILATION
-  )
-
-# Makefile and similar generators don't define CMAKE_CONFIGURATION_TYPES, so we
-# need to add another entry for the CMAKE_BUILD_TYPE.  We also need to add the
-# standerd set of 4 build types (Debug, MinSizeRel, Release, and RelWithDebInfo)
-# for completeness.  We need run this loop in order to accomodate the addition
-# of extra configuration types.  Duplicate entries will be removed by
-# REMOVE_DUPLICATES.
-set(CUDA_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo)
-list(REMOVE_DUPLICATES CUDA_configuration_types)
-foreach(config ${CUDA_configuration_types})
-    string(TOUPPER ${config} config_upper)
-    set(CUDA_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semi-colon delimit multiple arguments.")
-    mark_as_advanced(CUDA_NVCC_FLAGS_${config_upper})
-endforeach()
-
-###############################################################################
-###############################################################################
-# Locate CUDA, Set Build Type, etc.
-###############################################################################
-###############################################################################
-
-macro(cuda_unset_include_and_libraries)
-  unset(CUDA_TOOLKIT_INCLUDE CACHE)
-  unset(CUDA_CUDART_LIBRARY CACHE)
-  unset(CUDA_CUDA_LIBRARY CACHE)
-  # Make sure you run this before you unset CUDA_VERSION.
-  if(CUDA_VERSION VERSION_EQUAL "3.0")
-    # This only existed in the 3.0 version of the CUDA toolkit
-    unset(CUDA_CUDARTEMU_LIBRARY CACHE)
-  endif()
-  unset(CUDA_cudart_static_LIBRARY CACHE)
-  unset(CUDA_cudadevrt_LIBRARY CACHE)
-  unset(CUDA_cublas_LIBRARY CACHE)
-  unset(CUDA_cublas_device_LIBRARY CACHE)
-  unset(CUDA_cublasemu_LIBRARY CACHE)
-  unset(CUDA_cufft_LIBRARY CACHE)
-  unset(CUDA_cufftemu_LIBRARY CACHE)
-  unset(CUDA_cupti_LIBRARY CACHE)
-  unset(CUDA_curand_LIBRARY CACHE)
-  unset(CUDA_cusolver_LIBRARY CACHE)
-  unset(CUDA_cusparse_LIBRARY CACHE)
-  unset(CUDA_npp_LIBRARY CACHE)
-  unset(CUDA_nppc_LIBRARY CACHE)
-  unset(CUDA_nppi_LIBRARY CACHE)
-  unset(CUDA_npps_LIBRARY CACHE)
-  unset(CUDA_nvcuvenc_LIBRARY CACHE)
-  unset(CUDA_nvcuvid_LIBRARY CACHE)
-  unset(CUDA_USE_STATIC_CUDA_RUNTIME CACHE)
-  unset(CUDA_GPU_DETECT_OUTPUT CACHE)
-endmacro()
-
-# Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed,
-# if they have then clear the cache variables, so that will be detected again.
-if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
-  unset(CUDA_TOOLKIT_TARGET_DIR CACHE)
-  unset(CUDA_NVCC_EXECUTABLE CACHE)
-  cuda_unset_include_and_libraries()
-  unset(CUDA_VERSION CACHE)
-endif()
-
-if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}")
-  cuda_unset_include_and_libraries()
-endif()
-
-#
-#  End of unset()
-#
-
-#
-#  Start looking for things
-#
-
-# Search for the cuda distribution.
-if(NOT CUDA_TOOLKIT_ROOT_DIR AND NOT CMAKE_CROSSCOMPILING)
-  # Search in the CUDA_BIN_PATH first.
-  find_path(CUDA_TOOLKIT_ROOT_DIR
-    NAMES nvcc nvcc.exe
-    PATHS
-      ENV CUDA_TOOLKIT_ROOT
-      ENV CUDA_PATH
-      ENV CUDA_BIN_PATH
-    PATH_SUFFIXES bin bin64
-    DOC "Toolkit location."
-    NO_DEFAULT_PATH
-    )
-
-  # Now search default paths
-  find_path(CUDA_TOOLKIT_ROOT_DIR
-    NAMES nvcc nvcc.exe
-    PATHS /opt/cuda/bin
-          /usr/local/bin
-          /usr/local/cuda/bin
-    DOC "Toolkit location."
-    )
-
-  if (CUDA_TOOLKIT_ROOT_DIR)
-    string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR})
-    # We need to force this back into the cache.
-    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR} CACHE PATH "Toolkit location." FORCE)
-    set(CUDA_TOOLKIT_TARGET_DIR ${CUDA_TOOLKIT_ROOT_DIR})
-  endif()
-
-  if (NOT EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
-    if(CUDA_FIND_REQUIRED)
-      message(FATAL_ERROR "Specify CUDA_TOOLKIT_ROOT_DIR")
-    elseif(NOT CUDA_FIND_QUIETLY)
-      message("CUDA_TOOLKIT_ROOT_DIR not found or specified")
-    endif()
-  endif ()
-endif ()
-
-if(CMAKE_CROSSCOMPILING)
-  SET (CUDA_TOOLKIT_ROOT $ENV{CUDA_TOOLKIT_ROOT})
-  if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a")
-    # Support for NVPACK
-    set (CUDA_TOOLKIT_TARGET_NAME "armv7-linux-androideabi")
-  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
-    # Support for arm cross compilation
-    set(CUDA_TOOLKIT_TARGET_NAME "armv7-linux-gnueabihf")
-  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
-    # Support for aarch64 cross compilation
-    if (ANDROID_ARCH_NAME STREQUAL "arm64")
-      set(CUDA_TOOLKIT_TARGET_NAME "aarch64-linux-androideabi")
-    else()
-      set(CUDA_TOOLKIT_TARGET_NAME "aarch64-linux")
-    endif (ANDROID_ARCH_NAME STREQUAL "arm64")
-  endif()
-
-  if (EXISTS "${CUDA_TOOLKIT_ROOT}/targets/${CUDA_TOOLKIT_TARGET_NAME}")
-    set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT}/targets/${CUDA_TOOLKIT_TARGET_NAME}" CACHE PATH "CUDA Toolkit target location.")
-    SET (CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
-    mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR)
-  endif()
-
-  # add known CUDA targetr root path to the set of directories we search for programs, libraries and headers
-  set( CMAKE_FIND_ROOT_PATH "${CUDA_TOOLKIT_TARGET_DIR};${CMAKE_FIND_ROOT_PATH}")
-  macro( cuda_find_host_program )
-    find_host_program( ${ARGN} )
-  endmacro()
-else()
-  # for non-cross-compile, find_host_program == find_program and CUDA_TOOLKIT_TARGET_DIR == CUDA_TOOLKIT_ROOT_DIR
-  macro( cuda_find_host_program )
-    find_program( ${ARGN} )
-  endmacro()
-  SET (CUDA_TOOLKIT_TARGET_DIR ${CUDA_TOOLKIT_ROOT_DIR})
-endif()
-
-
-# CUDA_NVCC_EXECUTABLE
-cuda_find_host_program(CUDA_NVCC_EXECUTABLE
-  NAMES nvcc
-  PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
-  ENV CUDA_PATH
-  ENV CUDA_BIN_PATH
-  PATH_SUFFIXES bin bin64
-  NO_DEFAULT_PATH
-  )
-# Search default search paths, after we search our own set of paths.
-cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
-mark_as_advanced(CUDA_NVCC_EXECUTABLE)
-
-if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
-  # Compute the version.
-  execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
-  string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
-  string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
-  set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}" CACHE STRING "Version of CUDA as computed from nvcc.")
-  mark_as_advanced(CUDA_VERSION)
-else()
-  # Need to set these based off of the cached value
-  string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CUDA_VERSION}")
-  string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}")
-endif()
-
-
-# Always set this convenience variable
-set(CUDA_VERSION_STRING "${CUDA_VERSION}")
-
-# CUDA_TOOLKIT_INCLUDE
-find_path(CUDA_TOOLKIT_INCLUDE
-  device_functions.h # Header included in toolkit
-  PATHS ${CUDA_TOOLKIT_TARGET_DIR}
-  ENV CUDA_PATH
-  ENV CUDA_INC_PATH
-  PATH_SUFFIXES include
-  NO_DEFAULT_PATH
-  )
-# Search default search paths, after we search our own set of paths.
-find_path(CUDA_TOOLKIT_INCLUDE device_functions.h)
-mark_as_advanced(CUDA_TOOLKIT_INCLUDE)
-
-if (CUDA_VERSION VERSION_GREATER "7.0" OR EXISTS "${CUDA_TOOLKIT_INCLUDE}/cuda_fp16.h")
-  set(CUDA_HAS_FP16 TRUE)
-else()
-  set(CUDA_HAS_FP16 FALSE)
-endif()
-
-# Set the user list of include dir to nothing to initialize it.
-set (CUDA_NVCC_INCLUDE_DIRS_USER "")
-set (CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
-
-macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
-  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-    # CUDA 3.2+ on Windows moved the library directories, so we need the new
-    # and old paths.
-    set(_cuda_64bit_lib_dir "${_path_ext}lib/x64" "${_path_ext}lib64" "${_path_ext}libx64" )
-  endif()
-  # CUDA 3.2+ on Windows moved the library directories, so we need to new
-  # (lib/Win32) and the old path (lib).
-  find_library(${_var}
-    NAMES ${_names}
-    PATHS "${CUDA_TOOLKIT_TARGET_DIR}"
-    ENV CUDA_PATH
-    ENV CUDA_LIB_PATH
-    PATH_SUFFIXES ${_cuda_64bit_lib_dir} "${_path_ext}lib/Win32" "${_path_ext}lib" "${_path_ext}libWin32"
-    DOC ${_doc}
-    NO_DEFAULT_PATH
-    )
-  if (NOT CMAKE_CROSSCOMPILING)
-    # Search default search paths, after we search our own set of paths.
-    find_library(${_var}
-      NAMES ${_names}
-      PATHS "/usr/lib/nvidia-current"
-      DOC ${_doc}
-      )
-  endif()
-endmacro()
-
-macro(cuda_find_library_local_first _var _names _doc)
-  cuda_find_library_local_first_with_path_ext( "${_var}" "${_names}" "${_doc}" "" )
-endmacro()
-
-macro(find_library_local_first _var _names _doc )
-  cuda_find_library_local_first( "${_var}" "${_names}" "${_doc}" "" )
-endmacro()
-
-
-# CUDA_LIBRARIES
-cuda_find_library_local_first(CUDA_CUDART_LIBRARY cudart "\"cudart\" library")
-if(CUDA_VERSION VERSION_EQUAL "3.0")
-  # The cudartemu library only existed for the 3.0 version of CUDA.
-  cuda_find_library_local_first(CUDA_CUDARTEMU_LIBRARY cudartemu "\"cudartemu\" library")
-  mark_as_advanced(
-    CUDA_CUDARTEMU_LIBRARY
-    )
-endif()
-
-if(NOT CUDA_VERSION VERSION_LESS "5.5")
-  cuda_find_library_local_first(CUDA_cudart_static_LIBRARY cudart_static "static CUDA runtime library")
-  mark_as_advanced(CUDA_cudart_static_LIBRARY)
-endif()
-
-
-if(CUDA_cudart_static_LIBRARY)
-  # If static cudart available, use it by default, but provide a user-visible option to disable it.
-  option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" ON)
-  set(CUDA_CUDART_LIBRARY_VAR CUDA_cudart_static_LIBRARY)
-else()
-  # If not available, silently disable the option.
-  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "")
-  set(CUDA_CUDART_LIBRARY_VAR CUDA_CUDART_LIBRARY)
-endif()
-if(NOT CUDA_VERSION VERSION_LESS "5.0")
-  cuda_find_library_local_first(CUDA_cudadevrt_LIBRARY cudadevrt "\"cudadevrt\" library")
-  mark_as_advanced(CUDA_cudadevrt_LIBRARY)
-endif()
-
-if(CUDA_USE_STATIC_CUDA_RUNTIME)
-  if(UNIX)
-    # Check for the dependent libraries.  Here we look for pthreads.
-    if (DEFINED CMAKE_THREAD_PREFER_PTHREAD)
-      set(_cuda_cmake_thread_prefer_pthread ${CMAKE_THREAD_PREFER_PTHREAD})
-    endif()
-    set(CMAKE_THREAD_PREFER_PTHREAD 1)
-
-    # Many of the FindXYZ CMake comes with makes use of try_compile with int main(){return 0;}
-    # as the source file.  Unfortunately this causes a warning with -Wstrict-prototypes and
-    # -Werror causes the try_compile to fail.  We will just temporarily disable other flags
-    # when doing the find_package command here.
-    set(_cuda_cmake_c_flags ${CMAKE_C_FLAGS})
-    set(CMAKE_C_FLAGS "-fPIC")
-    find_package(Threads REQUIRED)
-    set(CMAKE_C_FLAGS ${_cuda_cmake_c_flags})
-
-    if (DEFINED _cuda_cmake_thread_prefer_pthread)
-      set(CMAKE_THREAD_PREFER_PTHREAD ${_cuda_cmake_thread_prefer_pthread})
-      unset(_cuda_cmake_thread_prefer_pthread)
-    else()
-      unset(CMAKE_THREAD_PREFER_PTHREAD)
-    endif()
-
-    if(NOT APPLE)
-      #On Linux, you must link against librt when using the static cuda runtime.
-      find_library(CUDA_rt_LIBRARY rt)
-      if (NOT CUDA_rt_LIBRARY)
-        message(WARNING "Expecting to find librt for libcudart_static, but didn't find it.")
-      endif()
-    endif()
-  endif()
-endif()
-
-# CUPTI library showed up in cuda toolkit 4.0
-if(NOT CUDA_VERSION VERSION_LESS "4.0")
-  cuda_find_library_local_first_with_path_ext(CUDA_cupti_LIBRARY cupti "\"cupti\" library" "extras/CUPTI/")
-  mark_as_advanced(CUDA_cupti_LIBRARY)
-endif()
-
-# Set the CUDA_LIBRARIES variable.  This is the set of stuff to link against if you are
-# using the CUDA runtime.  For the dynamic version of the runtime, most of the
-# dependencies are brough in, but for the static version there are additional libraries
-# and linker commands needed.
-# Initialize to empty
-set(CUDA_LIBRARIES)
-
-# If we are using emulation mode and we found the cudartemu library then use
-# that one instead of cudart.
-if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
-  list(APPEND CUDA_LIBRARIES ${CUDA_CUDARTEMU_LIBRARY})
-elseif(CUDA_USE_STATIC_CUDA_RUNTIME AND CUDA_cudart_static_LIBRARY)
-  list(APPEND CUDA_LIBRARIES ${CUDA_cudart_static_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
-  if (CUDA_rt_LIBRARY)
-    list(APPEND CUDA_LIBRARIES ${CUDA_rt_LIBRARY})
-  endif()
-  if(APPLE)
-    # We need to add the default path to the driver (libcuda.dylib) as an rpath, so that
-    # the static cuda runtime can find it at runtime.
-    list(APPEND CUDA_LIBRARIES -Wl,-rpath,/usr/local/cuda/lib)
-  endif()
-else()
-  list(APPEND CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
-endif()
-
-# 1.1 toolkit on linux doesn't appear to have a separate library on
-# some platforms.
-cuda_find_library_local_first(CUDA_CUDA_LIBRARY cuda "\"cuda\" library (older versions only).")
-
-mark_as_advanced(
-  CUDA_CUDA_LIBRARY
-  CUDA_CUDART_LIBRARY
-  )
-
-#######################
-# Look for some of the toolkit helper libraries
-macro(FIND_CUDA_HELPER_LIBS _name)
-  cuda_find_library_local_first(CUDA_${_name}_LIBRARY ${_name} "\"${_name}\" library")
-  mark_as_advanced(CUDA_${_name}_LIBRARY)
-endmacro()
-
-#######################
-# Disable emulation for v3.1 onward
-if(CUDA_VERSION VERSION_GREATER "3.0")
-  if(CUDA_BUILD_EMULATION)
-    message(FATAL_ERROR "CUDA_BUILD_EMULATION is not supported in version 3.1 and onwards.  You must disable it to proceed.  You have version ${CUDA_VERSION}.")
-  endif()
-endif()
-
-# Search for additional CUDA toolkit libraries.
-if(CUDA_VERSION VERSION_LESS "3.1")
-  # Emulation libraries aren't available in version 3.1 onward.
-  find_cuda_helper_libs(cufftemu)
-  find_cuda_helper_libs(cublasemu)
-endif()
-find_cuda_helper_libs(cufft)
-find_cuda_helper_libs(cublas)
-if(NOT CUDA_VERSION VERSION_LESS "3.2")
-  # cusparse showed up in version 3.2
-  find_cuda_helper_libs(cusparse)
-  find_cuda_helper_libs(curand)
-  if (WIN32)
-    find_cuda_helper_libs(nvcuvenc)
-    find_cuda_helper_libs(nvcuvid)
-  endif()
-endif()
-if(CUDA_VERSION VERSION_GREATER "5.0")
-  find_cuda_helper_libs(cublas_device)
-  # In CUDA 5.5 NPP was splitted onto 3 separate libraries.
-  find_cuda_helper_libs(nppc)
-  find_cuda_helper_libs(nppi)
-  find_cuda_helper_libs(npps)
-  set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}")
-elseif(NOT CUDA_VERSION VERSION_LESS "4.0")
-  find_cuda_helper_libs(npp)
-endif()
-if(NOT CUDA_VERSION VERSION_LESS "7.0")
-  # cusolver showed up in version 7.0
-  find_cuda_helper_libs(cusolver)
-endif()
-
-if (CUDA_BUILD_EMULATION)
-  set(CUDA_CUFFT_LIBRARIES ${CUDA_cufftemu_LIBRARY})
-  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY})
-else()
-  set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY})
-  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
-endif()
-
-########################
-# Look for the SDK stuff.  As of CUDA 3.0 NVSDKCUDA_ROOT has been replaced with
-# NVSDKCOMPUTE_ROOT with the old CUDA C contents moved into the C subdirectory
-find_path(CUDA_SDK_ROOT_DIR common/inc/cutil.h
- HINTS
-  "$ENV{NVSDKCOMPUTE_ROOT}/C"
-  ENV NVSDKCUDA_ROOT
-  "[HKEY_LOCAL_MACHINE\\SOFTWARE\\NVIDIA Corporation\\Installed Products\\NVIDIA SDK 10\\Compute;InstallDir]"
- PATHS
-  "/Developer/GPU\ Computing/C"
-  )
-
-# Keep the CUDA_SDK_ROOT_DIR first in order to be able to override the
-# environment variables.
-set(CUDA_SDK_SEARCH_PATH
-  "${CUDA_SDK_ROOT_DIR}"
-  "${CUDA_TOOLKIT_ROOT_DIR}/local/NVSDK0.2"
-  "${CUDA_TOOLKIT_ROOT_DIR}/NVSDK0.2"
-  "${CUDA_TOOLKIT_ROOT_DIR}/NV_CUDA_SDK"
-  "$ENV{HOME}/NVIDIA_CUDA_SDK"
-  "$ENV{HOME}/NVIDIA_CUDA_SDK_MACOSX"
-  "/Developer/CUDA"
-  )
-
-# Example of how to find an include file from the CUDA_SDK_ROOT_DIR
-
-# find_path(CUDA_CUT_INCLUDE_DIR
-#   cutil.h
-#   PATHS ${CUDA_SDK_SEARCH_PATH}
-#   PATH_SUFFIXES "common/inc"
-#   DOC "Location of cutil.h"
-#   NO_DEFAULT_PATH
-#   )
-# # Now search system paths
-# find_path(CUDA_CUT_INCLUDE_DIR cutil.h DOC "Location of cutil.h")
-
-# mark_as_advanced(CUDA_CUT_INCLUDE_DIR)
-
-
-# Example of how to find a library in the CUDA_SDK_ROOT_DIR
-
-# # cutil library is called cutil64 for 64 bit builds on windows.  We don't want
-# # to get these confused, so we are setting the name based on the word size of
-# # the build.
-
-# if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-#   set(cuda_cutil_name cutil64)
-# else()
-#   set(cuda_cutil_name cutil32)
-# endif()
-
-# find_library(CUDA_CUT_LIBRARY
-#   NAMES cutil ${cuda_cutil_name}
-#   PATHS ${CUDA_SDK_SEARCH_PATH}
-#   # The new version of the sdk shows up in common/lib, but the old one is in lib
-#   PATH_SUFFIXES "common/lib" "lib"
-#   DOC "Location of cutil library"
-#   NO_DEFAULT_PATH
-#   )
-# # Now search system paths
-# find_library(CUDA_CUT_LIBRARY NAMES cutil ${cuda_cutil_name} DOC "Location of cutil library")
-# mark_as_advanced(CUDA_CUT_LIBRARY)
-# set(CUDA_CUT_LIBRARIES ${CUDA_CUT_LIBRARY})
-
-
-
-#############################
-# Check for required components
-set(CUDA_FOUND TRUE)
-
-set(CUDA_TOOLKIT_ROOT_DIR_INTERNAL "${CUDA_TOOLKIT_ROOT_DIR}" CACHE INTERNAL
-  "This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was set successfully." FORCE)
-set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL
-  "This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE)
-set(CUDA_SDK_ROOT_DIR_INTERNAL "${CUDA_SDK_ROOT_DIR}" CACHE INTERNAL
-  "This is the value of the last time CUDA_SDK_ROOT_DIR was set successfully." FORCE)
-
-#include(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake)
-
-find_package_handle_standard_args(CUDA
-  REQUIRED_VARS
-    CUDA_TOOLKIT_ROOT_DIR
-    CUDA_NVCC_EXECUTABLE
-    CUDA_INCLUDE_DIRS
-    ${CUDA_CUDART_LIBRARY_VAR}
-  VERSION_VAR
-    CUDA_VERSION
-  )
-
-
-
-###############################################################################
-###############################################################################
-# Macros
-###############################################################################
-###############################################################################
-
-###############################################################################
-# Add include directories to pass to the nvcc command.
-macro(CUDA_INCLUDE_DIRECTORIES)
-  foreach(dir ${ARGN})
-    list(APPEND CUDA_NVCC_INCLUDE_DIRS_USER ${dir})
-  endforeach()
-endmacro()
-
-
-##############################################################################
-cuda_find_helper_file(parse_cubin cmake)
-cuda_find_helper_file(make2cmake cmake)
-cuda_find_helper_file(run_nvcc cmake)
-include("${CMAKE_CURRENT_LIST_DIR}/FindCUDA/select_compute_arch.cmake")
-
-##############################################################################
-# Separate the OPTIONS out from the sources
-#
-macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options)
-  set( ${_sources} )
-  set( ${_cmake_options} )
-  set( ${_options} )
-  set( _found_options FALSE )
-  foreach(arg ${ARGN})
-    if("x${arg}" STREQUAL "xOPTIONS")
-      set( _found_options TRUE )
-    elseif(
-        "x${arg}" STREQUAL "xWIN32" OR
-        "x${arg}" STREQUAL "xMACOSX_BUNDLE" OR
-        "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
-        "x${arg}" STREQUAL "xSTATIC" OR
-        "x${arg}" STREQUAL "xSHARED" OR
-        "x${arg}" STREQUAL "xMODULE"
-        )
-      list(APPEND ${_cmake_options} ${arg})
-    else()
-      if ( _found_options )
-        list(APPEND ${_options} ${arg})
-      else()
-        # Assume this is a file
-        list(APPEND ${_sources} ${arg})
-      endif()
-    endif()
-  endforeach()
-endmacro()
-
-##############################################################################
-# Parse the OPTIONS from ARGN and set the variables prefixed by _option_prefix
-#
-macro(CUDA_PARSE_NVCC_OPTIONS _option_prefix)
-  set( _found_config )
-  foreach(arg ${ARGN})
-    # Determine if we are dealing with a perconfiguration flag
-    foreach(config ${CUDA_configuration_types})
-      string(TOUPPER ${config} config_upper)
-      if (arg STREQUAL "${config_upper}")
-        set( _found_config _${arg})
-        # Set arg to nothing to keep it from being processed further
-        set( arg )
-      endif()
-    endforeach()
-
-    if ( arg )
-      list(APPEND ${_option_prefix}${_found_config} "${arg}")
-    endif()
-  endforeach()
-endmacro()
-
-##############################################################################
-# Helper to add the include directory for CUDA only once
-function(CUDA_ADD_CUDA_INCLUDE_ONCE)
-  get_directory_property(_include_directories INCLUDE_DIRECTORIES)
-  set(_add TRUE)
-  if(_include_directories)
-    foreach(dir ${_include_directories})
-      if("${dir}" STREQUAL "${CUDA_INCLUDE_DIRS}")
-        set(_add FALSE)
-      endif()
-    endforeach()
-  endif()
-  if(_add)
-    include_directories(${CUDA_INCLUDE_DIRS})
-  endif()
-endfunction()
-
-function(CUDA_BUILD_SHARED_LIBRARY shared_flag)
-  set(cmake_args ${ARGN})
-  # If SHARED, MODULE, or STATIC aren't already in the list of arguments, then
-  # add SHARED or STATIC based on the value of BUILD_SHARED_LIBS.
-  list(FIND cmake_args SHARED _cuda_found_SHARED)
-  list(FIND cmake_args MODULE _cuda_found_MODULE)
-  list(FIND cmake_args STATIC _cuda_found_STATIC)
-  if( _cuda_found_SHARED GREATER -1 OR
-      _cuda_found_MODULE GREATER -1 OR
-      _cuda_found_STATIC GREATER -1)
-    set(_cuda_build_shared_libs)
-  else()
-    if (BUILD_SHARED_LIBS)
-      set(_cuda_build_shared_libs SHARED)
-    else()
-      set(_cuda_build_shared_libs STATIC)
-    endif()
-  endif()
-  set(${shared_flag} ${_cuda_build_shared_libs} PARENT_SCOPE)
-endfunction()
-
-##############################################################################
-# Helper to avoid clashes of files with the same basename but different paths.
-# This doesn't attempt to do exactly what CMake internals do, which is to only
-# add this path when there is a conflict, since by the time a second collision
-# in names is detected it's already too late to fix the first one.  For
-# consistency sake the relative path will be added to all files.
-function(CUDA_COMPUTE_BUILD_PATH path build_path)
-  #message("CUDA_COMPUTE_BUILD_PATH([${path}] ${build_path})")
-  # Only deal with CMake style paths from here on out
-  file(TO_CMAKE_PATH "${path}" bpath)
-  if (IS_ABSOLUTE "${bpath}")
-    # Absolute paths are generally unnessary, especially if something like
-    # file(GLOB_RECURSE) is used to pick up the files.
-
-    string(FIND "${bpath}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_dir_pos)
-    if (_binary_dir_pos EQUAL 0)
-      file(RELATIVE_PATH bpath "${CMAKE_CURRENT_BINARY_DIR}" "${bpath}")
-    else()
-      file(RELATIVE_PATH bpath "${CMAKE_CURRENT_SOURCE_DIR}" "${bpath}")
-    endif()
-  endif()
-
-  # This recipe is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
-  # CMake source.
-
-  # Remove leading /
-  string(REGEX REPLACE "^[/]+" "" bpath "${bpath}")
-  # Avoid absolute paths by removing ':'
-  string(REPLACE ":" "_" bpath "${bpath}")
-  # Avoid relative paths that go up the tree
-  string(REPLACE "../" "__/" bpath "${bpath}")
-  # Avoid spaces
-  string(REPLACE " " "_" bpath "${bpath}")
-
-  # Strip off the filename.  I wait until here to do it, since removin the
-  # basename can make a path that looked like path/../basename turn into
-  # path/.. (notice the trailing slash).
-  get_filename_component(bpath "${bpath}" PATH)
-
-  set(${build_path} "${bpath}" PARENT_SCOPE)
-  #message("${build_path} = ${bpath}")
-endfunction()
-
-##############################################################################
-# This helper macro populates the following variables and setups up custom
-# commands and targets to invoke the nvcc compiler to generate C or PTX source
-# dependent upon the format parameter.  The compiler is invoked once with -M
-# to generate a dependency file and a second time with -cuda or -ptx to generate
-# a .cpp or .ptx file.
-# INPUT:
-#   cuda_target         - Target name
-#   format              - PTX, CUBIN, FATBIN or OBJ
-#   FILE1 .. FILEN      - The remaining arguments are the sources to be wrapped.
-#   OPTIONS             - Extra options to NVCC
-# OUTPUT:
-#   generated_files     - List of generated files
-##############################################################################
-##############################################################################
-
-macro(CUDA_WRAP_SRCS cuda_target format generated_files)
-
-  # Put optional arguments in list.
-  set(_argn_list "${ARGN}")
-  # If one of the given optional arguments is "PHONY", make a note of it, then
-  # remove it from the list.
-  list(FIND _argn_list "PHONY" _phony_idx)
-  if("${_phony_idx}" GREATER "-1")
-    set(_target_is_phony true)
-    list(REMOVE_AT _argn_list ${_phony_idx})
-  else()
-    set(_target_is_phony false)
-  endif()
-
-  # If CMake doesn't support separable compilation, complain
-  if(CUDA_SEPARABLE_COMPILATION AND CMAKE_VERSION VERSION_LESS "2.8.10.1")
-    message(SEND_ERROR "CUDA_SEPARABLE_COMPILATION isn't supported for CMake versions less than 2.8.10.1")
-  endif()
-
-  # Set up all the command line flags here, so that they can be overridden on a per target basis.
-
-  set(nvcc_flags "")
-
-  # Emulation if the card isn't present.
-  if (CUDA_BUILD_EMULATION)
-    # Emulation.
-    set(nvcc_flags ${nvcc_flags} --device-emulation -D_DEVICEEMU -g)
-  else()
-    # Device mode.  No flags necessary.
-  endif()
-
-  if(CUDA_HOST_COMPILATION_CPP)
-    set(CUDA_C_OR_CXX CXX)
-  else()
-    if(CUDA_VERSION VERSION_LESS "3.0")
-      set(nvcc_flags ${nvcc_flags} --host-compilation C)
-    else()
-      message(WARNING "--host-compilation flag is deprecated in CUDA version >= 3.0.  Removing --host-compilation C flag" )
-    endif()
-    set(CUDA_C_OR_CXX C)
-  endif()
-
-  set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION})
-
-  if(CUDA_64_BIT_DEVICE_CODE)
-    set(nvcc_flags ${nvcc_flags} -m64)
-  else()
-    set(nvcc_flags ${nvcc_flags} -m32)
-  endif()
-
-  if(CUDA_TARGET_CPU_ARCH)
-    set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
-  endif()
-
-  # This needs to be passed in at this stage, because VS needs to fill out the
-  # value of VCInstallDir from within VS.  Note that CCBIN is only used if
-  # -ccbin or --compiler-bindir isn't used and CUDA_HOST_COMPILER matches
-  # $(VCInstallDir)/bin.
-  if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    set(ccbin_flags -D "\"CCBIN:PATH=$(VCInstallDir)bin\"" )
-  else()
-    set(ccbin_flags)
-  endif()
-
-  # Figure out which configure we will use and pass that in as an argument to
-  # the script.  We need to defer the decision until compilation time, because
-  # for VS projects we won't know if we are making a debug or release build
-  # until build time.
-  if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    set( CUDA_build_configuration "$(ConfigurationName)" )
-  else()
-    set( CUDA_build_configuration "${CMAKE_BUILD_TYPE}")
-  endif()
-
-  # Initialize our list of includes with the user ones followed by the CUDA system ones.
-  set(CUDA_NVCC_INCLUDE_DIRS ${CUDA_NVCC_INCLUDE_DIRS_USER} "${CUDA_INCLUDE_DIRS}")
-  if(_target_is_phony)
-    # If the passed in target name isn't a real target (i.e., this is from a call to one of the
-    # cuda_compile_* functions), need to query directory properties to get include directories
-    # and compile definitions.
-    get_directory_property(_dir_include_dirs INCLUDE_DIRECTORIES)
-    get_directory_property(_dir_compile_defs COMPILE_DEFINITIONS)
-
-    list(APPEND CUDA_NVCC_INCLUDE_DIRS "${_dir_include_dirs}")
-    set(CUDA_NVCC_COMPILE_DEFINITIONS "${_dir_compile_defs}")
-  else()
-    # Append the include directories for this target via generator expression, which is
-    # expanded by the FILE(GENERATE) call below.  This generator expression captures all
-    # include dirs set by the user, whether via directory properties or target properties
-    list(APPEND CUDA_NVCC_INCLUDE_DIRS "$<TARGET_PROPERTY:${cuda_target},INCLUDE_DIRECTORIES>")
-
-    # Do the same thing with compile definitions
-    set(CUDA_NVCC_COMPILE_DEFINITIONS "$<TARGET_PROPERTY:${cuda_target},COMPILE_DEFINITIONS>")
-  endif()
-
-
-  # Reset these variables
-  set(CUDA_WRAP_OPTION_NVCC_FLAGS)
-  foreach(config ${CUDA_configuration_types})
-    string(TOUPPER ${config} config_upper)
-    set(CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper})
-  endforeach()
-
-  CUDA_GET_SOURCES_AND_OPTIONS(_cuda_wrap_sources _cuda_wrap_cmake_options _cuda_wrap_options ${_argn_list})
-  CUDA_PARSE_NVCC_OPTIONS(CUDA_WRAP_OPTION_NVCC_FLAGS ${_cuda_wrap_options})
-
-  # Figure out if we are building a shared library.  BUILD_SHARED_LIBS is
-  # respected in CUDA_ADD_LIBRARY.
-  set(_cuda_build_shared_libs FALSE)
-  # SHARED, MODULE
-  list(FIND _cuda_wrap_cmake_options SHARED _cuda_found_SHARED)
-  list(FIND _cuda_wrap_cmake_options MODULE _cuda_found_MODULE)
-  if(_cuda_found_SHARED GREATER -1 OR _cuda_found_MODULE GREATER -1)
-    set(_cuda_build_shared_libs TRUE)
-  endif()
-  # STATIC
-  list(FIND _cuda_wrap_cmake_options STATIC _cuda_found_STATIC)
-  if(_cuda_found_STATIC GREATER -1)
-    set(_cuda_build_shared_libs FALSE)
-  endif()
-
-  # CUDA_HOST_FLAGS
-  if(_cuda_build_shared_libs)
-    # If we are setting up code for a shared library, then we need to add extra flags for
-    # compiling objects for shared libraries.
-    set(CUDA_HOST_SHARED_FLAGS ${CMAKE_SHARED_LIBRARY_${CUDA_C_OR_CXX}_FLAGS})
-  else()
-    set(CUDA_HOST_SHARED_FLAGS)
-  endif()
-  # Only add the CMAKE_{C,CXX}_FLAGS if we are propagating host flags.  We
-  # always need to set the SHARED_FLAGS, though.
-  if(CUDA_PROPAGATE_HOST_FLAGS)
-    set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CMAKE_${CUDA_C_OR_CXX}_FLAGS} ${CUDA_HOST_SHARED_FLAGS})")
-  else()
-    set(_cuda_host_flags "set(CMAKE_HOST_FLAGS ${CUDA_HOST_SHARED_FLAGS})")
-  endif()
-
-  set(_cuda_nvcc_flags_config "# Build specific configuration flags")
-  # Loop over all the configuration types to generate appropriate flags for run_nvcc.cmake
-  foreach(config ${CUDA_configuration_types})
-    string(TOUPPER ${config} config_upper)
-    # CMAKE_FLAGS are strings and not lists.  By not putting quotes around CMAKE_FLAGS
-    # we convert the strings to lists (like we want).
-
-    if(CUDA_PROPAGATE_HOST_FLAGS)
-      # nvcc chokes on -g3 in versions previous to 3.0, so replace it with -g
-      set(_cuda_fix_g3 FALSE)
-
-      if(CMAKE_COMPILER_IS_GNUCC)
-        if (CUDA_VERSION VERSION_LESS  "3.0" OR
-            CUDA_VERSION VERSION_EQUAL "4.1" OR
-            CUDA_VERSION VERSION_EQUAL "4.2"
-            )
-          set(_cuda_fix_g3 TRUE)
-        endif()
-      endif()
-      if(_cuda_fix_g3)
-        string(REPLACE "-g3" "-g" _cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
-      else()
-        set(_cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
-      endif()
-
-      string(APPEND _cuda_host_flags "\nset(CMAKE_HOST_FLAGS_${config_upper} ${_cuda_C_FLAGS})")
-    endif()
-
-    # Note that if we ever want CUDA_NVCC_FLAGS_<CONFIG> to be string (instead of a list
-    # like it is currently), we can remove the quotes around the
-    # ${CUDA_NVCC_FLAGS_${config_upper}} variable like the CMAKE_HOST_FLAGS_<CONFIG> variable.
-    string(APPEND _cuda_nvcc_flags_config "\nset(CUDA_NVCC_FLAGS_${config_upper} ${CUDA_NVCC_FLAGS_${config_upper}} ;; ${CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}})")
-  endforeach()
-
-  # Process the C++11 flag.  If the host sets the flag, we need to add it to nvcc and
-  # remove it from the host. This is because -Xcompile -std=c++ will choke nvcc (it uses
-  # the C preprocessor).  In order to get this to work correctly, we need to use nvcc's
-  # specific c++11 flag.
-  if( "${_cuda_host_flags}" MATCHES "-std=c\\+\\+11")
-    # Add the c++11 flag to nvcc if it isn't already present.  Note that we only look at
-    # the main flag instead of the configuration specific flags.
-    if( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std;c\\+\\+11" )
-      list(APPEND nvcc_flags --std c++11)
-    endif()
-    string(REGEX REPLACE "[-]+std=c\\+\\+11" "" _cuda_host_flags "${_cuda_host_flags}")
-  endif()
-
-  if(_cuda_build_shared_libs)
-    list(APPEND nvcc_flags "-D${cuda_target}_EXPORTS")
-  endif()
-
-  # Reset the output variable
-  set(_cuda_wrap_generated_files "")
-
-  # Iterate over the macro arguments and create custom
-  # commands for all the .cu files.
-  foreach(file ${_argn_list})
-    # Ignore any file marked as a HEADER_FILE_ONLY
-    get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
-    # Allow per source file overrides of the format.  Also allows compiling non-.cu files.
-    get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT)
-    if((${file} MATCHES "\\.cu$" OR _cuda_source_format) AND NOT _is_header)
-
-      if(NOT _cuda_source_format)
-        set(_cuda_source_format ${format})
-      endif()
-      # If file isn't a .cu file, we need to tell nvcc to treat it as such.
-      if(NOT ${file} MATCHES "\\.cu$")
-        set(cuda_language_flag -x=cu)
-      else()
-        set(cuda_language_flag)
-      endif()
-
-      if( ${_cuda_source_format} MATCHES "OBJ")
-        set( cuda_compile_to_external_module OFF )
-      else()
-        set( cuda_compile_to_external_module ON )
-        if( ${_cuda_source_format} MATCHES "PTX" )
-          set( cuda_compile_to_external_module_type "ptx" )
-        elseif( ${_cuda_source_format} MATCHES "CUBIN")
-          set( cuda_compile_to_external_module_type "cubin" )
-        elseif( ${_cuda_source_format} MATCHES "FATBIN")
-          set( cuda_compile_to_external_module_type "fatbin" )
-        else()
-          message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS or set with CUDA_SOURCE_PROPERTY_FORMAT file property for file '${file}': '${_cuda_source_format}'.  Use OBJ, PTX, CUBIN or FATBIN.")
-        endif()
-      endif()
-
-      if(cuda_compile_to_external_module)
-        # Don't use any of the host compilation flags for PTX targets.
-        set(CUDA_HOST_FLAGS)
-        set(CUDA_NVCC_FLAGS_CONFIG)
-      else()
-        set(CUDA_HOST_FLAGS ${_cuda_host_flags})
-        set(CUDA_NVCC_FLAGS_CONFIG ${_cuda_nvcc_flags_config})
-      endif()
-
-      # Determine output directory
-      cuda_compute_build_path("${file}" cuda_build_path)
-      set(cuda_compile_intermediate_directory "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${cuda_build_path}")
-      if(CUDA_GENERATED_OUTPUT_DIR)
-        set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}")
-      else()
-        if ( cuda_compile_to_external_module )
-          set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
-        else()
-          set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}")
-        endif()
-      endif()
-
-      # Add a custom target to generate a c or ptx file. ######################
-
-      get_filename_component( basename ${file} NAME )
-      if( cuda_compile_to_external_module )
-        set(generated_file_path "${cuda_compile_output_dir}")
-        set(generated_file_basename "${cuda_target}_generated_${basename}.${cuda_compile_to_external_module_type}")
-        set(format_flag "-${cuda_compile_to_external_module_type}")
-        file(MAKE_DIRECTORY "${cuda_compile_output_dir}")
-      else()
-        set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}")
-        set(generated_file_basename "${cuda_target}_generated_${basename}${generated_extension}")
-        if(CUDA_SEPARABLE_COMPILATION)
-          set(format_flag "-dc")
-        else()
-          set(format_flag "-c")
-        endif()
-      endif()
-
-      # Set all of our file names.  Make sure that whatever filenames that have
-      # generated_file_path in them get passed in through as a command line
-      # argument, so that the ${CMAKE_CFG_INTDIR} gets expanded at run time
-      # instead of configure time.
-      set(generated_file "${generated_file_path}/${generated_file_basename}")
-      set(cmake_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.depend")
-      set(NVCC_generated_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.NVCC-depend")
-      set(generated_cubin_file "${generated_file_path}/${generated_file_basename}.cubin.txt")
-      set(custom_target_script_pregen "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake.pre-gen")
-      set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}$<$<BOOL:$<CONFIG>>:.$<CONFIG>>.cmake")
-
-      # Setup properties for obj files:
-      if( NOT cuda_compile_to_external_module )
-        set_source_files_properties("${generated_file}"
-          PROPERTIES
-          EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked.
-          )
-      endif()
-
-      # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path.
-      get_filename_component(file_path "${file}" PATH)
-      if(IS_ABSOLUTE "${file_path}")
-        set(source_file "${file}")
-      else()
-        set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
-      endif()
-
-      if( NOT cuda_compile_to_external_module AND CUDA_SEPARABLE_COMPILATION)
-        list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}")
-      endif()
-
-      # Bring in the dependencies.  Creates a variable CUDA_NVCC_DEPEND #######
-      cuda_include_nvcc_dependencies(${cmake_dependency_file})
-
-      # Convience string for output ###########################################
-      if(CUDA_BUILD_EMULATION)
-        set(cuda_build_type "Emulation")
-      else()
-        set(cuda_build_type "Device")
-      endif()
-
-      # Build the NVCC made dependency file ###################################
-      set(build_cubin OFF)
-      if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN )
-         if ( NOT cuda_compile_to_external_module )
-           set ( build_cubin ON )
-         endif()
-      endif()
-
-      # Configure the build script
-      configure_file("${CUDA_run_nvcc}" "${custom_target_script_pregen}" @ONLY)
-      file(GENERATE
-        OUTPUT "${custom_target_script}"
-        INPUT "${custom_target_script_pregen}"
-        )
-
-      # So if a user specifies the same cuda file as input more than once, you
-      # can have bad things happen with dependencies.  Here we check an option
-      # to see if this is the behavior they want.
-      if(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE)
-        set(main_dep MAIN_DEPENDENCY ${source_file})
-      else()
-        set(main_dep DEPENDS ${source_file})
-      endif()
-
-      if(CUDA_VERBOSE_BUILD)
-        set(verbose_output ON)
-      elseif(CMAKE_GENERATOR MATCHES "Makefiles")
-        set(verbose_output "$(VERBOSE)")
-      else()
-        set(verbose_output OFF)
-      endif()
-
-      # Create up the comment string
-      file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
-      if(cuda_compile_to_external_module)
-        set(cuda_build_comment_string "Building NVCC ${cuda_compile_to_external_module_type} file ${generated_file_relative_path}")
-      else()
-        set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}")
-      endif()
-
-      set(_verbatim VERBATIM)
-      if(ccbin_flags MATCHES "\\$\\(VCInstallDir\\)")
-        set(_verbatim "")
-      endif()
-
-      # Build the generated file and dependency file ##########################
-      add_custom_command(
-        OUTPUT ${generated_file}
-        # These output files depend on the source_file and the contents of cmake_dependency_file
-        ${main_dep}
-        DEPENDS ${CUDA_NVCC_DEPEND}
-        DEPENDS ${custom_target_script}
-        # Make sure the output directory exists before trying to write to it.
-        COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}"
-        COMMAND ${CMAKE_COMMAND} ARGS
-          -D verbose:BOOL=${verbose_output}
-          ${ccbin_flags}
-          -D build_configuration:STRING=${CUDA_build_configuration}
-          -D "generated_file:STRING=${generated_file}"
-          -D "generated_cubin_file:STRING=${generated_cubin_file}"
-          -P "${custom_target_script}"
-        WORKING_DIRECTORY "${cuda_compile_intermediate_directory}"
-        COMMENT "${cuda_build_comment_string}"
-        ${_verbatim}
-        )
-
-      # Make sure the build system knows the file is generated.
-      set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE)
-
-      list(APPEND _cuda_wrap_generated_files ${generated_file})
-
-      # Add the other files that we want cmake to clean on a cleanup ##########
-      list(APPEND CUDA_ADDITIONAL_CLEAN_FILES "${cmake_dependency_file}")
-      list(REMOVE_DUPLICATES CUDA_ADDITIONAL_CLEAN_FILES)
-      set(CUDA_ADDITIONAL_CLEAN_FILES ${CUDA_ADDITIONAL_CLEAN_FILES} CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.")
-
-    endif()
-  endforeach()
-
-  # Set the return parameter
-  set(${generated_files} ${_cuda_wrap_generated_files})
-endmacro()
-
-function(_cuda_get_important_host_flags important_flags flag_string)
-  if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    string(REGEX MATCHALL "/M[DT][d]?" flags "${flag_string}")
-    list(APPEND ${important_flags} ${flags})
-  else()
-    string(REGEX MATCHALL "-fPIC" flags "${flag_string}")
-    list(APPEND ${important_flags} ${flags})
-  endif()
-  set(${important_flags} ${${important_flags}} PARENT_SCOPE)
-endfunction()
-
-###############################################################################
-###############################################################################
-# Separable Compilation Link
-###############################################################################
-###############################################################################
-
-# Compute the filename to be used by CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS
-function(CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME output_file_var cuda_target object_files)
-  if (object_files)
-    set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION})
-    set(output_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${CMAKE_CFG_INTDIR}/${cuda_target}_intermediate_link${generated_extension}")
-  else()
-    set(output_file)
-  endif()
-
-  set(${output_file_var} "${output_file}" PARENT_SCOPE)
-endfunction()
-
-# Setup the build rule for the separable compilation intermediate link file.
-function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options object_files)
-  if (object_files)
-
-    set_source_files_properties("${output_file}"
-      PROPERTIES
-      EXTERNAL_OBJECT TRUE # This is an object file not to be compiled, but only
-                           # be linked.
-      GENERATED TRUE       # This file is generated during the build
-      )
-
-    # For now we are ignoring all the configuration specific flags.
-    set(nvcc_flags)
-    CUDA_PARSE_NVCC_OPTIONS(nvcc_flags ${options})
-    if(CUDA_64_BIT_DEVICE_CODE)
-      list(APPEND nvcc_flags -m64)
-    else()
-      list(APPEND nvcc_flags -m32)
-    endif()
-    # If -ccbin, --compiler-bindir has been specified, don't do anything.  Otherwise add it here.
-    list( FIND nvcc_flags "-ccbin" ccbin_found0 )
-    list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
-    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
-      # Match VERBATIM check below.
-      if(CUDA_HOST_COMPILER MATCHES "\\$\\(VCInstallDir\\)")
-        list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
-      else()
-        list(APPEND nvcc_flags -ccbin "${CUDA_HOST_COMPILER}")
-      endif()
-    endif()
-
-    # Create a list of flags specified by CUDA_NVCC_FLAGS_${CONFIG} and CMAKE_${CUDA_C_OR_CXX}_FLAGS*
-    set(config_specific_flags)
-    set(flags)
-    foreach(config ${CUDA_configuration_types})
-      string(TOUPPER ${config} config_upper)
-      # Add config specific flags
-      foreach(f ${CUDA_NVCC_FLAGS_${config_upper}})
-        list(APPEND config_specific_flags $<$<CONFIG:${config}>:${f}>)
-      endforeach()
-      set(important_host_flags)
-      _cuda_get_important_host_flags(important_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
-      foreach(f ${important_host_flags})
-        list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
-      endforeach()
-    endforeach()
-    # Add CMAKE_${CUDA_C_OR_CXX}_FLAGS
-    set(important_host_flags)
-    _cuda_get_important_host_flags(important_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS}")
-    foreach(f ${important_host_flags})
-      list(APPEND flags -Xcompiler ${f})
-    endforeach()
-
-    # Add our general CUDA_NVCC_FLAGS with the configuration specifig flags
-    set(nvcc_flags ${CUDA_NVCC_FLAGS} ${config_specific_flags} ${nvcc_flags})
-
-    file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")
-
-    # Some generators don't handle the multiple levels of custom command
-    # dependencies correctly (obj1 depends on file1, obj2 depends on obj1), so
-    # we work around that issue by compiling the intermediate link object as a
-    # pre-link custom command in that situation.
-    set(do_obj_build_rule TRUE)
-    if (MSVC_VERSION GREATER 1599 AND MSVC_VERSION LESS 1800)
-      # VS 2010 and 2012 have this problem.
-      set(do_obj_build_rule FALSE)
-    endif()
-
-    set(_verbatim VERBATIM)
-    if(nvcc_flags MATCHES "\\$\\(VCInstallDir\\)")
-      set(_verbatim "")
-    endif()
-
-    if (do_obj_build_rule)
-      add_custom_command(
-        OUTPUT ${output_file}
-        DEPENDS ${object_files}
-        COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} -o ${output_file}
-        ${flags}
-        COMMENT "Building NVCC intermediate link file ${output_file_relative_path}"
-        ${_verbatim}
-        )
-    else()
-      get_filename_component(output_file_dir "${output_file}" DIRECTORY)
-      add_custom_command(
-        TARGET ${cuda_target}
-        PRE_LINK
-        COMMAND ${CMAKE_COMMAND} -E echo "Building NVCC intermediate link file ${output_file_relative_path}"
-        COMMAND ${CMAKE_COMMAND} -E make_directory "${output_file_dir}"
-        COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} -o "${output_file}"
-        ${_verbatim}
-        )
-    endif()
- endif()
-endfunction()
-
-###############################################################################
-###############################################################################
-# ADD LIBRARY
-###############################################################################
-###############################################################################
-macro(CUDA_ADD_LIBRARY cuda_target)
-
-  CUDA_ADD_CUDA_INCLUDE_ONCE()
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  CUDA_BUILD_SHARED_LIBRARY(_cuda_shared_flag ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources}
-    ${_cmake_options} ${_cuda_shared_flag}
-    OPTIONS ${_options} )
-
-  # Compute the file name of the intermedate link file used for separable
-  # compilation.
-  CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
-
-  # Add the library.
-  add_library(${cuda_target} ${_cmake_options}
-    ${_generated_files}
-    ${_sources}
-    ${link_file}
-    )
-
-  # Add a link phase for the separable compilation if it has been enabled.  If
-  # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
-  # variable will have been defined.
-  CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
-
-  target_link_libraries(${cuda_target} PUBLIC
-    ${CUDA_LIBRARIES}
-    )
-
-  if(CUDA_SEPARABLE_COMPILATION)
-    target_link_libraries(${cuda_target}
-      ${CUDA_cudadevrt_LIBRARY}
-      )
-  endif()
-
-  # We need to set the linker language based on what the expected generated file
-  # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
-  set_target_properties(${cuda_target}
-    PROPERTIES
-    LINKER_LANGUAGE ${CUDA_C_OR_CXX}
-    )
-
-endmacro()
-
-
-###############################################################################
-###############################################################################
-# ADD EXECUTABLE
-###############################################################################
-###############################################################################
-macro(CUDA_ADD_EXECUTABLE cuda_target)
-
-  CUDA_ADD_CUDA_INCLUDE_ONCE()
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources} OPTIONS ${_options} )
-
-  # Compute the file name of the intermedate link file used for separable
-  # compilation.
-  CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
-
-  # Add the library.
-  add_executable(${cuda_target} ${_cmake_options}
-    ${_generated_files}
-    ${_sources}
-    ${link_file}
-    )
-
-  # Add a link phase for the separable compilation if it has been enabled.  If
-  # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
-  # variable will have been defined.
-  CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
-
-  target_link_libraries(${cuda_target} PUBLIC ${CUDA_LIBRARIES})
-
-  # We need to set the linker language based on what the expected generated file
-  # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
-  set_target_properties(${cuda_target}
-    PROPERTIES
-    LINKER_LANGUAGE ${CUDA_C_OR_CXX}
-    )
-
-endmacro()
-
-
-###############################################################################
-###############################################################################
-# (Internal) helper for manually added cuda source files with specific targets
-###############################################################################
-###############################################################################
-macro(cuda_compile_base cuda_target format generated_files)
-  # Update a counter in this directory, to keep phony target names unique.
-  set(_cuda_target "${cuda_target}")
-  get_property(_counter DIRECTORY PROPERTY _cuda_internal_phony_counter)
-  if(_counter)
-    math(EXPR _counter "${_counter} + 1")
-  else()
-    set(_counter 1)
-  endif()
-  set(_cuda_target "${_cuda_target}_${_counter}")
-  set_property(DIRECTORY PROPERTY _cuda_internal_phony_counter ${_counter})
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( ${_cuda_target} ${format} _generated_files ${_sources}
-                  ${_cmake_options} OPTIONS ${_options} PHONY)
-
-  set( ${generated_files} ${_generated_files})
-
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA COMPILE
-###############################################################################
-###############################################################################
-macro(CUDA_COMPILE generated_files)
-  cuda_compile_base(cuda_compile OBJ ${generated_files} ${ARGN})
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA COMPILE PTX
-###############################################################################
-###############################################################################
-macro(CUDA_COMPILE_PTX generated_files)
-  cuda_compile_base(cuda_compile_ptx PTX ${generated_files} ${ARGN})
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA COMPILE FATBIN
-###############################################################################
-###############################################################################
-macro(CUDA_COMPILE_FATBIN generated_files)
-  cuda_compile_base(cuda_compile_fatbin FATBIN ${generated_files} ${ARGN})
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA COMPILE CUBIN
-###############################################################################
-###############################################################################
-macro(CUDA_COMPILE_CUBIN generated_files)
-  cuda_compile_base(cuda_compile_cubin CUBIN ${generated_files} ${ARGN})
-endmacro()
-
-
-###############################################################################
-###############################################################################
-# CUDA ADD CUFFT TO TARGET
-###############################################################################
-###############################################################################
-macro(CUDA_ADD_CUFFT_TO_TARGET target)
-  if (CUDA_BUILD_EMULATION)
-    target_link_libraries(${target} ${CUDA_cufftemu_LIBRARY})
-  else()
-    target_link_libraries(${target} ${CUDA_cufft_LIBRARY})
-  endif()
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA ADD CUBLAS TO TARGET
-###############################################################################
-###############################################################################
-macro(CUDA_ADD_CUBLAS_TO_TARGET target)
-  if (CUDA_BUILD_EMULATION)
-    target_link_libraries(${target} ${CUDA_cublasemu_LIBRARY})
-  else()
-    target_link_libraries(${target} ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
-  endif()
-endmacro()
-
-###############################################################################
-###############################################################################
-# CUDA BUILD CLEAN TARGET
-###############################################################################
-###############################################################################
-macro(CUDA_BUILD_CLEAN_TARGET)
-  # Call this after you add all your CUDA targets, and you will get a convience
-  # target.  You should also make clean after running this target to get the
-  # build system to generate all the code again.
-
-  set(cuda_clean_target_name clean_cuda_depends)
-  if (CMAKE_GENERATOR MATCHES "Visual Studio")
-    string(TOUPPER ${cuda_clean_target_name} cuda_clean_target_name)
-  endif()
-  add_custom_target(${cuda_clean_target_name}
-    COMMAND ${CMAKE_COMMAND} -E remove ${CUDA_ADDITIONAL_CLEAN_FILES})
-
-  # Clear out the variable, so the next time we configure it will be empty.
-  # This is useful so that the files won't persist in the list after targets
-  # have been removed.
-  set(CUDA_ADDITIONAL_CLEAN_FILES "" CACHE INTERNAL "List of intermediate files that are part of the cuda dependency scanning.")
-endmacro()
diff --git a/cmake/thirdparty/FindCUDA/make2cmake.cmake b/cmake/thirdparty/FindCUDA/make2cmake.cmake
deleted file mode 100644
index 7b5389ec51..0000000000
--- a/cmake/thirdparty/FindCUDA/make2cmake.cmake
+++ /dev/null
@@ -1,106 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#  Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  Copyright (c) 2007-2009
-#  Scientific Computing and Imaging Institute, University of Utah
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-
-#######################################################################
-# This converts a file written in makefile syntax into one that can be included
-# by CMake.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Extra output
-#
-# input_file:FILEPATH=<>   Path to dependecy file in makefile format
-#
-# output_file:FILEPATH=<>  Path to file with dependencies in CMake readable variable
-#
-
-file(READ ${input_file} depend_text)
-
-if (NOT "${depend_text}" STREQUAL "")
-
-  # message("FOUND DEPENDS")
-
-  string(REPLACE "\\ " " " depend_text ${depend_text})
-
-  # This works for the nvcc -M generated dependency files.
-  string(REGEX REPLACE "^.* : " "" depend_text ${depend_text})
-  string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text})
-
-  set(dependency_list "")
-
-  foreach(file ${depend_text})
-
-    string(REGEX REPLACE "^ +" "" file ${file})
-
-    # OK, now if we had a UNC path, nvcc has a tendency to only output the first '/'
-    # instead of '//'.  Here we will test to see if the file exists, if it doesn't then
-    # try to prepend another '/' to the path and test again.  If it still fails remove the
-    # path.
-
-    if(NOT EXISTS "${file}")
-      if (EXISTS "/${file}")
-        set(file "/${file}")
-      else()
-        if(verbose)
-          message(WARNING " Removing non-existent dependency file: ${file}")
-        endif()
-        set(file "")
-      endif()
-    endif()
-
-    # Make sure we check to see if we have a file, before asking if it is not a directory.
-    # if(NOT IS_DIRECTORY "") will return TRUE.
-    if(file AND NOT IS_DIRECTORY "${file}")
-      # If softlinks start to matter, we should change this to REALPATH.  For now we need
-      # to flatten paths, because nvcc can generate stuff like /bin/../include instead of
-      # just /include.
-      get_filename_component(file_absolute "${file}" ABSOLUTE)
-      list(APPEND dependency_list "${file_absolute}")
-    endif()
-
-  endforeach()
-
-else()
-  # message("FOUND NO DEPENDS")
-endif()
-
-# Remove the duplicate entries and sort them.
-list(REMOVE_DUPLICATES dependency_list)
-list(SORT dependency_list)
-
-foreach(file ${dependency_list})
-  string(APPEND cuda_nvcc_depend " \"${file}\"\n")
-endforeach()
-
-file(WRITE ${output_file} "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n")
diff --git a/cmake/thirdparty/FindCUDA/parse_cubin.cmake b/cmake/thirdparty/FindCUDA/parse_cubin.cmake
deleted file mode 100644
index 626c8a2e47..0000000000
--- a/cmake/thirdparty/FindCUDA/parse_cubin.cmake
+++ /dev/null
@@ -1,111 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#  Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  Copyright (c) 2007-2009
-#  Scientific Computing and Imaging Institute, University of Utah
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-
-#######################################################################
-# Parses a .cubin file produced by nvcc and reports statistics about the file.
-
-
-file(READ ${input_file} file_text)
-
-if (NOT "${file_text}" STREQUAL "")
-
-  string(REPLACE ";" "\\;" file_text ${file_text})
-  string(REPLACE "\ncode" ";code" file_text ${file_text})
-
-  list(LENGTH file_text len)
-
-  foreach(line ${file_text})
-
-    # Only look at "code { }" blocks.
-    if(line MATCHES "^code")
-
-      # Break into individual lines.
-      string(REGEX REPLACE "\n" ";" line ${line})
-
-      foreach(entry ${line})
-
-        # Extract kernel names.
-        if (${entry} MATCHES "[^g]name = ([^ ]+)")
-          set(entry "${CMAKE_MATCH_1}")
-
-          # Check to see if the kernel name starts with "_"
-          set(skip FALSE)
-          # if (${entry} MATCHES "^_")
-            # Skip the rest of this block.
-            # message("Skipping ${entry}")
-            # set(skip TRUE)
-          # else ()
-            message("Kernel:    ${entry}")
-          # endif ()
-
-        endif()
-
-        # Skip the rest of the block if necessary
-        if(NOT skip)
-
-          # Registers
-          if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)")
-            set(entry "${CMAKE_MATCH_3}")
-            message("Registers: ${entry}")
-          endif()
-
-          # Local memory
-          if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)")
-            set(entry "${CMAKE_MATCH_3}")
-            message("Local:     ${entry}")
-          endif()
-
-          # Shared memory
-          if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)")
-            set(entry "${CMAKE_MATCH_3}")
-            message("Shared:    ${entry}")
-          endif()
-
-          if (${entry} MATCHES "^}")
-            message("")
-          endif()
-
-        endif()
-
-
-      endforeach()
-
-    endif()
-
-  endforeach()
-
-else()
-  # message("FOUND NO DEPENDS")
-endif()
-
-
diff --git a/cmake/thirdparty/FindCUDA/run_nvcc.cmake b/cmake/thirdparty/FindCUDA/run_nvcc.cmake
deleted file mode 100644
index ba7f92e3c0..0000000000
--- a/cmake/thirdparty/FindCUDA/run_nvcc.cmake
+++ /dev/null
@@ -1,307 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path
-set(source_file "@source_file@") # path
-set(NVCC_generated_dependency_file "@NVCC_generated_dependency_file@") # path
-set(cmake_dependency_file "@cmake_dependency_file@") # path
-set(CUDA_make2cmake "@CUDA_make2cmake@") # path
-set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path
-set(build_cubin @build_cubin@) # bool
-set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # path
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "@generated_file_path@") # path
-set(generated_file_internal "@generated_file@") # path
-set(generated_cubin_file_internal "@generated_cubin_file@") # path
-
-set(CUDA_NVCC_EXECUTABLE "@CUDA_NVCC_EXECUTABLE@") # path
-set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@ ;; @CUDA_WRAP_OPTION_NVCC_FLAGS@) # list
-@CUDA_NVCC_FLAGS_CONFIG@
-set(nvcc_flags @nvcc_flags@) # list
-set(CUDA_NVCC_INCLUDE_DIRS "@CUDA_NVCC_INCLUDE_DIRS@") # list (needs to be in quotes to handle spaces properly).
-set(CUDA_NVCC_COMPILE_DEFINITIONS "@CUDA_NVCC_COMPILE_DEFINITIONS@") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "@format_flag@") # string
-set(cuda_language_flag @cuda_language_flag@) # list
-
-# Clean up list of include directories and add -I flags
-list(REMOVE_DUPLICATES CUDA_NVCC_INCLUDE_DIRS)
-set(CUDA_NVCC_INCLUDE_ARGS)
-foreach(dir ${CUDA_NVCC_INCLUDE_DIRS})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  list(APPEND CUDA_NVCC_INCLUDE_ARGS "-I${dir}")
-endforeach()
-
-# Clean up list of compile definitions, add -D flags, and append to nvcc_flags
-list(REMOVE_DUPLICATES CUDA_NVCC_COMPILE_DEFINITIONS)
-foreach(def ${CUDA_NVCC_COMPILE_DEFINITIONS})
-  list(APPEND nvcc_flags "-D${def}")
-endforeach()
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-@CUDA_HOST_FLAGS@
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  string(APPEND nvcc_host_compiler_flags ",\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT "x${_command}" STREQUAL "xCOMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION @CUDA_VERSION@)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  ${cuda_language_flag}
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -D "verbose=${verbose}"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${cuda_language_flag}
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/cmake/thirdparty/FindCUDA/select_compute_arch.cmake b/cmake/thirdparty/FindCUDA/select_compute_arch.cmake
deleted file mode 100644
index a96a8cac9b..0000000000
--- a/cmake/thirdparty/FindCUDA/select_compute_arch.cmake
+++ /dev/null
@@ -1,197 +0,0 @@
-# Synopsis:
-#   CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures])
-#   -- Selects GPU arch flags for nvcc based on target_CUDA_architectures
-#      target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...)
-#       - "Auto" detects local machine GPU compute arch at runtime.
-#       - "Common" and "All" cover common and entire subsets of architectures
-#      ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
-#      NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
-#      NUM: Any number. Only those pairs are currently accepted by NVCC though:
-#            2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
-#      Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
-#      Additionally, sets ${out_variable}_readable to the resulting numeric list
-#      Example:
-#       CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell)
-#        LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
-#
-#      More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA
-#
-
-# This list will be used for CUDA_ARCH_NAME = All option
-set(CUDA_KNOWN_GPU_ARCHITECTURES  "Fermi" "Kepler" "Maxwell")
-
-# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default)
-set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")
-
-if (CUDA_VERSION VERSION_GREATER "6.5")
-  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
-  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
-endif ()
-
-if (CUDA_VERSION VERSION_GREATER "7.5")
-  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
-  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1" "6.1+PTX")
-else()
-  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
-endif ()
-
-
-
-################################################################################################
-# A function for automatic detection of GPUs installed  (if autodetection is enabled)
-# Usage:
-#   CUDA_DETECT_INSTALLED_GPUS(OUT_VARIABLE)
-#
-function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
-  if(NOT CUDA_GPU_DETECT_OUTPUT)
-    set(file ${PROJECT_BINARY_DIR}/detect_cuda_compute_capabilities.cpp)
-
-    file(WRITE ${file} ""
-      "#include <cuda_runtime.h>\n"
-      "#include <cstdio>\n"
-      "int main()\n"
-      "{\n"
-      "  int count = 0;\n"
-      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
-      "  if (count == 0) return -1;\n"
-      "  for (int device = 0; device < count; ++device)\n"
-      "  {\n"
-      "    cudaDeviceProp prop;\n"
-      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
-      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
-      "  }\n"
-      "  return 0;\n"
-      "}\n")
-
-    try_run(run_result compile_result ${PROJECT_BINARY_DIR} ${file}
-            CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}"
-            LINK_LIBRARIES ${CUDA_LIBRARIES}
-            RUN_OUTPUT_VARIABLE compute_capabilities)
-
-    if(run_result EQUAL 0)
-      string(REPLACE "2.1" "2.1(2.0)" compute_capabilities "${compute_capabilities}")
-      set(CUDA_GPU_DETECT_OUTPUT ${compute_capabilities}
-        CACHE INTERNAL "Returned GPU architectures from detect_gpus tool" FORCE)
-    endif()
-  endif()
-
-  if(NOT CUDA_GPU_DETECT_OUTPUT)
-    message(STATUS "Automatic GPU detection failed. Building for common architectures.")
-    set(${OUT_VARIABLE} ${CUDA_COMMON_GPU_ARCHITECTURES} PARENT_SCOPE)
-  else()
-    set(${OUT_VARIABLE} ${CUDA_GPU_DETECT_OUTPUT} PARENT_SCOPE)
-  endif()
-endfunction()
-
-
-################################################################################################
-# Function for selecting GPU arch flags for nvcc based on CUDA architectures from parameter list
-# Usage:
-#   SELECT_NVCC_ARCH_FLAGS(out_variable [list of CUDA compute archs])
-function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
-  set(CUDA_ARCH_LIST "${ARGN}")
-
-  if("X${CUDA_ARCH_LIST}" STREQUAL "X" )
-    set(CUDA_ARCH_LIST "Auto")
-  endif()
-
-  set(cuda_arch_bin)
-  set(cuda_arch_ptx)
-
-  if("${CUDA_ARCH_LIST}" STREQUAL "All")
-    set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES})
-  elseif("${CUDA_ARCH_LIST}" STREQUAL "Common")
-    set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES})
-  elseif("${CUDA_ARCH_LIST}" STREQUAL "Auto")
-    CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST)
-    message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}")
-  endif()
-
-  # Now process the list and look for names
-  string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
-  list(REMOVE_DUPLICATES CUDA_ARCH_LIST)
-  foreach(arch_name ${CUDA_ARCH_LIST})
-    set(arch_bin)
-    set(add_ptx FALSE)
-    # Check to see if we are compiling PTX
-    if(arch_name MATCHES "(.*)\\+PTX$")
-      set(add_ptx TRUE)
-      set(arch_name ${CMAKE_MATCH_1})
-    endif()
-    if(arch_name MATCHES "^([0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
-      set(arch_bin ${CMAKE_MATCH_1})
-      set(arch_ptx ${arch_bin})
-    else()
-      # Look for it in our list of known architectures
-      if(${arch_name} STREQUAL "Fermi")
-        set(arch_bin 2.0 "2.1(2.0)")
-      elseif(${arch_name} STREQUAL "Kepler+Tegra")
-        set(arch_bin 3.2)
-      elseif(${arch_name} STREQUAL "Kepler+Tesla")
-        set(arch_bin 3.7)
-      elseif(${arch_name} STREQUAL "Kepler")
-        set(arch_bin 3.0 3.5)
-        set(arch_ptx 3.5)
-      elseif(${arch_name} STREQUAL "Maxwell+Tegra")
-        set(arch_bin 5.3)
-      elseif(${arch_name} STREQUAL "Maxwell")
-        set(arch_bin 5.0 5.2)
-        set(arch_ptx 5.2)
-      elseif(${arch_name} STREQUAL "Pascal")
-        set(arch_bin 6.0 6.1)
-        set(arch_ptx 6.1)
-      else()
-        message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
-      endif()
-    endif()
-    if(NOT arch_bin)
-      message(SEND_ERROR "arch_bin wasn't set for some reason")
-    endif()
-    list(APPEND cuda_arch_bin ${arch_bin})
-    if(add_ptx)
-      if (NOT arch_ptx)
-        set(arch_ptx ${arch_bin})
-      endif()
-      list(APPEND cuda_arch_ptx ${arch_ptx})
-    endif()
-  endforeach()
-
-  # remove dots and convert to lists
-  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
-  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}")
-  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
-  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
-
-  if(cuda_arch_bin)
-    list(REMOVE_DUPLICATES cuda_arch_bin)
-  endif()
-  if(cuda_arch_ptx)
-    list(REMOVE_DUPLICATES cuda_arch_ptx)
-  endif()
-
-  set(nvcc_flags "")
-  set(nvcc_archs_readable "")
-
-  # Tell NVCC to add binaries for the specified GPUs
-  foreach(arch ${cuda_arch_bin})
-    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
-      # User explicitly specified ARCH for the concrete CODE
-      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
-      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
-    else()
-      # User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
-      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
-      list(APPEND nvcc_archs_readable sm_${arch})
-    endif()
-  endforeach()
-
-  # Tell NVCC to add PTX intermediate code for the specified architectures
-  foreach(arch ${cuda_arch_ptx})
-    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
-    list(APPEND nvcc_archs_readable compute_${arch})
-  endforeach()
-
-  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
-  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
-  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
-endfunction()
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index 605092201a..2d92016360 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -1,3 +1,45 @@
+###############################################################################
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+#
+# Produced at the Lawrence Livermore National Laboratory
+#
+# LLNL-CODE-689114
+#
+# All rights reserved.
+#
+# This file is part of RAJA.
+#
+# For additional details, please also read RAJA/LICENSE.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the disclaimer below.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the disclaimer (as noted below) in the
+#   documentation and/or other materials provided with the distribution.
+#
+# * Neither the name of the LLNS/LLNL nor the names of its contributors may
+#   be used to endorse or promote products derived from this software without
+#   specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+###############################################################################
+
 add_custom_target(docs)
 
 if (SPHINX_FOUND)
@@ -9,6 +51,6 @@ if (DOXYGEN_FOUND)
 endif()
 
 if ( NOT SPHINX_FOUND AND NOT DOXGEN_FOUND)
-  message(WARNING "RAJA_ENABLE_DOCUMENTATION=On, but Sphinx or Doxygen not found. \
+  message(WARNING "ENABLE_DOCUMENTATION=On, but Sphinx or Doxygen not found. \
     Documentation won't be built.")
 endif ()
diff --git a/docs/Licenses/libc++ License b/docs/Licenses/libc++ License
new file mode 100644
index 0000000000..c278f2c928
--- /dev/null
+++ b/docs/Licenses/libc++ License	
@@ -0,0 +1,76 @@
+==============================================================================
+libc++ License
+==============================================================================
+
+The libc++ library is dual licensed under both the University of Illinois
+"BSD-Like" license and the MIT license.  As a user of this code you may choose
+to use it under either license.  As a contributor, you agree to allow your code
+to be used under both.
+
+Full text of the relevant licenses is included below.
+
+==============================================================================
+
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2009-2017 by the contributors listed in CREDITS.TXT
+
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+
+Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py
index e6d5921a59..cb055c9afa 100644
--- a/docs/sphinx/conf.py
+++ b/docs/sphinx/conf.py
@@ -66,9 +66,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = u'0.3'
+version = u'0.4'
 # The full version, including alpha/beta/rc tags.
-release = u'0.3.1'
+release = u'0.4.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/sphinx/config_build.rst b/docs/sphinx/config_build.rst
index b60c3daebf..592caf5526 100644
--- a/docs/sphinx/config_build.rst
+++ b/docs/sphinx/config_build.rst
@@ -80,7 +80,7 @@ so all options propagate through the build process consistently.
 These variables are turned on and off similar to standard CMake variables; 
 e.g., to enable RAJA OpenMP functionality, add this CMake option ::
 
-    -DRAJA_ENABLE_OPENMP=On
+    -DENABLE_OPENMP=On
 
 The following list describes the RAJA CMake variables and their defaults.
 
@@ -92,7 +92,7 @@ The following list describes the RAJA CMake variables and their defaults.
       ======================   ======================
       Variable                 Default
       ======================   ======================
-      RAJA_ENABLE_TESTS        On 
+      ENABLE_TESTS             On 
       ======================   ======================
      
   * **Programming Models**
@@ -103,8 +103,8 @@ The following list describes the RAJA CMake variables and their defaults.
       ======================   ======================
       Variable                 Default
       ======================   ======================
-      RAJA_ENABLE_OPENMP       On 
-      RAJA_ENABLE_CUDA         Off 
+      ENABLE_OPENMP            On 
+      ENABLE_CUDA              Off 
       ======================   ======================
 
   * **Data Types, Sizes, Alignment Parameters, etc.**
@@ -251,7 +251,7 @@ The following list describes the RAJA CMake variables and their defaults.
       =============================   ========================================
       Variable                        Meaning
       =============================   ========================================
-      RAJA_ENABLE_NESTED              Enable/disable nested loop functionality
+      ENABLE_NESTED                   Enable/disable nested loop functionality
       =============================   ========================================
 
      RAJA has an experimental loop-level fault tolerance model which is 
@@ -260,7 +260,7 @@ The following list describes the RAJA CMake variables and their defaults.
       =============================   ========================================
       Variable                        Meaning
       =============================   ========================================
-      RAJA_ENABLE_FT                  Enable/disable fault-tolerance mechanism
+      ENABLE_FT                       Enable/disable fault-tolerance mechanism
       RAJA_REPORT_FT                  Enable/disable a report of fault-
                                       tolerance enabled run (e.g., number of 
                                       faults detected, recovered from, 
diff --git a/docs/sphinx/raja_license.rst b/docs/sphinx/raja_license.rst
index db271a6bee..b4e7e80404 100644
--- a/docs/sphinx/raja_license.rst
+++ b/docs/sphinx/raja_license.rst
@@ -13,7 +13,7 @@
 RAJA License
 ===================================
 
-RAJA version 0.3.1
+RAJA version 0.4.0
 
 Copyright (c) 2016, Lawrence Livermore National Security, LLC.
 
diff --git a/docs/style_guide.md b/docs/style_guide.md
index 212d7363ec..cbb0553c35 100644
--- a/docs/style_guide.md
+++ b/docs/style_guide.md
@@ -1,3 +1,27 @@
+# CAMP
+
+## Type classes
+
+### Expressions
+
+An expression is a template of the form:
+
+```c++
+template <typename...Ts>
+struct expr_s {
+};
+// OR
+template <typename...Ts>
+using expr = typename expr_s<Ts...>::type;
+```
+
+Generically it is an un-expanded template type that accepts one or more template
+typename parameters.
+
+### Values
+
+Any complete type is a value
+
 # Concepts
 
 ### namespaces:
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index ad015af6eb..bf278953ec 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -10,38 +10,34 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
 raja_add_executable(
-  NAME example-raja-pi
-  SOURCES pi.cpp)
+  NAME example-pi
+  SOURCES example-pi.cpp)
+
+raja_add_executable(
+  NAME example-add-vectors
+  SOURCES example-add-vectors.cpp)
+
+raja_add_executable(
+  NAME example-matrix-multiply
+  SOURCES example-matrix-multiply.cpp)
+
+raja_add_executable(
+  NAME example-jacobi
+  SOURCES example-jacobi.cpp)
+
+raja_add_executable(
+  NAME example-wave
+  SOURCES example-wave.cpp)
+
+raja_add_executable(
+  NAME example-custom-index
+  SOURCES example-custom-index.cpp)
 
+raja_add_executable(
+  NAME example-gauss-seidel
+  SOURCES example-gauss-seidel.cpp)
diff --git a/examples/example-add-vectors.cpp b/examples/example-add-vectors.cpp
new file mode 100644
index 0000000000..7dd73ba849
--- /dev/null
+++ b/examples/example-add-vectors.cpp
@@ -0,0 +1,171 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <iostream>
+
+#include "memoryManager.hpp"
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/defines.hpp"
+
+/*
+  Example 1: Adding Two Vectors
+
+  ----[Details]---------------------
+  Starting with a C++ style for loop, this example illustrates
+  how to construct RAJA versions of the same loop with different
+  execution policies.
+
+  In this example, three integer arrays (A,B,C) are allocated
+  using the templated memory manager found in this folder.
+  The vectors A and B are initialized to have opposite values
+  and thus when the entries are added the result should be zero.
+  The result of the vector addition is stored in C. The function
+  checkSolution is used to verify correctness.
+
+  -----[RAJA Concepts]---------------
+  1. Introduction of the forall loop and basic RAJA policies
+
+  RAJA::forall<exec_policy>(iter_space I, [=] (index_type i)) {
+
+         //body
+
+  });
+
+  [=] By-copy capture
+  [&] By-reference capture (for non-unified memory targets)
+  exec_policy - Specifies how the traversal occurs
+  iter_space  - Iteration space for RAJA loop (any random access container is
+  expected)
+  index_type  - Index for RAJA loops
+
+  ----[Kernel Variants and RAJA Features]------------
+  a. C++ style for loop
+  b. RAJA style for loop with sequential iterations
+     i.  Introduces the seq_exec policy
+     ii. Introduces RAJA::RangeSegment
+  c. RAJA style for loop with omp parallelism
+     i. Introduces the omp_parallel_for_exec policy
+  d. RAJA style for loop with CUDA parallelism
+     i. Introduces the cuda_exec policy
+ */
+
+/*
+  CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block
+*/
+#if defined(RAJA_ENABLE_CUDA)
+const int CUDA_BLOCK_SIZE = 256;
+#endif
+
+/*
+  Function to verify correctness
+*/
+void checkSolution(int *C, int in_N);
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  printf("Example 1: Adding Two Vectors \n \n");
+
+  const int N = 1000;
+  int *A = memoryManager::allocate<int>(N);
+  int *B = memoryManager::allocate<int>(N);
+  int *C = memoryManager::allocate<int>(N);
+
+  for (int i = 0; i < N; ++i) {
+    A[i] = -i;
+    B[i] = i;
+  }
+
+  printf("Standard C++ Loop \n");
+  for (int i = 0; i < N; ++i) {
+    C[i] = A[i] + B[i];
+  }
+  checkSolution(C, N);
+
+
+  printf("RAJA: Sequential Policy \n");
+  /*
+    RAJA::seq_exec -  Executes the loop sequentially
+
+    RAJA::RangeSegment(start,stop) - Generates a contiguous sequence of numbers
+    by the [start, stop) interval specified
+  */
+  RAJA::forall<RAJA::seq_exec>(
+    RAJA::RangeSegment(0, N), [=](RAJA::Index_type i) { 
+
+      C[i] = A[i] + B[i]; 
+
+    });    
+  checkSolution(C, N);
+
+
+#if defined(RAJA_ENABLE_OPENMP)
+  printf("RAJA: OpenMP Policy \n");
+  /*
+    RAJA::omp_parallel_for_exec - executes the forall loop using the
+    #pragma omp parallel for directive
+  */
+  RAJA::forall<RAJA::omp_parallel_for_exec>(
+    RAJA::RangeSegment(0, N), [=](RAJA::Index_type i) {
+    
+      C[i] = A[i] + B[i];
+
+    });
+  checkSolution(C, N);
+#endif
+
+
+#if defined(RAJA_ENABLE_CUDA)
+  printf("RAJA: CUDA Policy \n");
+  /*
+    RAJA::cuda_exec<CUDA_BLOCK_SIZE> - executes the loop using the CUDA API
+    Here the __device__ keyword marks the lambda body as CUDA device code
+  */
+  RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>
+    (RAJA::RangeSegment(0, N), [=] __device__(RAJA::Index_type i) { 
+      
+      C[i] = A[i] + B[i]; 
+
+    });          
+  checkSolution(C, N);
+#endif
+
+  memoryManager::deallocate(A);
+  memoryManager::deallocate(B);
+  memoryManager::deallocate(C);
+
+  return 0;
+}
+
+/*
+  Function to check for correctness: every entry of C must be zero
+*/
+void checkSolution(int *C, int in_N)
+{
+  // A `return` inside the lambda only ends one iteration, so failures
+  // are counted with a reduction and reported once after the loop
+  RAJA::ReduceSum<RAJA::seq_reduce, int> numErrors(0);
+  RAJA::forall<RAJA::seq_exec>
+    (RAJA::RangeSegment(0, in_N), [=](RAJA::Index_type i) {
+      if (C[i] != 0) {
+        numErrors += 1;
+      }
+  });
+
+  int errorCount = numErrors;
+  printf("%s \n \n", errorCount == 0 ? "Correct Result" : "Error in Result");
+}
diff --git a/examples/example-custom-index.cpp b/examples/example-custom-index.cpp
new file mode 100644
index 0000000000..4e2ca06b3b
--- /dev/null
+++ b/examples/example-custom-index.cpp
@@ -0,0 +1,143 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <initializer_list>
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/defines.hpp"
+#include "RAJA/index/RangeSegment.hpp"
+
+const int DIM = 2;
+
+/*
+  Example 5: Custom Index Set
+
+  ----[Details]-------------------
+  This example illustrates how to construct a custom
+  iteration space composed of segments. Here a segment
+  is an arbitrary collection of indices.
+
+  Assuming a grid with the following contents
+
+  grid = [1, 2, 1, 2,
+          3, 4, 3, 4,
+          1, 2, 1, 2,
+          3, 4, 3, 4];
+
+  The following code will construct four segments wherein
+  each segment will store indices corresponding to a particular
+  value on the grid. For example the first segment will store the
+  indices {0,2,8,10} corresponding to the location of values equal to 1.
+
+  --------[RAJA Concepts]---------
+  1. Constructing custom IndexSets
+  2. RAJA::View              - RAJA's wrapper for multidimensional indexing
+  3. RAJA::ListSegment       - Container for an arbitrary collection of indices
+  4. RAJA::TypedListSegment  - Container for an arbitrary collection of
+                               typed indices
+  5. RAJA::StaticIndexSet    - Container for an index set which is a
+                               collection of
+                               ListSegments
+*/
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  printf("Example 5. Custom Index Set \n");
+  int n = 4;
+  int *A = new int[n * n];
+
+  auto init = {1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4};
+
+  std::copy(init.begin(), init.end(), A);
+
+  /*
+    The template arguments for StaticIndexSet enables the user to indicate
+    the required storage types of various segments. In this example,
+    we only need to store TypedListSegment<Index_type> (aka ListSegment)
+  */
+  RAJA::StaticIndexSet<RAJA::TypedListSegment<RAJA::Index_type>> colorset;
+
+  /*
+    RAJA::View - RAJA's wrapper for multidimensional indexing
+   */
+  RAJA::View<int, RAJA::Layout<DIM>> Aview(A, n, n);
+
+  /*
+    Buffer used for intermediate indices storage
+   */
+  auto *idx = new RAJA::Index_type[(n + 1) * (n + 1) / 4];
+
+  /*
+    Iterate over each dimension (DIM=2 for this example)
+  */
+  for (int xdim : {0, 1}) {
+    for (int ydim : {0, 1}) {
+
+      RAJA::Index_type count = 0;
+
+      /*
+        Iterate over each extent in each dimension, incrementing by two to
+        safely advance over neighbors
+       */
+      for (int xiter = xdim; xiter < n; xiter += 2) {
+        for (int yiter = ydim; yiter < n; yiter += 2) {
+
+          /*
+            Add the computed index to the buffer
+          */
+          idx[count] = std::distance(std::addressof(Aview(0, 0)),
+                                     std::addressof(Aview(xiter, yiter)));
+          ++count;
+        }
+      }
+
+      /*
+        RAJA::ListSegment - creates a list segment from a given array with a
+        specific length.
+
+        Here the indices are inserted from the buffer as a new ListSegment.
+      */
+      colorset.push_back(RAJA::ListSegment(idx, count));
+    }
+  }
+
+  delete[] idx;
+
+
+/*
+  -----[RAJA Loop Traversal]-------
+  Under the custom color policy, a RAJA forall loop will traverse
+  the list segments stored in the colorset sequentially and traverse
+  the indices within each segment in parallel (if enabled).
+ */
+#if defined(RAJA_ENABLE_OPENMP)
+  using ColorPolicy =
+      RAJA::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>;
+#else
+  using ColorPolicy = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec>;
+#endif
+
+  RAJA::forall<ColorPolicy>(
+   colorset, [=](int idx) {
+   
+     printf("A[%d] = %d\n", idx, A[idx]);
+
+   });
+
+  return 0;
+}
diff --git a/examples/example-gauss-seidel.cpp b/examples/example-gauss-seidel.cpp
new file mode 100644
index 0000000000..2c0e343fad
--- /dev/null
+++ b/examples/example-gauss-seidel.cpp
@@ -0,0 +1,256 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+
+#include <iostream>
+#include <cmath>
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/defines.hpp"
+
+#include "memoryManager.hpp"
+
+/*
+  Example 6: Gauss-Seidel with Red-Black Ordering
+
+  ----[Details]--------------------
+  This example is an extension of Example 3.
+  In particular we maintain the five point stencil
+  to discretize the boundary value problem
+
+  U_xx + U_yy = f on [0,1] x [0,1]
+
+  on a structured grid. The right-hand side is
+  chosen to be f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
+
+  Rather than computing values inside the domain with
+  the Jacobi method, a Gauss-Seidel method with red-black
+  ordering is now used.
+
+  The scheme is implemented by treating the grid as
+  a checker board and storing the indices of red and
+  black cells in RAJA list segments. The segments are
+  then stored in a RAJA static index set.
+
+  ----[RAJA Concepts]---------------
+  1. Forall loop
+  2. RAJA Reduction
+  3. RAJA::omp_parallel_for_exec
+  4. RAJA::ListSegment
+  5. RAJA::StaticIndexSet
+*/
+
+/*
+  Struct to hold grid info
+  o - Origin in a cartesian dimension
+  h - Spacing between grid points
+  n - Number of grid points
+ */
+struct grid_s {
+  double o, h;
+  int n;
+};
+
+/*
+  ----[Functions]---------
+  solution      - Function for the analytic solution
+  computeErr    - Displays the maximum error in the solution
+  gsColorPolicy - Generates the custom index set for this example
+*/
+double solution(double x, double y);
+void computeErr(double *I, grid_s grid);
+RAJA::StaticIndexSet<RAJA::ListSegment> gsColorPolicy(int N);
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  printf("Example 6: Red-Black Gauss-Seidel \n");
+
+  /*
+    ----[Solver Parameters]------------
+    tol       - Method terminates once the norm is less than tol
+    N         - Number of unknown gridpoints per cartesian dimension
+    NN        - Total number of gridpoints on the grid
+    maxIter   - Maximum number of iterations to be taken
+
+    resI2     - Residual
+    iteration - Iteration number
+    grid_s    - Struct with grid information for a cartesian dimension
+  */
+  double tol = 1e-10;
+
+  int N = 100;
+  int NN = (N + 2) * (N + 2);
+  int maxIter = 100000;
+
+  double resI2;
+  int iteration;
+
+  grid_s gridx;
+  gridx.o = 0.0;
+  gridx.h = 1.0 / (N + 1.0);
+  gridx.n = N + 2;
+
+  double *I = memoryManager::allocate<double>(NN);
+
+  memset(I, 0, NN * sizeof(double));
+
+  RAJA::StaticIndexSet<RAJA::ListSegment> colorSet = gsColorPolicy(N);
+
+  memset(I, 0, NN * sizeof(double));
+  printf("Gauss-Seidel with Red and Black Ordering \n");
+
+#if defined(RAJA_ENABLE_OPENMP)
+  using colorPolicy =
+      RAJA::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>;
+#else
+  using colorPolicy = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec>;
+#endif
+
+  resI2 = 1;
+  iteration = 0;
+  while (resI2 > tol * tol) {
+
+#if defined(RAJA_ENABLE_OPENMP)
+    RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);
+#else
+    RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
+#endif
+
+    /*
+      Gauss-Seidel Iteration
+    */
+    RAJA::forall<colorPolicy>(
+        colorSet, [=](RAJA::Index_type id) {
+        
+          /*
+            Compute x,y grid index
+          */
+          int m = id % (N + 2);
+          int n = id / (N + 2);
+
+          double x = gridx.o + m * gridx.h;
+          double y = gridx.o + n * gridx.h;
+
+          double f = gridx.h * gridx.h
+                     * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
+
+          double newI = -0.25 * (f - I[id - N - 2] - I[id + N + 2] - I[id - 1]
+                                 - I[id + 1]);
+
+          double oldI = I[id];
+          RAJA_resI2 += (newI - oldI) * (newI - oldI);
+          I[id] = newI;
+
+        });
+    resI2 = RAJA_resI2;
+
+    if (iteration > maxIter) {
+      printf("Gauss-Seidel Maxed out on iterations \n");
+      break;
+    }
+
+    iteration++;
+  }
+  computeErr(I, gridx);
+  printf("No of iterations: %d \n \n", iteration);
+
+
+  memoryManager::deallocate(I);
+
+  return 0;
+}
+
+/*
+  This function loops over the interior cells of the grid, colors them
+  as a checkerboard (red when n + m is even, black otherwise), and stores
+  each color's indices in a buffer. The buffers are then used to generate
+  RAJA ListSegments and populate a RAJA Static Index Set.
+*/
+RAJA::StaticIndexSet<RAJA::ListSegment> gsColorPolicy(int N)
+{
+
+  RAJA::StaticIndexSet<RAJA::ListSegment> colorSet;
+
+  // Cell (1,1) is red, so red gets the extra cell when N * N is odd
+  int redN = (N * N + 1) / 2;
+  int blkN = (N * N) / 2;
+  RAJA::Index_type *Red = new RAJA::Index_type[redN];
+  RAJA::Index_type *Blk = new RAJA::Index_type[blkN];
+
+  int ir = 0;
+  int ib = 0;
+
+  for (int n = 1; n <= N; ++n) {
+    for (int m = 1; m <= N; ++m) {
+
+      RAJA::Index_type id = n * (N + 2) + m;
+      // Color by parity of n + m so that all four stencil neighbors of a
+      // cell have the opposite color, for every N (even or odd)
+      if ((n + m) % 2 == 0) {
+        Red[ir] = id;
+        ir++;
+      } else {
+        Blk[ib] = id;
+        ib++;
+      }
+    }
+  }
+  // Create Index
+  colorSet.push_back(RAJA::ListSegment(Blk, blkN));
+  colorSet.push_back(RAJA::ListSegment(Red, redN));
+  delete[] Blk;
+  delete[] Red;
+
+  return colorSet;
+}
+
+
+/*
+  Function for the analytic solution
+*/
+double solution(double x, double y)
+{
+  return x * y * exp(x - y) * (1 - x) * (1 - y);
+}
+
+/*
+  Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
+*/
+void computeErr(double *I, grid_s grid)
+{
+
+  RAJA::RangeSegment fdBounds(0, grid.n);
+  RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);
+  using myPolicy =
+    RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>;
+
+  RAJA::forallN<myPolicy>(
+    fdBounds, fdBounds, [=](RAJA::Index_type ty, RAJA::Index_type tx) {
+    
+      int id = tx + grid.n * ty;
+      double x = grid.o + tx * grid.h;
+      double y = grid.o + ty * grid.h;
+      double myErr = std::abs(I[id] - solution(x, y));
+      tMax.max(myErr);
+
+    });
+
+  double l2err = tMax;
+  printf("Max error = %lg, h = %f \n", l2err, grid.h);
+}
diff --git a/examples/example-jacobi.cpp b/examples/example-jacobi.cpp
new file mode 100644
index 0000000000..8e9fc888fe
--- /dev/null
+++ b/examples/example-jacobi.cpp
@@ -0,0 +1,413 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+
+#include <iostream>
+#include <cmath>
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/defines.hpp"
+
+#include "memoryManager.hpp"
+
+/*
+  Example 3: Jacobi Method
+
+  ----[Details]--------------------
+  This code uses a five point finite difference stencil
+  to discretize the following boundary value problem
+
+  U_xx + U_yy = f on [0,1] x [0,1].
+
+  The right-hand side is chosen to be
+  f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
+
+  A structured grid is used to discretize the domain
+  [0,1] x [0,1]. Values inside the domain are computed
+  using the Jacobi method to solve the associated
+  linear system. The scheme is invoked until the l_2
+  difference of subsequent iterations is below a
+  tolerance.
+
+  The scheme is implemented by allocating two arrays
+  (I, Iold) and initialized to zero. The first set of
+  nested for loops apply an iteration of the Jacobi
+  scheme. As boundary values are already known the
+  scheme is only applied to the interior nodes.
+
+  The second set of nested for loops is used to
+  update Iold and compute the l_2 norm of the
+  difference of the iterates.
+
+  Computing the l_2 norm requires a reduction operation.
+  To simplify the reduction procedure, the RAJA API
+  introduces thread safe variables.
+
+  ----[RAJA Concepts]---------------
+  1. ForallN loop
+  2. RAJA Reduction
+  3. RAJA::omp_collapse_nowait_exec
+
+  ----[Kernel Variants and RAJA Features]---
+  a. C++ style nested for loops
+  b. RAJA style nested for loops with sequential iterations
+     i. Introduces RAJA reducers for sequential policies
+  c. RAJA style nested for loops with omp parallelism
+     i.  Introduces collapsing loops using RAJA omp policies
+     ii. Introduces RAJA reducers for omp policies
+  d. RAJA style for loop with CUDA parallelism
+     i. Introduces RAJA reducers for cuda policies
+*/
+
+
+/*
+  ----[Constant Values]-----
+  CUDA_BLOCK_DIM_X  - Number of threads in the
+                      x-dimension of a cuda thread block
+
+  CUDA_BLOCK_DIM_Y  - Number of threads in the
+                      y-dimension of a cuda thread block
+
+  CUDA_BLOCK_SIZE   - Number of threads per thread block
+*/
+#if defined(RAJA_ENABLE_CUDA)
+const int CUDA_BLOCK_DIM_X = 16;
+const int CUDA_BLOCK_DIM_Y = 16;
+const int CUDA_BLOCK_SIZE = 256;
+#endif
+
+
+/*
+  Struct to hold grid info for one cartesian dimension.
+  computeErr applies the same values to both the x- and
+  the y-dimension.
+ */
+struct grid_s {
+  double o;  // Origin in a cartesian dimension
+  double h;  // Spacing between grid points
+  int n;     // Number of grid points
+};
+
+/*
+  ----[Functions]---------
+  solution   - Function for the analytic solution
+  computeErr - Displays the maximum error in the solution
+*/
+double solution(double x, double y);
+void computeErr(double *I, grid_s grid);
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  printf("Example 3: Jacobi Method \n");
+
+  /*
+    ----[Solver Parameters]------------
+    tol       - Method terminates once the norm is less than tol
+    N         - Number of unknown gridpoints per cartesian dimension
+    NN        - Total number of gridpoints on the grid (boundary included)
+    maxIter   - Maximum number of iterations to be taken
+
+    resI2     - Squared l_2 norm of the difference of subsequent iterates
+    iteration - Iteration number
+    grid_s    - Struct with grid information for a cartesian dimension
+  */
+  double tol = 1e-10;
+
+  int N = 50;
+  int NN = (N + 2) * (N + 2);
+  int maxIter = 100000;
+
+  double resI2;
+  int iteration;
+
+  grid_s gridx;
+  gridx.o = 0.0;
+  gridx.h = 1.0 / (N + 1.0);
+  gridx.n = N + 2;
+
+  /*
+    I, Iold - Holds iterates of Jacobi method
+  */
+  double *I = memoryManager::allocate<double>(NN);
+  double *Iold = memoryManager::allocate<double>(NN);
+
+
+  memset(I, 0, NN * sizeof(double));
+  memset(Iold, 0, NN * sizeof(double));
+
+
+  printf("Standard  C++ Loop \n");
+  resI2 = 1;
+  iteration = 0;
+  // resI2 holds a squared norm, hence the comparison against tol * tol
+  while (resI2 > tol * tol) {
+
+    /*
+      Jacobi Iteration
+    */
+    for (int n = 1; n <= N; ++n) {
+      for (int m = 1; m <= N; ++m) {
+
+        double x = gridx.o + m * gridx.h;
+        double y = gridx.o + n * gridx.h;
+
+        double f = gridx.h * gridx.h
+                   * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
+
+        int id = n * (N + 2) + m;
+        I[id] = -0.25 * (f - Iold[id - N - 2] - Iold[id + N + 2] - Iold[id - 1]
+                         - Iold[id + 1]);
+      }
+    }
+
+    /*
+      Compute residual and update Iold
+    */
+    resI2 = 0.0;
+    for (int k = 0; k < NN; k++) {
+      resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
+      Iold[k] = I[k];
+    }
+
+    if (iteration > maxIter) {
+      printf("Standard C++ Loop - Maxed out on iterations \n");
+      exit(-1);
+    }
+
+    iteration++;
+  }
+  computeErr(I, gridx);
+  printf("No of iterations: %d \n \n", iteration);
+
+
+  /*
+    RAJA loop calls may be shortened by predefining policies
+  */
+  RAJA::RangeSegment gridRange(0, NN);
+  RAJA::RangeSegment jacobiRange(1, (N + 1));
+  using jacobiSeqNestedPolicy =
+      RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>;
+
+  printf("RAJA: Sequential Policy - Nested ForallN \n");
+  resI2 = 1;
+  iteration = 0;
+  memset(I, 0, NN * sizeof(double));
+  memset(Iold, 0, NN * sizeof(double));
+
+  while (resI2 > tol * tol) {
+
+    /*
+      Jacobi Iteration
+    */
+    RAJA::forallN<jacobiSeqNestedPolicy>(
+      jacobiRange, jacobiRange, [=](RAJA::Index_type m, RAJA::Index_type n) {
+
+          double x = gridx.o + m * gridx.h;
+          double y = gridx.o + n * gridx.h;
+
+          double f = gridx.h * gridx.h
+                     * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
+
+          int id = n * (N + 2) + m;
+          I[id] =
+              -0.25 * (f - Iold[id - N - 2] - Iold[id + N + 2] - Iold[id - 1]
+                       - Iold[id + 1]);
+        });
+
+    /*
+      ----[Reduction step]---------
+      The RAJA API introduces a thread-safe accumulation variable
+      "ReduceSum" in order to perform reductions
+    */
+    RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
+    RAJA::forall<RAJA::seq_exec>(
+      gridRange, [=](RAJA::Index_type k) {
+
+        RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
+        Iold[k] = I[k];
+
+      });
+
+    resI2 = RAJA_resI2;
+    if (iteration > maxIter) {
+      printf("RAJA: Sequential - Maxed out on iterations! \n");
+      exit(-1);
+    }
+    iteration++;
+  }
+  computeErr(I, gridx);
+  printf("No of iterations: %d \n \n", iteration);
+
+
+#if defined(RAJA_ENABLE_OPENMP)
+  printf("RAJA: OpenMP Policy - Nested ForallN \n");
+  resI2 = 1;
+  iteration = 0;
+  memset(I, 0, NN * sizeof(double));
+  memset(Iold, 0, NN * sizeof(double));
+
+  /*
+    ----[RAJA Policies]-----------
+    RAJA::omp_collapse_nowait_exec -
+    parallelizes nested loops without introducing nested parallelism
+
+    RAJA::OMP_Parallel<> - Creates a parallel region,
+    must be the last argument of the nested policy list
+  */
+  using jacobiompNestedPolicy =
+    RAJA::NestedPolicy<RAJA::ExecList<RAJA::omp_collapse_nowait_exec,
+    RAJA::omp_collapse_nowait_exec>, RAJA::OMP_Parallel<>>;
+
+  while (resI2 > tol * tol) {
+
+    /*
+      Jacobi Iteration
+    */
+    RAJA::forallN<jacobiompNestedPolicy>(
+        jacobiRange, jacobiRange, [=](RAJA::Index_type m, RAJA::Index_type n) {
+
+          double x = gridx.o + m * gridx.h;
+          double y = gridx.o + n * gridx.h;
+
+          double f = gridx.h * gridx.h
+                     * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
+
+          int id = n * (N + 2) + m;
+          I[id] = -0.25 * (f - Iold[id - N - 2] - Iold[id + N + 2] - Iold[id - 1]
+                             - Iold[id + 1]);
+        });
+    /*
+      Compute residual and update Iold
+    */
+    RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);
+    RAJA::forall<RAJA::omp_parallel_for_exec>(
+      gridRange, [=](RAJA::Index_type k) {
+
+        RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
+        Iold[k] = I[k];
+
+      });
+
+    resI2 = RAJA_resI2;
+    if (iteration > maxIter) {
+      printf("RAJA: OpenMP - Maxed out on iterations! \n");
+      exit(-1);
+    }
+    iteration++;
+  }
+  computeErr(I, gridx);
+  printf("No of iterations: %d \n \n", iteration);
+#endif
+
+
+#if defined(RAJA_ENABLE_CUDA)
+  printf("RAJA: CUDA Policy - Nested ForallN \n");
+  // NOTE(review): y is sized by ..._DIM_X and x by ..._DIM_Y (both are 16)
+  using jacobiCUDANestedPolicy = RAJA::NestedPolicy<RAJA::
+    ExecList<RAJA::cuda_threadblock_y_exec<CUDA_BLOCK_DIM_X>,
+    RAJA::cuda_threadblock_x_exec<CUDA_BLOCK_DIM_Y>>>;
+
+  resI2 = 1;
+  iteration = 0;
+  memset(I, 0, NN * sizeof(double));
+  memset(Iold, 0, NN * sizeof(double));
+
+  while (resI2 > tol * tol) {
+
+    /*
+      Jacobi Iteration
+    */
+    RAJA::forallN<jacobiCUDANestedPolicy>(
+        jacobiRange, jacobiRange, [=] __device__(RAJA::Index_type m, RAJA::Index_type n) {
+
+          double x = gridx.o + m * gridx.h;
+          double y = gridx.o + n * gridx.h;
+
+          double f = gridx.h * gridx.h
+                     * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
+
+          int id = n * (N + 2) + m;
+          I[id] = -0.25 * (f - Iold[id - N - 2] - Iold[id + N + 2] - Iold[id - 1]
+                             - Iold[id + 1]);
+        });
+
+    /*
+      Compute residual and update Iold
+    */
+    RAJA::ReduceSum<RAJA::cuda_reduce<CUDA_BLOCK_SIZE>, double> RAJA_resI2(0.0);
+    RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
+      gridRange, [=] __device__(RAJA::Index_type k) {
+
+          RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
+          Iold[k] = I[k];
+
+      });
+
+    resI2 = RAJA_resI2;
+
+    if (iteration > maxIter) {
+      printf("RAJA: CUDA - Maxed out on iterations! \n");
+      exit(-1);
+    }
+    iteration++;
+  }
+  cudaDeviceSynchronize();
+  computeErr(I, gridx);
+  printf("No of iterations: %d \n \n", iteration);
+#endif
+
+  memoryManager::deallocate(I);
+  memoryManager::deallocate(Iold);
+
+
+  return 0;
+}
+
+/* Analytic solution; computeErr compares the iterate against it */
+double solution(double x, double y)
+{
+  const double xy = x * y;
+  const double growth = exp(x - y);
+  return xy * growth * (1 - x) * (1 - y);
+}
+
+/*
+  Reports the max-norm error ||I_{approx}(:) - U_{analytic}(:)||_{inf}
+  over all grid.n x grid.n points, together with the grid spacing
+*/
+void computeErr(double *I, grid_s grid)
+{
+  using seqNested =
+    RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>;
+
+  RAJA::RangeSegment points(0, grid.n);
+  // Seeded with -1 so that any absolute error replaces the initial value
+  RAJA::ReduceMax<RAJA::seq_reduce, double> worst(-1.0);
+
+  RAJA::forallN<seqNested>(
+    points, points, [=](RAJA::Index_type row, RAJA::Index_type col) {
+      const int offset = col + grid.n * row;
+      const double px = grid.o + col * grid.h;
+      const double py = grid.o + row * grid.h;
+      worst.max(std::abs(I[offset] - solution(px, py)));
+    });
+
+  const double maxErr = worst;
+  printf("Max error = %lg, h = %f \n", maxErr, grid.h);
+}
diff --git a/examples/example-matrix-multiply.cpp b/examples/example-matrix-multiply.cpp
new file mode 100644
index 0000000000..974caef4bf
--- /dev/null
+++ b/examples/example-matrix-multiply.cpp
@@ -0,0 +1,318 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+#include <initializer_list>
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/index/RangeSegment.hpp"
+#include "RAJA/util/defines.hpp"
+
+#include "memoryManager.hpp"
+
+/*
+  Example 2: Multiplying Two Matrices
+
+  ----[Details]--------------------
+  Starting with C++ style nested for loops, this example
+  illustrates how to construct RAJA versions of the same loops
+  using different execution policies. Furthermore, as nesting
+  RAJA forall loops is not currently supported with CUDA,
+  this example makes use of RAJA's forallN loop which
+  may be used with any policy.
+
+  In this example two matrices of dimension N x N are allocated and multiplied.
+  The matrix A is populated with a constant value along the rows while B is
+  populated with a constant value along the columns. The function checkSolution 
+  checks for correctness.
+
+  -----[RAJA Concepts]-------------
+  1. Nesting forall loops (Not currently supported in CUDA)
+
+  2. ForallN loop (Supported with all policies)
+
+  RAJA::forallN<
+  RAJA::NestedPolicy<exec_policy1, .... , exec_policyN> >(
+  iter_space I1,..., iter_space IN, [=](index_type i1,..., index_type iN) {
+
+         //body
+
+  });
+
+  [=] By-copy capture
+  [&] By-reference capture (for non-unified memory targets)
+  RAJA::NestedPolicy - Stores a list of RAJA execution policies
+  exec_policy        - Specifies how the traversal occurs
+  iter_space         - Iteration space for RAJA loop (any random access
+  container is expected)
+
+  3. RAJA::View - RAJA's wrapper for multidimensional indexing
+
+  ----[Kernel Variants and RAJA Features]-----
+  a. C++ style nested for loops
+  b. RAJA style outer loop with a sequential policy
+     and a C++ style inner for loop
+  c. RAJA style nested for loops with sequential policies
+  d. RAJA forallN loop with sequential policies
+     i. This kernel introduces RAJA::ExecList
+  e. RAJA forallN loop with OpenMP parallelism on the outer loop
+  f. RAJA forallN loop executed on the CUDA API
+     i.  This kernel illustrates constructing two-dimensional thread blocks
+         for use of the CUDA execution policy.
+     ii. The current implementation of forallN using the CUDA
+         variant is performed asynchronously and thus a barrier
+         (cudaDeviceSynchronize) is placed after calling forallN.
+*/
+
+/*
+  ---[Constant values]----
+  N   - Defines the number of rows/columns in a matrix
+  NN  - Total number of entries in a matrix
+  DIM - Dimension of the data structure in which the matrices
+        are stored
+
+  CUDA_BLOCK_SIZE_X - Number of threads in the
+                      x-dimension of a cuda thread block
+
+  CUDA_BLOCK_SIZE_Y - Number of threads in the
+                      y-dimension of a cuda thread block
+*/
+const int N = 1000;
+const int NN = N * N;
+const int DIM = 2;
+
+#if defined(RAJA_ENABLE_CUDA)
+const int CUDA_BLOCK_SIZE_X = 16;
+const int CUDA_BLOCK_SIZE_Y = 16;
+#endif
+
+/*
+ Macros simplify indexing: row-major (as RAJA::View), arguments parenthesized
+*/
+#define A(x1, x2) A[(x2) + N * (x1)]
+#define B(x1, x2) B[(x2) + N * (x1)]
+#define C(x1, x2) C[(x2) + N * (x1)]
+
+template <typename T>
+void checkSolution(T *C, int N);
+
+template <typename T>
+void checkSolution(RAJA::View<T, RAJA::Layout<DIM>> Cview, int N);
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  printf("Example 2: Multiplying Two N x N Matrices \n \n");
+  double *A = memoryManager::allocate<double>(NN);
+  double *B = memoryManager::allocate<double>(NN);
+  double *C = memoryManager::allocate<double>(NN);
+
+  /*
+    Initialize matrices: A(row, col) = row and B(row, col) = col
+   */
+  for (int row = 0; row < N; ++row) {
+    for (int col = 0; col < N; ++col) {
+      A(row, col) = row;
+      B(row, col) = col;
+    }
+  }
+
+  printf("Standard C++ Nested Loops \n");
+  for (int row = 0; row < N; ++row) {
+    for (int col = 0; col < N; ++col) {
+
+      double dot = 0.0;
+      for (int k = 0; k < N; ++k) {
+        dot += A(row, k) * B(k, col);
+      }
+
+      C(row, col) = dot;
+    }
+  }
+  checkSolution<double>(C, N);
+
+  /*
+    As an alternative to macros, RAJA::View wraps
+    a pointer to enable multi-dimensional indexing.
+    In this example our data is assumed to be two-dimensional
+    with N values in each component.
+  */
+  RAJA::View<double, RAJA::Layout<DIM>> Aview(A, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> Bview(B, N, N);
+  RAJA::View<double, RAJA::Layout<DIM>> Cview(C, N, N);
+
+  /*
+    As the loops use the same bounds, we may specify
+    the bounds prior to the use of any RAJA loops
+  */
+  RAJA::RangeSegment matBounds(0, N);
+
+
+  printf("RAJA: Forall - Sequential Policies\n");
+  RAJA::forall<RAJA::seq_exec>(
+    matBounds, [=](RAJA::Index_type row) {
+
+      for (int col = 0; col < N; ++col) {
+
+        double dot = 0.0;
+        for (int k = 0; k < N; ++k) {
+          dot += Aview(row, k) * Bview(k, col);
+        }
+
+        Cview(row, col) = dot;
+      }
+
+    });
+  checkSolution<double>(Cview, N);
+
+  printf("RAJA: Nested Forall - Sequential Policies\n");
+  /*
+    Forall loops may be nested under sequential and omp policies
+  */
+  RAJA::forall<RAJA::seq_exec>(
+    matBounds, [=](RAJA::Index_type row) {
+
+      RAJA::forall<RAJA::seq_exec>(
+        matBounds, [=](RAJA::Index_type col) {
+
+
+          double dot = 0.0;
+          for (int k = 0; k < N; ++k) {
+            dot += Aview(row, k) * Bview(k, col);
+          }
+
+          Cview(row, col) = dot;
+        });
+    });
+  checkSolution<double>(Cview, N);
+
+
+  printf("RAJA: ForallN - Sequential Policies\n");
+  /*
+    Nested forall loops may be collapsed into a single forallN loop
+  */
+  RAJA::forallN<RAJA::NestedPolicy<
+    RAJA::ExecList<RAJA::seq_exec,RAJA::seq_exec>>>(
+      matBounds, matBounds, [=](RAJA::Index_type row, RAJA::Index_type col) {
+
+        double dot = 0.0;
+        for (int k = 0; k < N; ++k) {
+          dot += Aview(row, k) * Bview(k, col);
+        }
+
+        Cview(row, col) = dot;
+      });
+  checkSolution<double>(Cview, N);
+
+
+#if defined(RAJA_ENABLE_OPENMP)
+  printf("RAJA: ForallN - OpenMP/Sequential Policies\n");
+  /*
+    Here the outer loop is executed in parallel while the inner loop
+    is executed sequentially
+  */
+  RAJA::forallN<RAJA::NestedPolicy<
+    RAJA::ExecList<RAJA::omp_parallel_for_exec,RAJA::seq_exec>>>(
+      matBounds, matBounds, [=](RAJA::Index_type row, RAJA::Index_type col) {
+
+        double dot = 0.0;
+        for (int k = 0; k < N; ++k) {
+          dot += Aview(row, k) * Bview(k, col);
+        }
+
+        Cview(row, col) = dot;
+      });
+  checkSolution<double>(Cview, N);
+#endif
+
+
+#if defined(RAJA_ENABLE_CUDA)
+  printf("RAJA: ForallN - CUDA Policies\n");
+  /*
+    This example illustrates creating two-dimensional thread blocks as described
+    under the CUDA nomenclature
+  */
+  RAJA::forallN<RAJA::NestedPolicy<RAJA::
+    ExecList<RAJA::cuda_threadblock_y_exec<CUDA_BLOCK_SIZE_X>,
+      RAJA::cuda_threadblock_x_exec<CUDA_BLOCK_SIZE_Y>>>>(
+        matBounds, matBounds, [=] __device__(RAJA::Index_type row, RAJA::Index_type col) {
+
+          double dot = 0.0;
+          for (int k = 0; k < N; ++k) {
+            dot += Aview(row, k) * Bview(k, col);
+          }
+
+          Cview(row, col) = dot;
+        });
+  cudaDeviceSynchronize();
+  checkSolution<double>(Cview, N);
+#endif
+
+  memoryManager::deallocate(A);
+  memoryManager::deallocate(B);
+  memoryManager::deallocate(C);
+
+  return 0;
+}
+
+/*
+  Checks C = A * B for A(row, k) = row and B(k, col) = col, where every
+  entry must equal row * col * in_N, and prints exactly one verdict
+*/
+template <typename T>
+void checkSolution(RAJA::View<T, RAJA::Layout<DIM>> Cview, int in_N)
+{
+  // A `return` inside a RAJA loop body only ends that one iteration, so the
+  // largest deviation is collected in a reducer and judged after the loop
+  RAJA::RangeSegment bounds(0, in_N);
+  RAJA::ReduceMax<RAJA::seq_reduce, double> maxDiff(0.0);
+  RAJA::forallN<
+    RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>>(
+      bounds, bounds, [=](RAJA::Index_type row, RAJA::Index_type col) {
+        double diff = Cview(row, col) - row * col * in_N;
+        maxDiff.max(std::abs(diff));
+      });
+
+  if (double(maxDiff) > 1e-9) {
+    printf("Incorrect Result \n \n");
+  } else {
+    printf("Correct Result \n \n");
+  }
+}
+
+template <typename T>
+void checkSolution(T *C, int in_N)
+{
+  // Raw-pointer overload: C(row, col) is the indexing macro, whose stride is
+  // the global N, so in_N is expected to equal N. One verdict is printed;
+  // a `return` inside the loop body would only skip a single iteration
+  RAJA::RangeSegment bounds(0, in_N);
+  RAJA::ReduceMax<RAJA::seq_reduce, double> maxDiff(0.0);
+  RAJA::forallN<
+    RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>>(
+      bounds, bounds, [=](RAJA::Index_type row, RAJA::Index_type col) {
+        double diff = C(row, col) - row * col * in_N;
+        maxDiff.max(std::abs(diff));
+      });
+
+  if (double(maxDiff) > 1e-9) {
+    printf("Incorrect Result \n \n");
+  } else {
+    printf("Correct Result \n \n");
+  }
+}
diff --git a/examples/example-pi.cpp b/examples/example-pi.cpp
new file mode 100644
index 0000000000..c3b414eaca
--- /dev/null
+++ b/examples/example-pi.cpp
@@ -0,0 +1,42 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <iostream>
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/defines.hpp"
+
+int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
+{
+  // Midpoint-rule estimate of the integral of 4 / (1 + x^2) over [0, 1]
+  using reduce_policy = RAJA::seq_reduce;
+  using execute_policy = RAJA::seq_exec;
+
+  const RAJA::Index_type first = 0;
+  const RAJA::Index_type numBins = 512 * 512;
+
+  RAJA::ReduceSum<reduce_policy, double> binSum(0.0);
+
+  RAJA::forall<execute_policy>(first, numBins, [=](int bin) {
+    // Midpoint of this bin, mapped into [0, 1]
+    const double mid = (double(bin) + 0.5) / numBins;
+    binSum += 4.0 / (1.0 + mid * mid);
+  });
+
+  // Scale the accumulated heights by the bin width 1 / numBins
+  std::cout << "PI is ~ " << double(binSum) / numBins << std::endl;
+  return 0;
+}
diff --git a/examples/example-wave.cpp b/examples/example-wave.cpp
new file mode 100644
index 0000000000..cd9ff125c7
--- /dev/null
+++ b/examples/example-wave.cpp
@@ -0,0 +1,288 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include "memoryManager.hpp"
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/defines.hpp"
+
+/*
+  Example 4: Time-Domain Finite Difference 
+             Acoustic Wave Equation Solver
+
+  ------[Details]----------------------
+  This example highlights how to construct a single
+  kernel capable of being executed with different RAJA policies.
+
+  Here we solve the acoustic wave equation
+
+  P_tt = cc*(P_xx + P_yy) via finite differences.
+
+  The scheme uses a second order central difference discretization
+  for time and a fourth order central difference discretization for space.
+  Periodic boundary conditions are assumed on the grid [-1,1] x [-1, 1].
+
+  NOTE: The x and y dimensions are discretized identically.
+
+  ----[RAJA Concepts]-------------------
+  1. RAJA kernels are portable and a single implementation can run
+     on various platforms
+
+  2. RAJA ReduceMax - RAJA's reduction type for computing a maximum value
+     (ReduceMin computes the min)
+*/
+
+/*
+  ---[Constant Values]-------
+  sr - Radius of the finite difference stencil
+  PI - Value of pi
+
+  CUDA_BLOCK_DIM_X  - Number of threads in the
+                      x-dimension of a cuda thread block
+  CUDA_BLOCK_DIM_Y  - Number of threads in the
+                      y-dimension of a cuda thread block
+*/
+
+const int sr = 2;
+const double PI = 3.14159265359;
+
+#if defined(RAJA_ENABLE_CUDA)
+const int CUDA_BLOCK_DIM_X = 16;
+const int CUDA_BLOCK_DIM_Y = 16;
+#endif
+
+/*
+  ----[Struct to hold grid info]-----
+  ox - Origin in a cartesian dimension
+  dx - Spacing between grid points
+  nx - Number of grid points
+ */
+struct grid_s {
+  double ox, dx;
+  int nx;
+};
+
+
+/*
+  ----[Functions]------
+  wave       - Templated wave propagator
+  waveSol    - Function for the analytic solution of the equation
+  setIC      - Sets the initial value at two time levels (t0,t1)
+  computeErr - Displays the maximum error in the approximation
+ */
+
+template <typename T, typename fdNestedPolicy>
+void wave(T *P1, T *P2, RAJA::RangeSegment fdBounds, double ct, int nx);
+double waveSol(double t, double x, double y);
+void setIC(double *P1, double *P2, double t0, double t1, grid_s grid);
+void computeErr(double *P, double tf, grid_s grid);
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  printf("Example 4. Time-Domain Finite Difference Acoustic Wave Equation Solver \n");
+
+  /*
+    Wave speed squared
+   */
+  double cc = 1. / 2.0;
+
+  /*
+    Multiplier for spatial refinement
+   */
+  int factor = 8;
+
+  /*
+    Discretization of the domain.
+    The same discretization of the x-dimension will be used for the y-dimension
+  */
+  grid_s grid;
+  grid.ox = -1;
+  grid.dx = 0.1250 / factor;
+  grid.nx = 16 * factor;
+  RAJA::RangeSegment fdBounds(0, grid.nx);
+
+  /*
+    Solution is propagated until time T
+  */
+  double T = 0.82;
+
+
+  int entries = grid.nx * grid.nx;
+  double *P1 = memoryManager::allocate<double>(entries);
+  double *P2 = memoryManager::allocate<double>(entries);
+
+  /*
+    ----[Time stepping parameters]----
+    dt - Step size
+    nt - Total number of time steps
+    ct - Merged coefficients
+  */
+  double dt, nt, time, ct;
+  dt = 0.01 * (grid.dx / sqrt(cc));
+  nt = ceil(T / dt);
+  dt = T / nt;
+  ct = (cc * dt * dt) / (grid.dx * grid.dx);
+
+  /*
+    Predefined Nested Policies
+  */
+
+  // Sequential
+  using fdPolicy =
+      RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>;
+
+  // OpenMP
+  // using fdPolicy =
+  // RAJA::NestedPolicy<RAJA::ExecList
+  //<RAJA::omp_collapse_nowait_exec,
+  // RAJA::omp_collapse_nowait_exec>,
+  // RAJA::OMP_Parallel<>>;
+
+  // CUDA
+  // using fdPolicy
+  //= RAJA::NestedPolicy<RAJA::ExecList
+  //<RAJA::cuda_threadblock_y_exec<CUDA_BLOCK_DIM_X>,
+  // RAJA::cuda_threadblock_x_exec<CUDA_BLOCK_DIM_Y>>>;
+
+  time = 0;
+  setIC(P1, P2, (time - dt), time, grid);
+  for (int k = 0; k < nt; ++k) {
+
+    wave<double, fdPolicy>(P1, P2, fdBounds, ct, grid.nx);
+    time += dt;
+
+    double *Temp = P2;
+    P2 = P1;
+    P1 = Temp;
+  }
+#if defined(RAJA_ENABLE_CUDA)
+  cudaDeviceSynchronize();
+#endif
+  computeErr(P2, time, grid);
+  printf("Evolved solution to time = %f \n", time);
+
+  memoryManager::deallocate(P1);
+  memoryManager::deallocate(P2);
+
+  return 0;
+}
+
+
+/* Analytic solution; setIC and computeErr evaluate it */
+double waveSol(double t, double x, double y)
+{
+  const double twoPi = 2. * PI;
+  const double inTime = cos(twoPi * t);
+  return inTime * sin(twoPi * x) * sin(twoPi * y);
+}
+
+/*
+  Reports the max-norm error ||P_{approx}(:) - P_{analytic}(:)||_{inf}
+  of the field P against waveSol evaluated at time tf
+*/
+void computeErr(double *P, double tf, grid_s grid)
+{
+  using seqNested =
+    RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>;
+
+  RAJA::RangeSegment points(0, grid.nx);
+
+  // ReduceMax keeps the largest value handed to max(); it is seeded
+  // with -1 so that any absolute error replaces the initial value
+  RAJA::ReduceMax<RAJA::seq_reduce, double> worst(-1.0);
+
+  RAJA::forallN<seqNested>(
+    points, points, [=](RAJA::Index_type row, RAJA::Index_type col) {
+
+      const int offset = col + grid.nx * row;
+      const double px = grid.ox + col * grid.dx;
+      const double py = grid.ox + row * grid.dx;
+
+      worst.max(std::abs(P[offset] - waveSol(tf, px, py)));
+    });
+
+  const double lInfErr = worst;
+  printf("Max Error = %lg, dx = %f \n", lInfErr, grid.dx);
+}
+
+
+/*
+  Sets the initial condition: P1 and P2 receive the analytic solution
+  at the two time levels t0 and t1
+*/
+void setIC(double *P1, double *P2, double t0, double t1, grid_s grid)
+{
+  RAJA::RangeSegment points(0, grid.nx);
+  using seqNested =
+    RAJA::NestedPolicy<RAJA::ExecList<RAJA::seq_exec, RAJA::seq_exec>>;
+
+  RAJA::forallN<seqNested>(
+    points, points, [=](RAJA::Index_type row, RAJA::Index_type col) {
+
+      const double px = grid.ox + col * grid.dx;
+      const double py = grid.ox + row * grid.dx;
+      const int offset = col + row * grid.nx;
+
+      P1[offset] = waveSol(t0, px, py);
+      P2[offset] = waveSol(t1, px, py);
+    });
+}
+
+/*
+  Wave Propagator
+  On entry P1 holds the previous time level and P2 the current one;
+  the next time level is written over P1. ct scales the Laplacian
+  and nx is the number of grid points per dimension.
+*/
+template <typename T, typename fdNestedPolicy>
+void wave(T *P1, T *P2, RAJA::RangeSegment fdBounds, double ct, int nx)
+{
+
+  RAJA::forallN<fdNestedPolicy>(
+      fdBounds, fdBounds,
+      [=] RAJA_HOST_DEVICE(RAJA::Index_type row, RAJA::Index_type col) {
+
+        // Weights of the fourth order stencil for the offsets -sr..sr
+        const double weight[5] = {
+          -1.0 / 12.0, 4.0 / 3.0, -5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0};
+
+        const int center = col + row * nx;
+        const double previous = P1[center];
+        const double current = P2[center];
+
+        // Laplacian: the x and y contributions are added offset by
+        // offset; the modulo wraps indices around the grid edges
+        double laplacian = 0.0;
+
+        for (auto shift : RAJA::RangeSegment(-sr, sr + 1)) {
+          const int wrappedCol = (col + shift + nx) % nx;
+          const int alongX = wrappedCol + nx * row;
+          laplacian += weight[shift + sr] * P2[alongX];
+
+          const int wrappedRow = (row + shift + nx) % nx;
+          const int alongY = col + nx * wrappedRow;
+          laplacian += weight[shift + sr] * P2[alongY];
+        }
+
+        // Writes out the next time level
+        P1[center] = 2 * current - previous + ct * laplacian;
+
+      });
+}
diff --git a/examples/memoryManager.hpp b/examples/memoryManager.hpp
new file mode 100644
index 0000000000..d1d59bd8f3
--- /dev/null
+++ b/examples/memoryManager.hpp
@@ -0,0 +1,58 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#ifndef EXAMPLES_MEMORYMANAGER_HPP
+#define EXAMPLES_MEMORYMANAGER_HPP
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/defines.hpp"
+
+/*
+  As RAJA does not manage memory the user must allocate and deallocate it.
+  This header contains a general purpose memory manager which performs
+  c++ style allocation/deallocation or allocates/deallocates CUDA unified
+  memory, depending on how RAJA was configured.
+*/
+namespace memoryManager{
+  // Returns storage for `size` objects of type T. A failed CUDA managed
+  // allocation yields nullptr; the operator new[] path throws instead.
+  template <typename T>
+  T *allocate(RAJA::Index_type size)
+  {
+    T *ptr = nullptr;
+#if defined(RAJA_ENABLE_CUDA)
+    if (cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)
+        != cudaSuccess) ptr = nullptr;
+#else
+    ptr = new T[size];
+#endif
+    return ptr;
+  }
+
+  // Frees memory obtained from allocate() and resets the pointer to nullptr
+  template <typename T>
+  void deallocate(T *&ptr)
+  {
+    if (ptr) {
+#if defined(RAJA_ENABLE_CUDA)
+      cudaFree(ptr);
+#else
+      delete[] ptr;
+#endif
+      ptr = nullptr;
+    }
+  }
+}
+#endif
diff --git a/host-configs/bgqos/clang_3_9_0.cmake b/host-configs/bgqos/clang_3_9_0.cmake
index bdaa2b69e5..500f833783 100644
--- a/host-configs/bgqos/clang_3_9_0.cmake
+++ b/host-configs/bgqos/clang_3_9_0.cmake
@@ -20,7 +20,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -ffast-math -std=c++
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -ffast-math -std=c++11 -stdlib=libc++" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -std=c++11 -stdlib=libc++" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/bgqos/gcc_4_7_2.cmake b/host-configs/bgqos/gcc_4_7_2.cmake
index e24bf51eb0..52b38fb105 100644
--- a/host-configs/bgqos/gcc_4_7_2.cmake
+++ b/host-configs/bgqos/gcc_4_7_2.cmake
@@ -20,7 +20,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -mcpu=a2 -mtune=a
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -Ofast -mcpu=a2 -mtune=a2 -finline-functions -finline-limit=20000 -std=c++11" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -std=c++11" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/chaos/clang.cmake b/host-configs/chaos/clang.cmake
deleted file mode 120000
index d23b85548f..0000000000
--- a/host-configs/chaos/clang.cmake
+++ /dev/null
@@ -1 +0,0 @@
-clang_3_8_0.cmake
\ No newline at end of file
diff --git a/host-configs/chaos/clang.cmake b/host-configs/chaos/clang.cmake
new file mode 100755
index 0000000000..f26830b7d1
--- /dev/null
+++ b/host-configs/chaos/clang.cmake
@@ -0,0 +1,34 @@
+##
+## Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+##
+## Produced at the Lawrence Livermore National Laboratory.
+##
+## LLNL-CODE-689114
+##
+## All rights reserved.
+##
+## For release details and restrictions, please see RAJA/LICENSE.
+##
+
+# Host configuration: clang 3.8.0 toolchain installed under chaos_5_x86_64_ib.
+set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "")
+
+set(CMAKE_CXX_COMPILER "/usr/global/tools/clang/chaos_5_x86_64_ib/clang-3.8.0/bin/clang++" CACHE PATH "")
+set(CMAKE_C_COMPILER "/usr/global/tools/clang/chaos_5_x86_64_ib/clang-3.8.0/bin/clang" CACHE PATH "")
+
+# Append the optimization level for each build type to any flags already set.
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0" CACHE STRING "")
+
+set(ENABLE_OPENMP On CACHE BOOL "")
+
+# NOTE(review): INT is not one of CMake's documented cache types (BOOL,
+# FILEPATH, PATH, STRING, INTERNAL); kept to match the sibling host-configs.
+set(RAJA_RANGE_ALIGN 4 CACHE INT "")
+set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
+set(RAJA_DATA_ALIGN 64 CACHE INT "")
+set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "")
+
+# Presumably read by the main build to detect that a host-config was loaded -- confirm.
+set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "")
diff --git a/host-configs/chaos/clang_3_8_0.cmake b/host-configs/chaos/clang_3_8_0.cmake
index 3003697f72..f26830b7d1 100755
--- a/host-configs/chaos/clang_3_8_0.cmake
+++ b/host-configs/chaos/clang_3_8_0.cmake
@@ -19,7 +19,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/chaos/clang_3_9_0.cmake b/host-configs/chaos/clang_3_9_0.cmake
index 0bc96506f5..f8e0b96987 100755
--- a/host-configs/chaos/clang_3_9_0.cmake
+++ b/host-configs/chaos/clang_3_9_0.cmake
@@ -19,7 +19,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/chaos/clang_cuda.cmake b/host-configs/chaos/clang_cuda.cmake
index 1565d21e26..694a22a2e9 100755
--- a/host-configs/chaos/clang_cuda.cmake
+++ b/host-configs/chaos/clang_cuda.cmake
@@ -19,9 +19,9 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
-set(RAJA_ENABLE_CUDA On CACHE BOOL "")
-set(RAJA_ENABLE_CLANG_CUDA On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_CUDA On CACHE BOOL "")
+set(ENABLE_CLANG_CUDA On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/chaos/gcc.cmake b/host-configs/chaos/gcc.cmake
index 8ceb9fbff7..29373faa96 120000
--- a/host-configs/chaos/gcc.cmake
+++ b/host-configs/chaos/gcc.cmake
@@ -1 +1 @@
-gnu_4_9_3.cmake
\ No newline at end of file
+gcc_4_9_3.cmake
\ No newline at end of file
diff --git a/host-configs/chaos/gcc_4_9_3.cmake b/host-configs/chaos/gcc_4_9_3.cmake
index 9a3b00ff70..d48380ea37 100755
--- a/host-configs/chaos/gcc_4_9_3.cmake
+++ b/host-configs/chaos/gcc_4_9_3.cmake
@@ -18,7 +18,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -mavx -finline-fu
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -Ofast -mavx -finline-functions -finline-limit=20000" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/chaos/icpc.cmake b/host-configs/chaos/icpc.cmake
index 0fae613ef7..2ca174a28a 120000
--- a/host-configs/chaos/icpc.cmake
+++ b/host-configs/chaos/icpc.cmake
@@ -1 +1 @@
-icpc_16_0_109.cmake
\ No newline at end of file
+icpc_16_0_258.cmake
\ No newline at end of file
diff --git a/host-configs/chaos/icpc_16_0_258.cmake b/host-configs/chaos/icpc_16_0_258.cmake
index 84657a06d4..c525739a8b 100755
--- a/host-configs/chaos/icpc_16_0_258.cmake
+++ b/host-configs/chaos/icpc_16_0_258.cmake
@@ -20,7 +20,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${COMMON_FLAGS} -O3 -mar
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${COMMON_FLAGS} -O3 -march=native -ansi-alias" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${COMMON_FLAGS} -O0 -g" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/chaos/icpc_17_0_174.cmake b/host-configs/chaos/icpc_17_0_174.cmake
index f71825e78e..2013619035 100755
--- a/host-configs/chaos/icpc_17_0_174.cmake
+++ b/host-configs/chaos/icpc_17_0_174.cmake
@@ -20,7 +20,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${COMMON_FLAGS} -O3 -mar
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${COMMON_FLAGS} -O3 -march=native -ansi-alias" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${COMMON_FLAGS} -O0 -g" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/chaos/nvcc.cmake b/host-configs/chaos/nvcc.cmake
index c4b49cb1d2..a44ec9164a 100755
--- a/host-configs/chaos/nvcc.cmake
+++ b/host-configs/chaos/nvcc.cmake
@@ -22,14 +22,16 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -Wall -Werror -Wextra" C
 if(CMAKE_BUILD_TYPE MATCHES Release)
   set(RAJA_NVCC_FLAGS -O2; -restrict; -arch compute_35; -std c++11; --expt-extended-lambda; -ccbin; ${CMAKE_CXX_COMPILER} CACHE LIST "")
 elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo)
-  set(RAJA_NVCC_FLAGS -g; -G; -O2; -restrict; -arch compute_35; -std c++11; --expt-extended-lambda; -ccbin ${CMAKE_CXX_COMPILER} CACHE LIST "")
+  set(RAJA_NVCC_FLAGS -g; -lineinfo; -O2; -restrict; -arch compute_35; -std c++11; --expt-extended-lambda; -ccbin ${CMAKE_CXX_COMPILER} CACHE LIST "")
 elseif(CMAKE_BUILD_TYPE MATCHES Debug)
   set(RAJA_NVCC_FLAGS -g; -G; -O0; -restrict; -arch compute_35; -std c++11; --expt-extended-lambda; -ccbin ${CMAKE_CXX_COMPILER} CACHE LIST "")
+else()
+  set(RAJA_NVCC_FLAGS -restrict; -arch compute_35; -std c++11; --expt-extended-lambda; -ccbin ${CMAKE_CXX_COMPILER} CACHE LIST "")
 endif()
 
 
-set(RAJA_ENABLE_CUDA On CACHE BOOL "")
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_CUDA On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/chaos_5_x86_64_ib b/host-configs/chaos_5_x86_64_ib
index f022ceb675..d4cd79969a 120000
--- a/host-configs/chaos_5_x86_64_ib
+++ b/host-configs/chaos_5_x86_64_ib
@@ -1 +1 @@
-chaos/intel.cmake
\ No newline at end of file
+chaos/icpc.cmake
\ No newline at end of file
diff --git a/host-configs/linux/clang.cmake b/host-configs/linux/clang.cmake
index 6a9d5a1c7e..6f417e9a67 100755
--- a/host-configs/linux/clang.cmake
+++ b/host-configs/linux/clang.cmake
@@ -18,7 +18,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/linux/gcc.cmake b/host-configs/linux/gcc.cmake
index 080aebbcd5..913e4a04fe 100755
--- a/host-configs/linux/gcc.cmake
+++ b/host-configs/linux/gcc.cmake
@@ -18,7 +18,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -mavx -finline-fu
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -Ofast -mavx -finline-functions -finline-limit=20000" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -fpermissive" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/host-configs/linux/icpc.cmake b/host-configs/linux/icpc.cmake
index 1f5af6f998..d1c9646165 100755
--- a/host-configs/linux/icpc.cmake
+++ b/host-configs/linux/icpc.cmake
@@ -18,7 +18,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -mavx -inline-max-to
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 -mavx -inline-max-total-size=20000 -inline-forceinline -ansi-alias" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0" CACHE STRING "")
 
-set(RAJA_ENABLE_OPENMP On CACHE BOOL "")
+set(ENABLE_OPENMP On CACHE BOOL "")
 
 set(RAJA_RANGE_ALIGN 4 CACHE INT "")
 set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "")
diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp
index 19b9e2ad0f..93a2d22617 100644
--- a/include/RAJA/RAJA.hpp
+++ b/include/RAJA/RAJA.hpp
@@ -16,11 +16,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_HPP
-#define RAJA_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -30,37 +27,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_HPP
+#define RAJA_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/defines.hpp"
@@ -69,6 +42,8 @@
 
 #include "RAJA/util/Operators.hpp"
 
+#include "RAJA/util/basic_mempool.hpp"
+
 //
 // All platforms must support sequential execution.
 //
@@ -99,7 +74,7 @@
 #include "RAJA/index/IndexSet.hpp"
 
 //
-// Strongly typed index class.
+// Strongly typed index class
 //
 #include "RAJA/index/IndexValue.hpp"
 
@@ -113,7 +88,7 @@
 
 
 //
-// Multidimensional layouts and views.
+// Multidimensional layouts and views
 //
 #include "RAJA/util/Layout.hpp"
 #include "RAJA/util/OffsetLayout.hpp"
@@ -131,8 +106,13 @@
 // Generic iteration templates for perfectly nested loops
 //
 #include "RAJA/pattern/forallN.hpp"
+#include "RAJA/pattern/nested.hpp"
 
 
+
+//
+// Reduction objects
+//
 #include "RAJA/pattern/reduce.hpp"
 
 
diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in
index 455869cb4f..311e621d06 100644
--- a/include/RAJA/config.hpp.in
+++ b/include/RAJA/config.hpp.in
@@ -21,11 +21,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_config_HPP
-#define RAJA_config_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -35,39 +32,15 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_config_HPP
+#define RAJA_config_HPP
+
 #cmakedefine RAJA_USE_STL
-#cmakedefine RAJA_ENABLE_FT
+#cmakedefine ENABLE_FT
 
 #define @RAJA_FP@
 #define @RAJA_PTR@
@@ -96,12 +69,12 @@
 /*
  * Detect the host C++ compiler we are using.
  */
-#if defined(__clang__)
-#define RAJA_COMPILER_CLANG
-#elif defined(__INTEL_COMPILER)
+#if defined(__INTEL_COMPILER)
 #define RAJA_COMPILER_INTEL
 #elif defined(__xlc__)
 #define RAJA_COMPILER_XLC
+#elif defined(__clang__)
+#define RAJA_COMPILER_CLANG
 #elif defined(__PGI)
 #define RAJA_COMPILER_PGI
 #elif defined(_WIN32)
@@ -193,7 +166,7 @@ const int COHERENCE_BLOCK_SIZE = @RAJA_COHERENCE_BLOCK_SIZE@;
 
 #define RAJA_INLINE inline  __attribute__((always_inline))
 
-#if defined(RAJA_ENABLE_CUDA)
+#if defined(ENABLE_CUDA)
 #define RAJA_ALIGN_DATA(d)
 #else
 #define RAJA_ALIGN_DATA(d) __assume_aligned(d, DATA_ALIGN)
@@ -215,7 +188,7 @@ const int COHERENCE_BLOCK_SIZE = @RAJA_COHERENCE_BLOCK_SIZE@;
 
 #define RAJA_INLINE inline  __attribute__((always_inline))
 
-#if defined(RAJA_ENABLE_CUDA)
+#if defined(ENABLE_CUDA)
 #define RAJA_ALIGN_DATA(d)
 #else
 #define RAJA_ALIGN_DATA(d) __builtin_assume_aligned(d, DATA_ALIGN)
@@ -261,7 +234,7 @@ const int COHERENCE_BLOCK_SIZE = @RAJA_COHERENCE_BLOCK_SIZE@;
 
 #define RAJA_INLINE inline  __attribute__((always_inline))
 
-#if defined(RAJA_ENABLE_CUDA)
+#if defined(ENABLE_CUDA)
 #define RAJA_ALIGN_DATA(d)
 #else
 #define RAJA_ALIGN_DATA(d) __builtin_assume_aligned(d, DATA_ALIGN)
diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp
index 735aff023a..a68185c29f 100644
--- a/include/RAJA/index/IndexSet.hpp
+++ b/include/RAJA/index/IndexSet.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_IndexSet_HPP
-#define RAJA_IndexSet_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_IndexSet_HPP
+#define RAJA_IndexSet_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/index/ListSegment.hpp"
@@ -72,6 +45,13 @@ namespace RAJA
 enum PushEnd { PUSH_FRONT, PUSH_BACK };
 enum PushCopy { PUSH_COPY, PUSH_NOCOPY };
 
+template <typename... TALL>
+class StaticIndexSet;
+
+namespace policy
+{
+namespace indexset
+{
 
 ///
 /// Class representing index set execution policy.
@@ -88,8 +68,10 @@ struct ExecPolicy
   typedef SEG_EXEC_POLICY_T seg_exec;
 };
 
-template <typename... TALL>
-class StaticIndexSet;
+}  // end namespace indexset
+}  // end namespace policy
+
+using policy::indexset::ExecPolicy;
 
 
 /*!
@@ -107,7 +89,6 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   const int T0_TypeId = sizeof...(TREST);
 
 public:
-
   //! Construct empty index set
   RAJA_INLINE constexpr StaticIndexSet() : PARENT() {}
 
@@ -220,9 +201,7 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
 
   //! Returns the number of types this IndexSet can store.
   RAJA_INLINE
-  constexpr size_t getNumTypes() const {
-    return 1 + PARENT::getNumTypes();
-  }
+  constexpr size_t getNumTypes() const { return 1 + PARENT::getNumTypes(); }
 
   /*
    * IMPORTANT: Some methods to add a segment to an index set
@@ -255,20 +234,20 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
     Index_type num = getNumSegments();
 
     RangeStrideSegment Iter = (pend == PUSH_BACK)
-      ? RangeStrideSegment(0, num, 1)
-      : RangeStrideSegment(num - 1, -1, -1);
+                                  ? RangeStrideSegment(0, num, 1)
+                                  : RangeStrideSegment(num - 1, -1, -1);
 
     for (Index_type i : Iter)
       segment_push_into(i, c, pend, pcopy);
   }
 
 
-  static constexpr int value_for(PushEnd end, PushCopy copy) {
+  static constexpr int value_for(PushEnd end, PushCopy copy)
+  {
     return (end == PUSH_BACK) << 1 | (copy == PUSH_COPY);
   }
 
 public:
-
   template <typename... CALL>
   RAJA_INLINE void segment_push_into(size_t segid,
                                      StaticIndexSet<CALL...> &c,
@@ -281,18 +260,18 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
     }
     Index_type offset = getSegmentOffsets()[segid];
     switch (value_for(pend, pcopy)) {
-    case value_for(PUSH_BACK, PUSH_COPY):
-      c.push_back(*data[offset]);
-      break;
-    case value_for(PUSH_BACK, PUSH_NOCOPY):
-      c.push_back_nocopy(data[offset]);
-      break;
-    case value_for(PUSH_FRONT, PUSH_COPY):
-      c.push_front(*data[offset]);
-      break;
-    case value_for(PUSH_FRONT, PUSH_NOCOPY):
-      c.push_front_nocopy(data[offset]);
-      break;
+      case value_for(PUSH_BACK, PUSH_COPY):
+        c.push_back(*data[offset]);
+        break;
+      case value_for(PUSH_BACK, PUSH_NOCOPY):
+        c.push_back_nocopy(data[offset]);
+        break;
+      case value_for(PUSH_FRONT, PUSH_COPY):
+        c.push_front(*data[offset]);
+        break;
+      case value_for(PUSH_FRONT, PUSH_NOCOPY):
+        c.push_front_nocopy(data[offset]);
+        break;
     }
   }
 
@@ -355,7 +334,9 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   RAJA_INLINE void segmentCall(size_t segid, BODY body, ARGS... args) const
   {
     if (getSegmentTypes()[segid] != T0_TypeId) {
-      PARENT::segmentCall(segid, std::forward<BODY>(body), std::forward<ARGS>(args)...);
+      PARENT::segmentCall(segid,
+                          std::forward<BODY>(body),
+                          std::forward<ARGS>(args)...);
       return;
     }
     Index_type offset = getSegmentOffsets()[segid];
@@ -363,7 +344,6 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   }
 
 protected:
-
   //! Internal logic to add a new segment -- catch invalid type insertion
   template <typename Tnew>
   RAJA_INLINE void push_internal(Tnew *val,
@@ -412,38 +392,28 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   }
 
   //! Returns the number of indices (the total icount of segments
-  RAJA_INLINE Index_type &getTotalLength() {
-    return PARENT::getTotalLength();
-  }
+  RAJA_INLINE Index_type &getTotalLength() { return PARENT::getTotalLength(); }
 
   //! set total length of the indexset
-  RAJA_INLINE void setTotalLength(int n) {
-    return PARENT::setTotalLength(n);
-  }
+  RAJA_INLINE void setTotalLength(int n) { return PARENT::setTotalLength(n); }
 
   //! increase the total stored size of the indexset
-  RAJA_INLINE void increaseTotalLength(int n) {
+  RAJA_INLINE void increaseTotalLength(int n)
+  {
     return PARENT::increaseTotalLength(n);
   }
 
 public:
-
   using iterator = Iterators::numeric_iterator<Index_type>;
 
   //! Get an iterator to the end.
-  iterator end() const {
-    return iterator(getNumSegments());
-  }
+  iterator end() const { return iterator(getNumSegments()); }
 
   //! Get an iterator to the beginning.
-  iterator begin() const {
-    return iterator(0);
-  }
+  iterator begin() const { return iterator(0); }
 
   //! Return the number of elements in the range.
-  Index_type size() const {
-    return getNumSegments();
-  }
+  Index_type size() const { return getNumSegments(); }
 
   //!  @name IndexSet segment subsetting methods (slices ranges)
   ///
@@ -500,9 +470,11 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   {
     StaticIndexSet<T0, TREST...> *retVal = new StaticIndexSet<T0, TREST...>();
     int numSeg = getNumSegments();
-    for (auto & seg : segIds)
-      if (seg >= 0 && seg < numSeg)
+    for (auto &seg : segIds) {
+      if (seg >= 0 && seg < numSeg) {
         segment_push_into(seg, *retVal, PUSH_BACK, PUSH_NOCOPY);
+      }
+    }
     return retVal;
   }
 
@@ -526,7 +498,6 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   }
 
 protected:
-
   //! Returns the mapping of  segment_index -> segment_type
   RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentTypes()
   {
@@ -564,7 +535,6 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   }
 
 public:
-
   ///
   /// Equality operator returns true if all segments are equal; else false.
   ///
@@ -575,8 +545,7 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   RAJA_INLINE bool operator==(const StaticIndexSet<P0, PREST...> &other) const
   {
     size_t num_seg = getNumSegments();
-    if (num_seg != other.getNumSegments())
-      return false;
+    if (num_seg != other.getNumSegments()) return false;
 
     for (size_t segid = 0; segid < num_seg; ++segid) {
       if (!compareSegmentById(segid, other)) {
@@ -594,7 +563,6 @@ class StaticIndexSet<T0, TREST...> : public StaticIndexSet<TREST...>
   }
 
 private:
-
   //! vector of IndexSet data objects of type T0
   RAJA::RAJAVec<T0 *> data;
 
@@ -613,7 +581,6 @@ template <>
 class StaticIndexSet<>
 {
 public:
-
   //! create empty IndexSet
   RAJA_INLINE StaticIndexSet() : m_len(0) {}
 
@@ -642,9 +609,7 @@ class StaticIndexSet<>
   }
 
 protected:
-  RAJA_INLINE static size_t getNumTypes() {
-    return 0;
-  }
+  RAJA_INLINE static size_t getNumTypes() { return 0; }
 
   template <typename T>
   RAJA_INLINE constexpr bool isValidSegmentType(T const &) const
@@ -653,22 +618,17 @@ class StaticIndexSet<>
     return false;
   }
 
-  RAJA_INLINE static int getNumSegments()
-  {
-    return 0;
-  }
+  RAJA_INLINE static int getNumSegments() { return 0; }
 
-  RAJA_INLINE static size_t getLength()
-  {
-    return 0;
-  }
+  RAJA_INLINE static size_t getLength() { return 0; }
 
   template <typename BODY, typename... ARGS>
   RAJA_INLINE void segmentCall(size_t, BODY, ARGS...) const
   {
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentTypes() {
+  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentTypes()
+  {
     return segment_types;
   }
 
@@ -677,7 +637,8 @@ class StaticIndexSet<>
     return segment_types;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentOffsets() {
+  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentOffsets()
+  {
     return segment_offsets;
   }
 
@@ -686,7 +647,8 @@ class StaticIndexSet<>
     return segment_offsets;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentIcounts() {
+  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentIcounts()
+  {
     return segment_icounts;
   }
 
@@ -695,17 +657,11 @@ class StaticIndexSet<>
     return segment_icounts;
   }
 
-  RAJA_INLINE Index_type &getTotalLength() {
-    return m_len;
-  }
+  RAJA_INLINE Index_type &getTotalLength() { return m_len; }
 
-  RAJA_INLINE void setTotalLength(int n) {
-    m_len = n;
-  }
+  RAJA_INLINE void setTotalLength(int n) { m_len = n; }
 
-  RAJA_INLINE void increaseTotalLength(int n) {
-    m_len += n;
-  }
+  RAJA_INLINE void increaseTotalLength(int n) { m_len += n; }
 
   template <typename P0, typename... PREST>
   RAJA_INLINE bool compareSegmentById(
@@ -724,13 +680,13 @@ class StaticIndexSet<>
   template <typename P0>
   RAJA_INLINE P0 &getSegment(size_t)
   {
-    return *((P0*)(this - this));
+    return *((P0 *)(this - this));
   }
 
   template <typename P0>
   RAJA_INLINE P0 const &getSegment(size_t) const
   {
-    return *((P0*)(this - this));
+    return *((P0 *)(this - this));
   }
 
   template <typename... CALL>
@@ -752,31 +708,26 @@ class StaticIndexSet<>
   }
 
 public:
-
   using iterator = Iterators::numeric_iterator<Index_type>;
 
-  RAJA_INLINE int getStartingIcount(int segid) {
+  RAJA_INLINE int getStartingIcount(int segid)
+  {
     return segment_icounts[segid];
   }
 
-  RAJA_INLINE int getStartingIcount(int segid) const {
+  RAJA_INLINE int getStartingIcount(int segid) const
+  {
     return segment_icounts[segid];
   }
 
   //! Get an iterator to the end.
-  iterator end() const {
-    return iterator(getNumSegments());
-  }
+  iterator end() const { return iterator(getNumSegments()); }
 
   //! Get an iterator to the beginning.
-  iterator begin() const {
-    return iterator(0);
-  }
+  iterator begin() const { return iterator(0); }
 
   //! Return the number of elements in the range.
-  Index_type size() const {
-    return getNumSegments();
-  }
+  Index_type size() const { return getNumSegments(); }
 
 private:
   //! Vector of segment types:    seg_index -> seg_type
diff --git a/include/RAJA/index/IndexSetBuilders.hpp b/include/RAJA/index/IndexSetBuilders.hpp
index 841ebcfb4b..5ca8001b7b 100644
--- a/include/RAJA/index/IndexSetBuilders.hpp
+++ b/include/RAJA/index/IndexSetBuilders.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_IndexSetBuilders_HPP
-#define RAJA_IndexSetBuilders_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,46 +19,21 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_IndexSetBuilders_HPP
+#define RAJA_IndexSetBuilders_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/types.hpp"
+#include "RAJA/index/IndexSet.hpp"
 
 namespace RAJA
 {
 
-class IndexSet;
-
 /*!
  ******************************************************************************
  *
@@ -126,12 +98,12 @@ void buildLockFreeBlockIndexset(IndexSet& iset,
  ******************************************************************************
  */
 void buildLockFreeColorIndexset(IndexSet& iset,
-                                int const* domainToRange,
+                                Index_type const* domainToRange,
                                 int numEntity,
                                 int numRangePerDomain,
                                 int numEntityRange,
-                                int* elemPermutation = 0l,
-                                int* ielemPermutation = 0l);
+                                Index_type* elemPermutation = nullptr,
+                                Index_type* ielemPermutation = nullptr);
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/index/IndexSetUtils.hpp b/include/RAJA/index/IndexSetUtils.hpp
index df59d4a8f0..828cb88188 100644
--- a/include/RAJA/index/IndexSetUtils.hpp
+++ b/include/RAJA/index/IndexSetUtils.hpp
@@ -9,11 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_IndexSetUtils_HPP
-#define RAJA_IndexSetUtils_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -23,37 +20,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_IndexSetUtils_HPP
+#define RAJA_IndexSetUtils_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/pattern/forall.hpp"
 #include "RAJA/policy/sequential.hpp"
diff --git a/include/RAJA/index/IndexValue.hpp b/include/RAJA/index/IndexValue.hpp
index 86e86110ad..f7de53cffc 100644
--- a/include/RAJA/index/IndexValue.hpp
+++ b/include/RAJA/index/IndexValue.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_INDEXVALUE_HPP
-#define RAJA_INDEXVALUE_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_INDEXVALUE_HPP
+#define RAJA_INDEXVALUE_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/defines.hpp"
diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp
index 8262c5a23b..1e72027301 100644
--- a/include/RAJA/index/ListSegment.hpp
+++ b/include/RAJA/index/ListSegment.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_ListSegment_HPP
-#define RAJA_ListSegment_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_ListSegment_HPP
+#define RAJA_ListSegment_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/defines.hpp"
 #include "RAJA/util/types.hpp"
@@ -128,12 +101,14 @@ class TypedListSegment
     m_data = new T[m_size];
   }
 
+#ifdef RAJA_ENABLE_CUDA
   //! copy data from container using BlockCopy
   template <typename Container>
   void copy(Container&& src, BlockCopy)
   {
     cudaErrchk(cudaMemcpy(m_data, &(*src.begin()), m_size * sizeof(T), cudaMemcpyDefault));
   }
+#endif
 
   //! copy data from container using TrivialCopy
   template <typename Container>
@@ -237,10 +212,9 @@ class TypedListSegment
   ///
   RAJA_HOST_DEVICE void swap(TypedListSegment& other)
   {
-    using std::swap;
-    swap(m_data, other.m_data);
-    swap(m_size, other.m_size);
-    swap(m_owned, other.m_owned);
+    camp::safe_swap(m_data, other.m_data);
+    camp::safe_swap(m_size, other.m_size);
+    camp::safe_swap(m_owned, other.m_owned);
   }
 
   //! accessor to get the end iterator for a TypedListSegment
diff --git a/include/RAJA/index/RangeSegment.hpp b/include/RAJA/index/RangeSegment.hpp
index 318c917131..0fd220bfe4 100644
--- a/include/RAJA/index/RangeSegment.hpp
+++ b/include/RAJA/index/RangeSegment.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_RangeSegment_HPP
-#define RAJA_RangeSegment_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,43 +19,21 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_RangeSegment_HPP
+#define RAJA_RangeSegment_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/internal/Iterators.hpp"
 
 #include "RAJA/util/concepts.hpp"
 
+#include <iostream>
+
 namespace RAJA
 {
 
@@ -138,6 +113,15 @@ struct TypedRangeSegment {
   {
   }
 
+  //! copy assignment: copies the begin, end and size members of o
+  RAJA_HOST_DEVICE TypedRangeSegment& operator=(TypedRangeSegment const& o)
+  {
+    m_begin = o.m_begin;
+    m_end = o.m_end;
+    m_size = o.m_size;
+    return *this;
+  }
+
   //! destructor
   RAJA_HOST_DEVICE ~TypedRangeSegment() {}
 
@@ -147,10 +131,9 @@ struct TypedRangeSegment {
    */
   RAJA_HOST_DEVICE void swap(TypedRangeSegment& other)
   {
-    using std::swap;
-    swap(m_begin, other.m_begin);
-    swap(m_end, other.m_end);
-    swap(m_size, other.m_size);
+    camp::safe_swap(m_begin, other.m_begin);
+    camp::safe_swap(m_end, other.m_end);
+    camp::safe_swap(m_size, other.m_size);
   }
 
   //! obtain an iterator to the beginning of this TypedRangeSegment
@@ -171,6 +154,19 @@ struct TypedRangeSegment {
    */
   RAJA_HOST_DEVICE StorageT size() const { return m_size; }
 
+  //! Create a slice of this instance as a new instance
+  /*!
+   * \return A new instance spanning *begin() + begin to *begin() + begin +
+   * length, with the end limited to the end of this segment
+   */
+  RAJA_HOST_DEVICE TypedRangeSegment slice(Index_type begin,
+                                           Index_type length) const
+  {
+    auto start = m_begin[0] + begin;  // NOTE(review): start is not clamped
+    auto end = start + length > m_end[0] ? m_end[0] : start + length;
+    return TypedRangeSegment{start, end};
+  }
+
   //! equality comparison
   /*!
    * \return true if and only if the begin, end, and size match
@@ -278,7 +274,7 @@ struct TypedRangeStrideSegment {
         m_end(iterator(end, stride)),
         // essentially a ceil((end-begin)/stride) but using integer math,
         // and allowing for negative strides
-        m_size((end - begin + stride - ( stride > 0 ? 1 : -1  ) ) / stride)
+        m_size((end - begin + stride - (stride > 0 ? 1 : -1)) / stride)
   {
     // if m_size was initialized as negative, that indicates a zero iteration
     // space
@@ -311,10 +307,9 @@ struct TypedRangeStrideSegment {
    */
   RAJA_HOST_DEVICE void swap(TypedRangeStrideSegment& other)
   {
-    using std::swap;
-    swap(m_begin, other.m_begin);
-    swap(m_end, other.m_end);
-    swap(m_size, other.m_size);
+    camp::safe_swap(m_begin, other.m_begin);
+    camp::safe_swap(m_end, other.m_end);
+    camp::safe_swap(m_size, other.m_size);
   }
 
   //! obtain an iterator to the beginning of this TypedRangeStrideSegment
@@ -338,6 +333,19 @@ struct TypedRangeStrideSegment {
    */
   RAJA_HOST_DEVICE StorageT size() const { return m_size; }
 
+  //! Create a slice of this instance as a new instance
+  /*! \return A new instance spanning *begin() + begin * stride to *begin() +
+   * (begin + length) * stride, clamped to the end of this segment */
+  RAJA_HOST_DEVICE TypedRangeStrideSegment slice(Index_type begin,
+                                                 Index_type length) const
+  {
+    auto first = *(this->begin() + begin);
+    auto last = *(this->begin() + begin + length);
+    auto limit = *(this->begin() + m_size);  // stride-aligned segment end
+    // "past the end" is greater for positive strides, smaller for negative
+    if (m_begin.stride > 0 ? last > limit : last < limit) last = limit;
+    return TypedRangeStrideSegment{first, last, m_begin.stride};
+  }
+
   //! equality comparison
   /*!
    * \return true if and only if the begin, end, and size match
@@ -396,8 +404,8 @@ using common_type_t = typename common_type<Ts...>::type;
 template <typename BeginT,
           typename EndT,
           typename Common = detail::common_type_t<BeginT, EndT>>
-RAJA_HOST_DEVICE
-TypedRangeSegment<Common> make_range(BeginT&& begin, EndT&& end)
+RAJA_HOST_DEVICE TypedRangeSegment<Common> make_range(BeginT&& begin,
+                                                      EndT&& end)
 {
   return {begin, end};
 }
@@ -416,10 +424,10 @@ template <typename BeginT,
           typename EndT,
           typename StrideT,
           typename Common = detail::common_type_t<BeginT, EndT, StrideT>>
-RAJA_HOST_DEVICE
-TypedRangeStrideSegment<Common> make_strided_range(BeginT&& begin,
-                                                   EndT&& end,
-                                                   StrideT&& stride)
+RAJA_HOST_DEVICE TypedRangeStrideSegment<Common> make_strided_range(
+    BeginT&& begin,
+    EndT&& end,
+    StrideT&& stride)
 {
   return {begin, end, stride};
 }
@@ -429,12 +437,12 @@ namespace concepts
 
 template <typename T, typename U>
 struct RangeConstructible
-    : DefineConcept(val<RAJA::detail::common_type_t<T, U>>()) {
+    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U>>()) {
 };
 
 template <typename T, typename U, typename V>
 struct RangeStrideConstructible
-    : DefineConcept(val<RAJA::detail::common_type_t<T, U, V>>()) {
+    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U, V>>()) {
 };
 
 }  // closing brace for concepts namespace
@@ -457,16 +465,16 @@ namespace std
 
 //! specialization of swap for TypedRangeSegment
 template <typename T>
-RAJA_INLINE void swap(RAJA::TypedRangeSegment<T>& a,
-                      RAJA::TypedRangeSegment<T>& b)
+RAJA_HOST_DEVICE RAJA_INLINE void swap(RAJA::TypedRangeSegment<T>& a,
+                                       RAJA::TypedRangeSegment<T>& b)
 {
   a.swap(b);
 }
 
 //! specialization of swap for TypedRangeStrideSegment
 template <typename T>
-RAJA_INLINE void swap(RAJA::TypedRangeStrideSegment<T>& a,
-                      RAJA::TypedRangeStrideSegment<T>& b)
+RAJA_HOST_DEVICE RAJA_INLINE void swap(RAJA::TypedRangeStrideSegment<T>& a,
+                                       RAJA::TypedRangeStrideSegment<T>& b)
 {
   a.swap(b);
 }
diff --git a/include/RAJA/internal/DepGraphNode.hpp b/include/RAJA/internal/DepGraphNode.hpp
index 7398d97a38..bcd04b13d5 100644
--- a/include/RAJA/internal/DepGraphNode.hpp
+++ b/include/RAJA/internal/DepGraphNode.hpp
@@ -9,11 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_DepGraphNode_HPP
-#define RAJA_DepGraphNode_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -23,37 +20,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_DepGraphNode_HPP
+#define RAJA_DepGraphNode_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/types.hpp"
diff --git a/include/RAJA/internal/ForallNPolicy.hpp b/include/RAJA/internal/ForallNPolicy.hpp
index b2d734b4ee..570203fb59 100644
--- a/include/RAJA/internal/ForallNPolicy.hpp
+++ b/include/RAJA/internal/ForallNPolicy.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_internal_ForallNPolicy_HPP
-#define RAJA_internal_ForallNPolicy_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution ind use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_internal_ForallNPolicy_HPP
+#define RAJA_internal_ForallNPolicy_HPP
+
 #include "RAJA/config.hpp"
 
 namespace RAJA
diff --git a/include/RAJA/internal/IndexArray.hpp b/include/RAJA/internal/IndexArray.hpp
deleted file mode 100644
index b6ed89f2f3..0000000000
--- a/include/RAJA/internal/IndexArray.hpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file for array indexing helpers.
- *
- ******************************************************************************
- */
-
-
-#ifndef RAJA_DETAIL_INDEXARRAY_HPP
-#define RAJA_DETAIL_INDEXARRAY_HPP
-
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-//
-// Produced at the Lawrence Livermore National Laboratory
-//
-// LLNL-CODE-689114
-//
-// All rights reserved.
-//
-// This file is part of RAJA.
-//
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
-
-#include <RAJA/config.hpp>
-#include <RAJA/internal/LegacyCompatibility.hpp>
-
-namespace RAJA
-{
-namespace detail
-{
-template <size_t Offset, typename Type>
-struct index_storage {
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  Type& get() { return data; }
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr const Type& get() const { return data; }
-  Type data;
-};
-
-template <typename StorageType>
-RAJA_HOST_DEVICE RAJA_INLINE constexpr auto get_data(StorageType& s)
-    -> decltype(s.get())
-{
-  return s.get();
-}
-template <typename StorageType>
-RAJA_HOST_DEVICE RAJA_INLINE constexpr auto get_data(const StorageType& s)
-    -> decltype(s.get())
-{
-  return s.get();
-}
-
-template <size_t I, typename AType_in>
-struct select_element {
-  using AType = typename std::remove_reference<AType_in>::type;
-  using return_type = typename AType::type&;
-  using const_return_type = const typename AType::type&;
-  using value_type = typename AType::type;
-
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  static constexpr return_type get(AType_in& a, size_t offset)
-  {
-    return (offset == I) ? get_data<index_storage<I, value_type>>(a)
-                         : select_element<I - 1, AType_in>::get(a, offset);
-  }
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  static constexpr const_return_type get(const AType_in& a, size_t offset)
-  {
-    return (offset == I) ? get_data<const index_storage<I, value_type>>(a)
-                         : select_element<I - 1, AType_in>::get(a, offset);
-  }
-};
-
-template <typename AType_in>
-struct select_element<0, AType_in> {
-  using AType = typename std::remove_reference<AType_in>::type;
-  using return_type = typename AType::type&;
-  using const_return_type = const typename AType::type&;
-  using value_type = typename AType::type;
-
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  static constexpr return_type get(AType_in& a, size_t offset)
-  {
-    return get_data<index_storage<0, value_type>>(a);
-  }
-
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  static constexpr const_return_type get(const AType_in& a, size_t offset)
-  {
-    return get_data<const index_storage<0, value_type>>(a);
-  }
-};
-
-template <typename... Types>
-struct index_array_helper;
-
-template <typename Type, size_t... orest>
-struct index_array_helper<Type, VarOps::index_sequence<orest...>>
-    : index_storage<orest, Type>... {
-  using type = Type;
-  using my_type = index_array_helper<Type, VarOps::index_sequence<orest...>>;
-  static constexpr size_t size = sizeof...(orest);
-
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  // constexpr : c++14 only
-  Type& operator[](size_t offset)
-  {
-    return select_element<size - 1, my_type>::get(*this, offset);
-  }
-
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr const Type& operator[](size_t offset) const
-  {
-    return select_element<size - 1, my_type>::get(*this, offset);
-  }
-};
-
-template <typename Type, size_t... orest>
-constexpr size_t
-    index_array_helper<Type, VarOps::index_sequence<orest...>>::size;
-}
-
-template <size_t Size, typename Type>
-struct index_array
-    : public detail::index_array_helper<Type,
-                                        VarOps::make_index_sequence<Size>> {
-  static_assert(Size > 0, "index_arrays must have at least one element");
-  using base =
-      detail::index_array_helper<Type, VarOps::make_index_sequence<Size>>;
-  using base::index_array_helper;
-  using base::operator[];
-};
-
-template <size_t Offset, typename Type>
-RAJA_HOST_DEVICE RAJA_INLINE Type& get(detail::index_storage<Offset, Type>& s)
-{
-  return s.data;
-}
-
-template <size_t Offset, typename Type>
-RAJA_HOST_DEVICE RAJA_INLINE const Type& get(
-    const detail::index_storage<Offset, Type>& s)
-{
-  return s.data;
-}
-
-namespace detail
-{
-template <typename Type, size_t... Seq, typename... Args>
-RAJA_HOST_DEVICE RAJA_INLINE auto make_index_array_helper(
-    VarOps::index_sequence<Seq...>,
-    Args... args) -> index_array<sizeof...(args), Type>
-{
-  index_array<sizeof...(args), Type> arr{};
-  VarOps::ignore_args((get<Seq>(arr) = args)...);
-  return arr;
-};
-}
-
-template <typename Arg1, typename... Args>
-RAJA_HOST_DEVICE RAJA_INLINE auto make_index_array(Arg1 arg1, Args... args)
-    -> index_array<sizeof...(args) + 1, Arg1>
-{
-  return detail::make_index_array_helper<Arg1>(
-      VarOps::make_index_sequence<sizeof...(args) + 1>(), arg1, args...);
-};
-
-template <size_t Size, typename Type>
-std::ostream& operator<<(std::ostream& os, index_array<Size, Type> const& a)
-{
-  // const detail::index_array_helper<Type, VarOps::make_index_sequence<Size>> &
-  // ah = a;
-  // os << "array templated iteration: " << ah << std::endl;
-  // os << "array runtime operator iteration: ";
-  os << '[';
-  for (size_t i = 0; i < Size - 1; ++i)
-    os << a[i] << ", ";
-  if (Size - 1 > 0) os << a[Size - 1];
-  os << ']';
-  return os;
-}
-}
-
-#endif /* RAJA_DETAIL_INDEXARRAY_HPP */
diff --git a/include/RAJA/internal/Iterators.hpp b/include/RAJA/internal/Iterators.hpp
index 807056fa1f..eccc3fbe2c 100644
--- a/include/RAJA/internal/Iterators.hpp
+++ b/include/RAJA/internal/Iterators.hpp
@@ -9,11 +9,8 @@
  */
 
 
-#ifndef RAJA_ITERATORS_HPP
-#define RAJA_ITERATORS_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -23,37 +20,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_ITERATORS_HPP
+#define RAJA_ITERATORS_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/defines.hpp"
 #include "RAJA/util/types.hpp"
@@ -248,6 +221,7 @@ class strided_numeric_iterator : public base_iterator<Type, DifferenceType>
       : base(rhs), stride(stride)
   {
   }
+
   RAJA_HOST_DEVICE constexpr strided_numeric_iterator(
       const strided_numeric_iterator& rhs)
       : base(rhs.val), stride(rhs.stride)
diff --git a/include/RAJA/internal/LegacyCompatibility.hpp b/include/RAJA/internal/LegacyCompatibility.hpp
index d36612653d..168be19fe3 100644
--- a/include/RAJA/internal/LegacyCompatibility.hpp
+++ b/include/RAJA/internal/LegacyCompatibility.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_LEGACY_COMPATIBILITY_HPP
-#define RAJA_LEGACY_COMPATIBILITY_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,41 +19,19 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_LEGACY_COMPATIBILITY_HPP
+#define RAJA_LEGACY_COMPATIBILITY_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/defines.hpp"
 
+#include "camp/camp.hpp"
+
 #if (!defined(__INTEL_COMPILER)) && (!defined(RAJA_COMPILER_MSVC))
 static_assert(__cplusplus >= 201103L,
               "C++ standards below 2011 are not "
@@ -104,30 +79,6 @@ namespace VarOps
 // Basics, using c++14 semantics in a c++11 compatible way, credit to libc++
 
 // Forward
-template <class T>
-struct remove_reference {
-  typedef T type;
-};
-template <class T>
-struct remove_reference<T&> {
-  typedef T type;
-};
-template <class T>
-struct remove_reference<T&&> {
-  typedef T type;
-};
-template <class T>
-RAJA_HOST_DEVICE RAJA_INLINE constexpr T&& forward(
-    typename remove_reference<T>::type& t) noexcept
-{
-  return static_cast<T&&>(t);
-}
-template <class T>
-RAJA_HOST_DEVICE RAJA_INLINE constexpr T&& forward(
-    typename remove_reference<T>::type&& t) noexcept
-{
-  return static_cast<T&&>(t);
-}
 
 // FoldL
 template <typename Op, typename... Rest>
@@ -149,12 +100,11 @@ template <typename Op,
           typename Arg3,
           typename... Rest>
 struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
-  using Ret =
-      typename foldl_impl<Op,
-                          typename std::result_of<Op(
-                              typename std::result_of<Op(Arg1, Arg2)>::type,
-                              Arg3)>::type,
-                          Rest...>::Ret;
+  using Ret = typename foldl_impl<
+      Op,
+      typename std::result_of<Op(typename std::result_of<Op(Arg1, Arg2)>::type,
+                                 Arg3)>::type,
+      Rest...>::Ret;
 };
 
 template <typename Op, typename Arg1>
@@ -162,7 +112,7 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(
     Op&& RAJA_UNUSED_ARG(operation),
     Arg1&& arg) -> typename foldl_impl<Op, Arg1>::Ret
 {
-  return forward<Arg1&&>(arg);
+  return camp::forward<Arg1>(arg);
 }
 
 template <typename Op, typename Arg1, typename Arg2>
@@ -171,7 +121,8 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation,
                                                   Arg2&& arg2) ->
     typename foldl_impl<Op, Arg1, Arg2>::Ret
 {
-  return forward<Op&&>(operation)(forward<Arg1&&>(arg1), forward<Arg2&&>(arg2));
+  return camp::forward<Op>(operation)(camp::forward<Arg1>(arg1),
+                                        camp::forward<Arg2>(arg2));
 }
 
 template <typename Op,
@@ -186,12 +137,12 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation,
                                                   Rest&&... rest) ->
     typename foldl_impl<Op, Arg1, Arg2, Arg3, Rest...>::Ret
 {
-  return foldl(forward<Op&&>(operation),
-               forward<Op&&>(
-                   operation)(forward<Op&&>(operation)(forward<Arg1&&>(arg1),
-                                                       forward<Arg2&&>(arg2)),
-                              forward<Arg3&&>(arg3)),
-               forward<Rest&&>(rest)...);
+  return foldl(camp::forward<Op>(operation),
+               camp::forward<Op>(operation)(
+                   camp::forward<Op>(operation)(camp::forward<Arg1>(arg1),
+                                                  camp::forward<Arg2>(arg2)),
+                   camp::forward<Arg3>(arg3)),
+               camp::forward<Rest>(rest)...);
 }
 
 struct adder {
@@ -247,15 +198,6 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr Result max(Args... args)
 //     : value() { }
 // };
 
-// Index sequence
-
-template <size_t... Ints>
-struct integer_sequence {
-  using type = integer_sequence;
-  static constexpr size_t size = sizeof...(Ints);
-  static constexpr std::array<size_t, sizeof...(Ints)> value{{Ints...}};
-};
-
 template <template <class...> class Seq, class First, class... Ints>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr auto rotate_left_one(
     const Seq<First, Ints...>) -> Seq<Ints..., First>
@@ -263,51 +205,13 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto rotate_left_one(
   return Seq<Ints..., First>{};
 }
 
-template <size_t... Ints>
-constexpr size_t integer_sequence<Ints...>::size;
-template <size_t... Ints>
-constexpr std::array<size_t, sizeof...(Ints)> integer_sequence<Ints...>::value;
-
-namespace integer_sequence_detail
-{
-// using aliases for cleaner syntax
-template <class T>
-using Invoke = typename T::type;
-
-template <class S1, class S2>
-struct concat;
-
-template <size_t... I1, size_t... I2>
-struct concat<integer_sequence<I1...>, integer_sequence<I2...>>
-    : integer_sequence<I1..., (sizeof...(I1) + I2)...> {
-};
-
-template <class S1, class S2>
-using Concat = Invoke<concat<S1, S2>>;
-
-template <size_t N>
-struct gen_seq;
-template <size_t N>
-using GenSeq = Invoke<gen_seq<N>>;
-
-template <size_t N>
-struct gen_seq : Concat<GenSeq<N / 2>, GenSeq<N - N / 2>> {
-};
-
-template <>
-struct gen_seq<0> : integer_sequence<> {
-};
-template <>
-struct gen_seq<1> : integer_sequence<0> {
-};
-}
 
+// Index sequence
 template <size_t Upper>
-using make_index_sequence =
-    typename integer_sequence_detail::gen_seq<Upper>::type;
+using make_index_sequence = typename camp::make_int_seq<size_t, Upper>::type;
 
 template <size_t... Ints>
-using index_sequence = integer_sequence<Ints...>;
+using index_sequence = camp::int_seq<size_t, Ints...>;
 
 // Invoke
 
@@ -408,14 +312,14 @@ struct get_arg_at {
   RAJA_HOST_DEVICE RAJA_INLINE static constexpr auto value(
       First&& RAJA_UNUSED_ARG(first),
       Rest&&... rest)
-      -> decltype(VarOps::forward<
-                  typename VarOps::get_type_at<index - 1, Rest...>::type>(
-          get_arg_at<index - 1>::value(VarOps::forward<Rest>(rest)...)))
+      -> decltype(
+          camp::forward<typename VarOps::get_type_at<index - 1, Rest...>::type>(
+              get_arg_at<index - 1>::value(camp::forward<Rest>(rest)...)))
   {
     static_assert(index < sizeof...(Rest) + 1, "index is past the end");
-    return VarOps::forward<
+    return camp::forward<
         typename VarOps::get_type_at<index - 1, Rest...>::type>(
-        get_arg_at<index - 1>::value(VarOps::forward<Rest>(rest)...));
+        get_arg_at<index - 1>::value(camp::forward<Rest>(rest)...));
   }
 };
 
@@ -424,10 +328,9 @@ struct get_arg_at<0> {
   template <typename First, typename... Rest>
   RAJA_HOST_DEVICE RAJA_INLINE static constexpr auto value(
       First&& first,
-      Rest&&... RAJA_UNUSED_ARG(rest))
-      -> decltype(VarOps::forward<First>(first))
+      Rest&&... RAJA_UNUSED_ARG(rest)) -> decltype(camp::forward<First>(first))
   {
-    return VarOps::forward<First>(first);
+    return camp::forward<First>(first);
   }
 };
 }
diff --git a/include/RAJA/internal/MemUtils_CPU.hpp b/include/RAJA/internal/MemUtils_CPU.hpp
index 854c01c3bf..6ea60a9bc2 100644
--- a/include/RAJA/internal/MemUtils_CPU.hpp
+++ b/include/RAJA/internal/MemUtils_CPU.hpp
@@ -9,11 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_MemUtils_CPU_HPP
-#define RAJA_MemUtils_CPU_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -23,37 +20,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_MemUtils_CPU_HPP
+#define RAJA_MemUtils_CPU_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/types.hpp"
diff --git a/include/RAJA/internal/RAJAVec.hpp b/include/RAJA/internal/RAJAVec.hpp
index b78b237564..f8aae4bc90 100644
--- a/include/RAJA/internal/RAJAVec.hpp
+++ b/include/RAJA/internal/RAJAVec.hpp
@@ -9,11 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_RAJAVec_HPP
-#define RAJA_RAJAVec_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -23,37 +20,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_RAJAVec_HPP
+#define RAJA_RAJAVec_HPP
+
 #include "RAJA/config.hpp"
 
 #include <memory>
diff --git a/include/RAJA/internal/Span.hpp b/include/RAJA/internal/Span.hpp
index 8e60e9a5ef..bd1a3ca8b6 100644
--- a/include/RAJA/internal/Span.hpp
+++ b/include/RAJA/internal/Span.hpp
@@ -9,11 +9,8 @@
  */
 
 
-#ifndef RAJA_SPAN_HPP
-#define RAJA_SPAN_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -23,37 +20,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_SPAN_HPP
+#define RAJA_SPAN_HPP
+
 #include <type_traits>
 
 #include "RAJA/util/concepts.hpp"
diff --git a/include/RAJA/internal/ThreadUtils_CPU.hpp b/include/RAJA/internal/ThreadUtils_CPU.hpp
index c2898de9bb..12107fbd73 100644
--- a/include/RAJA/internal/ThreadUtils_CPU.hpp
+++ b/include/RAJA/internal/ThreadUtils_CPU.hpp
@@ -9,11 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_ThreadUtils_CPU_HPP
-#define RAJA_ThreadUtils_CPU_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -23,37 +20,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_ThreadUtils_CPU_HPP
+#define RAJA_ThreadUtils_CPU_HPP
+
 #include "RAJA/config.hpp"
 
 namespace RAJA
diff --git a/include/RAJA/internal/fault_tolerance.hpp b/include/RAJA/internal/fault_tolerance.hpp
index 51f649cb7e..834c0230fc 100644
--- a/include/RAJA/internal/fault_tolerance.hpp
+++ b/include/RAJA/internal/fault_tolerance.hpp
@@ -18,11 +18,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_fault_tolerance_HPP
-#define RAJA_fault_tolerance_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -32,37 +29,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_fault_tolerance_HPP
+#define RAJA_fault_tolerance_HPP
+
 #include "RAJA/config.hpp"
 
 #ifdef RAJA_ENABLE_FT
diff --git a/include/RAJA/module.modulemap b/include/RAJA/module.modulemap
new file mode 100644
index 0000000000..1a59231217
--- /dev/null
+++ b/include/RAJA/module.modulemap
@@ -0,0 +1,6 @@
+module RAJA {
+  requires cplusplus11
+  umbrella header "RAJA.hpp"
+
+  export *
+}
diff --git a/include/RAJA/module.private.modulemap b/include/RAJA/module.private.modulemap
new file mode 100644
index 0000000000..566f0715c6
--- /dev/null
+++ b/include/RAJA/module.private.modulemap
@@ -0,0 +1,12 @@
+explicit module RAJA.internal {
+  header "internal/DepGraphNode.hpp"
+  header "internal/fault_tolerance.hpp"
+  header "internal/ForallNPolicy.hpp"
+  header "internal/Iterators.hpp"
+  header "internal/LegacyCompatibility.hpp"
+  header "internal/MemUtils_CPU.hpp"
+  header "internal/RAJAVec.hpp"
+  header "internal/Span.hpp"
+  header "internal/ThreadUtils_CPU.hpp"
+  header "util/Timer.hpp"
+}
diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp
index 05488afb81..b5b2cf4e21 100644
--- a/include/RAJA/pattern/atomic.hpp
+++ b/include/RAJA/pattern/atomic.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_pattern_atomic_HPP
-#define RAJA_pattern_atomic_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_pattern_atomic_HPP
+#define RAJA_pattern_atomic_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/defines.hpp"
 #include "RAJA/policy/atomic_builtin.hpp"
diff --git a/include/RAJA/pattern/detail/forall.hpp b/include/RAJA/pattern/detail/forall.hpp
new file mode 100644
index 0000000000..67cb4e24ea
--- /dev/null
+++ b/include/RAJA/pattern/detail/forall.hpp
@@ -0,0 +1,29 @@
+#ifndef RAJA_PATTERN_DETAIL_FORALL_HPP
+#define RAJA_PATTERN_DETAIL_FORALL_HPP
+
+// std::begin, std::end and std::distance are named by the macros below, so
+// pull in their declarations here instead of relying on the including file.
+#include <iterator>
+
+/*!
+ * Introduces three locals into the enclosing scope for CONTAINER:
+ *   begin##SUFFIX, end##SUFFIX and distance##SUFFIX.
+ *
+ * begin/end/distance are called unqualified after using-declarations for the
+ * std:: versions.
+ *
+ * The expansion is a sequence of statements without a trailing semicolon:
+ * the caller supplies the final ';', and the macro must not be used as the
+ * unbraced body of an if/for/while. CONTAINER is expanded twice (once for
+ * begin, once for end), so it should not be an expression with side effects.
+ */
+#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX) \
+  using std::begin;                                  \
+  using std::end;                                    \
+  using std::distance;                               \
+  auto begin##SUFFIX = begin(CONTAINER);             \
+  auto end##SUFFIX = end(CONTAINER);                 \
+  auto distance##SUFFIX = distance(begin##SUFFIX, end##SUFFIX)
+
+//! RAJA_EXTRACT_BED_SUFFIXED with the fixed suffix `_it`
+//! (declares begin_it, end_it and distance_it).
+#define RAJA_EXTRACT_BED_IT(CONTAINER) RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, _it)
+
+#endif /* RAJA_PATTERN_DETAIL_FORALL_HPP */
diff --git a/include/RAJA/pattern/detail/reduce.hpp b/include/RAJA/pattern/detail/reduce.hpp
index 42c67ca47a..c834db1cd3 100644
--- a/include/RAJA/pattern/detail/reduce.hpp
+++ b/include/RAJA/pattern/detail/reduce.hpp
@@ -1,8 +1,15 @@
-#ifndef RAJA_PATTERN_DETAIL_REDUCE_HPP
-#define RAJA_PATTERN_DETAIL_REDUCE_HPP
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief  Base types used in common for RAJA reducer objects.
+ *
+ ******************************************************************************
+ */
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -12,37 +19,12 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_PATTERN_DETAIL_REDUCE_HPP
+#define RAJA_PATTERN_DETAIL_REDUCE_HPP
 
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/types.hpp"
@@ -70,7 +52,7 @@ namespace RAJA
 namespace reduce
 {
 
-#ifdef RAJA_ENABLE_TARGET_OPENMP
+#ifdef RAJA_ENABLE_TARGET_OPENMP  // paired with the "end declare target" guard below
 #pragma omp declare target
 #endif
 
@@ -80,7 +62,9 @@ namespace detail
 template <typename T, template <typename...> class Op>
 struct op_adapter : private Op<T, T, T> {
   using operator_type = Op<T, T, T>;
-  static constexpr T identity() { return operator_type::identity(); }
+  RAJA_HOST_DEVICE static constexpr T identity() {
+    return operator_type::identity();
+  }
   RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val, const T v) const
   {
     val = operator_type::operator()(val, v);
@@ -100,7 +84,7 @@ template <typename T>
 struct max : detail::op_adapter<T, RAJA::operators::maximum> {
 };
 
-#ifdef RAJA_ENABLE_TARGET_OPENMP
+#ifdef RAJA_ENABLE_TARGET_OPENMP  // paired with the "declare target" guard above
 #pragma omp end declare target
 #endif
 
@@ -114,9 +98,9 @@ class ValueLoc
   T val = doing_min ? operators::limits<T>::max() : operators::limits<T>::min();
   Index_type loc = -1;
 
-  constexpr ValueLoc() = default;
-  constexpr ValueLoc(ValueLoc const &) = default;
-  ValueLoc &operator=(ValueLoc const &) = default;
+  RAJA_HOST_DEVICE constexpr ValueLoc() = default;
+  RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const &) = default;
+  RAJA_HOST_DEVICE ValueLoc &operator=(ValueLoc const &) = default;
   RAJA_HOST_DEVICE constexpr ValueLoc(T const &val) : val{val}, loc{-1} {}
   RAJA_HOST_DEVICE constexpr ValueLoc(T const &val, Index_type const &loc)
       : val{val}, loc{loc}
diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp
index b8884b3baa..43ec39fd05 100644
--- a/include/RAJA/pattern/forall.hpp
+++ b/include/RAJA/pattern/forall.hpp
@@ -42,11 +42,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forall_generic_HPP
-#define RAJA_forall_generic_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -56,37 +53,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forall_generic_HPP
+#define RAJA_forall_generic_HPP
+
 #include <functional>
 #include <iterator>
 #include <type_traits>
@@ -105,7 +78,9 @@
 #include "RAJA/util/concepts.hpp"
 #include "RAJA/util/types.hpp"
 
-#include "RAJA/policy/fwd.hpp"
+#include "RAJA/policy/sequential/forall.hpp"
+
+#include "RAJA/pattern/detail/forall.hpp"
 
 #if defined(RAJA_ENABLE_CHAI)
 #include "RAJA/util/chai_support.hpp"
@@ -115,6 +90,7 @@
 
 #endif
 
+
 namespace RAJA
 {
 
@@ -126,8 +102,71 @@ namespace RAJA
 //////////////////////////////////////////////////////////////////////
 //
 
-namespace impl
+namespace internal
+{
+
+template <typename T>  // T: type of the object being copied
+struct Privatizer {  // holds one by-value copy of a T and hands out a reference to it
+  using value_type = camp::decay<T>;  // stored type (presumably std::decay-like; confirm in camp)
+  using reference_type = value_type&;  // what get_priv() returns
+  value_type priv;  // the private copy
+  RAJA_HOST_DEVICE Privatizer(const T& o) : priv{o} {}  // copies o into priv
+  RAJA_HOST_DEVICE reference_type get_priv() { return priv; }  // mutable access to the copy
+};
+
+template <typename T>  // T is deduced from a forwarding reference
+auto trigger_updates_before(T&& item) -> typename std::remove_reference<T>::type
+{
+  return item;  // returned by value; NOTE(review): presumably making this copy is the point - confirm
+}
+
+/**
+ * @brief Create a private copy of the argument to be stored on the current
+ * thread's stack in a class of the Privatizer concept
+ *
+ * @param item data to privatize (taken by const reference and copied)
+ *
+ * @return Privatizer<T> holding the copy; reach it through get_priv()
+ *
+ * This function is meant to be found through ADL so that its functionality
+ * can be extended.  Anywhere it is called it should be invoked by:
+ *
+ * `using RAJA::internal::thread_privatize; thread_privatize(item)`
+ *
+ * This allows other namespaces to add new overloads to support functionality
+ * that does not belong here.
+ *
+ */
+template <typename T>
+RAJA_HOST_DEVICE auto thread_privatize(const T& item) -> Privatizer<T>
+{
+  return Privatizer<T>{item};  // default case: wrap a plain copy of item
+}
+
+}  // end namespace internal
+
+namespace detail
 {
+/// Adapter for the icount variants: called with position i, invokes body(i + icount, range[i])
+template <typename Range, typename Body, typename IndexT>
+struct icount_adapter {
+  using index_type = typename std::decay<IndexT>::type;  // type the offset index is cast to
+  typename std::decay<Body>::type body;  // by-value copy of the loop body
+  using container_type = typename std::decay<Range>::type;
+  typename container_type::iterator begin_it;  // iterator to the first element of r
+  Index_type icount;  // offset added to every position
+  icount_adapter(Range const& r, Body const& b, IndexT icount_)
+      : body{b}, icount{icount_}
+  {
+    using std::begin;  // lets unqualified begin() fall back to std::begin
+    begin_it = begin(r);  // assigned here rather than in the initializer list
+  }
+  template <typename T>
+  RAJA_HOST_DEVICE void operator()(T const& i) const
+  {
+    body(static_cast<index_type>(i + icount), begin_it[i]);  // (offset index, i-th element)
+  }
+};
 
 struct CallForall {
   template <typename T, typename ExecPol, typename Body>
@@ -142,49 +181,8 @@ struct CallForallIcount {
 
   const int start;
 };
-
-template <typename SegmentIterPolicy,
-          typename SegmentExecPolicy,
-          typename LoopBody,
-          typename... SegmentTypes>
-RAJA_INLINE void forall(ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
-                        const StaticIndexSet<SegmentTypes...>& iset,
-                        LoopBody loop_body)
-{
-  impl::forall(SegmentIterPolicy(), iset, [=](int segID) {
-    iset.segmentCall(segID, CallForall{}, SegmentExecPolicy(), loop_body);
-  });
-}
-
-
-/*!
-******************************************************************************
-*
-* \brief Execute segments from forall_Icount traversal method.
-*
-*         For usage example, see reducers.hxx.
-*
-******************************************************************************
-*/
-template <typename SegmentIterPolicy,
-          typename SegmentExecPolicy,
-          typename... SegmentTypes,
-          typename LoopBody>
-RAJA_INLINE void forall_Icount(ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
-                               const StaticIndexSet<SegmentTypes...>& iset,
-                               LoopBody loop_body)
-{
-  // no need for icount variant here
-  impl::forall(SegmentIterPolicy(), iset, [=](int segID) {
-    iset.segmentCall(segID,
-                     CallForallIcount(iset.getStartingIcount(segID)),
-                     SegmentExecPolicy(),
-                     loop_body);
-  });
 }
 
-}  // end namespace impl
-
 /*!
  ******************************************************************************
  *
@@ -195,6 +193,7 @@ RAJA_INLINE void forall_Icount(ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
 namespace wrap
 {
 
+
 /*!
  ******************************************************************************
  *
@@ -203,11 +202,10 @@ namespace wrap
  ******************************************************************************
  */
 template <typename ExecutionPolicy, typename Container, typename LoopBody>
-RAJA_INLINE concepts::
-    enable_if<concepts::
-                  negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
-              type_traits::is_range<Container>>
-    forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
+RAJA_INLINE concepts::enable_if<
+    concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
+    type_traits::is_range<Container>>
+forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
 #if defined(RAJA_ENABLE_CHAI)
   chai::ArrayManager* rm = chai::ArrayManager::getInstance();
@@ -215,10 +213,11 @@ RAJA_INLINE concepts::
   rm->setExecutionSpace(detail::get_space<EP>::value);
 #endif
 
-  typename std::remove_reference<LoopBody>::type body = loop_body;
-  impl::forall(std::forward<ExecutionPolicy>(p),
-               std::forward<Container>(c),
-               body);
+  using RAJA::internal::trigger_updates_before;
+  auto body = trigger_updates_before(loop_body);
+  forall_impl(std::forward<ExecutionPolicy>(p),
+              std::forward<Container>(c),
+              body);
 
 #if defined(RAJA_ENABLE_CHAI)
   rm->setExecutionSpace(chai::NONE);
@@ -248,28 +247,39 @@ RAJA_INLINE void forall_Icount(ExecutionPolicy&& p,
   rm->setExecutionSpace(detail::get_space<EP>::value);
 #endif
 
-  typename std::remove_reference<LoopBody>::type body = loop_body;
-  impl::forall_Icount(std::forward<ExecutionPolicy>(p),
-                      std::forward<Container>(c),
-                      std::forward<IndexType>(icount),
-                      body);
+  using RAJA::internal::trigger_updates_before;
+  auto body = trigger_updates_before(loop_body);
+  using std::begin;
+  using std::end;
+  using std::distance;
+  auto range = RangeSegment(0, distance(begin(c), end(c)));
+  detail::icount_adapter<Container, LoopBody, IndexType> adapted(c,
+                                                                 body,
+                                                                 icount);
+  using policy::sequential::forall_impl;
+  forall_impl(std::forward<ExecutionPolicy>(p), range, adapted);
 
 #if defined(RAJA_ENABLE_CHAI)
   rm->setExecutionSpace(chai::NONE);
 #endif
 }
 
-namespace indexset
-{
 /*!
- ******************************************************************************
- *
- * \brief Generic dispatch over IndexSets
- *
- ******************************************************************************
- */
-template <typename ExecutionPolicy, typename IdxSet, typename LoopBody>
-RAJA_INLINE void forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
+******************************************************************************
+*
+* \brief Execute segments from forall_Icount traversal method.
+*
+*         For usage example, see reducers.hxx.
+*
+******************************************************************************
+*/
+template <typename SegmentIterPolicy,
+          typename SegmentExecPolicy,
+          typename... SegmentTypes,
+          typename LoopBody>
+RAJA_INLINE void forall_Icount(ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
+                               const StaticIndexSet<SegmentTypes...>& iset,
+                               LoopBody loop_body)
 {
 
 #if defined(RAJA_ENABLE_CHAI)
@@ -278,45 +288,50 @@ RAJA_INLINE void forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
   rm->setExecutionSpace(detail::get_space<EP>::value);
 #endif
 
-  typename std::remove_reference<LoopBody>::type body = loop_body;
-  impl::forall(std::forward<ExecutionPolicy>(p), std::forward<IdxSet>(c), body);
+  using RAJA::internal::trigger_updates_before;
+  auto body = trigger_updates_before(loop_body);
+
+  // no need for icount variant here
+  wrap::forall(SegmentIterPolicy(), iset, [=](int segID) {
+    iset.segmentCall(segID,
+                     detail::CallForallIcount(iset.getStartingIcount(segID)),
+                     SegmentExecPolicy(),
+                     body);
+  });
 
 #if defined(RAJA_ENABLE_CHAI)
   rm->setExecutionSpace(chai::NONE);
 #endif
 }
 
-/*!
- ******************************************************************************
- *
- * \brief Generic dispatch over IndexSets with Icount
- *
- ******************************************************************************
- */
-template <typename ExecutionPolicy, typename IdxSet, typename LoopBody>
-RAJA_INLINE void forall_Icount(ExecutionPolicy&& p,
-                               IdxSet&& c,
-                               LoopBody loop_body)
+template <typename SegmentIterPolicy,  // policy used to iterate over the segments
+          typename SegmentExecPolicy,  // policy passed to each segmentCall
+          typename LoopBody,
+          typename... SegmentTypes>
+RAJA_INLINE void forall(ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
+                        const StaticIndexSet<SegmentTypes...>& iset,
+                        LoopBody loop_body)
 {
-
 #if defined(RAJA_ENABLE_CHAI)
   chai::ArrayManager* rm = chai::ArrayManager::getInstance();
-  using EP = typename std::decay<ExecutionPolicy>::type;
+  using EP = ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>;  // this overload has no ExecutionPolicy parameter
   rm->setExecutionSpace(detail::get_space<EP>::value);
 #endif
 
-  typename std::remove_reference<LoopBody>::type body = loop_body;
-  impl::forall_Icount(std::forward<ExecutionPolicy>(p),
-                      std::forward<IdxSet>(c),
-                      body);
+  using RAJA::internal::trigger_updates_before;
+  auto body = trigger_updates_before(loop_body);  // by-value copy, captured by the lambda below
 
+  wrap::forall(SegmentIterPolicy(), iset, [=](int segID) {  // one call per segment id
+    iset.segmentCall(segID,
+                     detail::CallForall{},
+                     SegmentExecPolicy(),
+                     body);
+  });
 #if defined(RAJA_ENABLE_CHAI)
   rm->setExecutionSpace(chai::NONE);
 #endif
 }
 
-}  // end namespace indexset
-
 }  // end namespace wrap
 
 /*!
@@ -334,9 +349,9 @@ RAJA_INLINE void forall_Icount(ExecutionPolicy&& p,
   static_assert(type_traits::is_index_set<IdxSet>::value,
                 "Expected an IndexSet but did not get one. Are you using an "
                 "IndexSet policy by mistake?");
-  wrap::indexset::forall_Icount(std::forward<ExecutionPolicy>(p),
-                                std::forward<IdxSet>(c),
-                                std::forward<LoopBody>(loop_body));
+  wrap::forall_Icount(std::forward<ExecutionPolicy>(p),
+                      std::forward<IdxSet>(c),
+                      std::forward<LoopBody>(loop_body));
 }
 
 /*!
@@ -347,16 +362,16 @@ RAJA_INLINE void forall_Icount(ExecutionPolicy&& p,
  ******************************************************************************
  */
 template <typename ExecutionPolicy, typename IdxSet, typename LoopBody>
-RAJA_INLINE concepts::
-    enable_if<type_traits::is_indexset_policy<ExecutionPolicy>>
-    forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
+RAJA_INLINE concepts::enable_if<
+    type_traits::is_indexset_policy<ExecutionPolicy>>  // constrained on is_indexset_policy
+forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
 {
   static_assert(type_traits::is_index_set<IdxSet>::value,
                 "Expected an IndexSet but did not get one. Are you using an "
                 "IndexSet policy by mistake?");
-  wrap::indexset::forall(std::forward<ExecutionPolicy>(p),
-                         std::forward<IdxSet>(c),
-                         std::forward<LoopBody>(loop_body));
+  wrap::forall(std::forward<ExecutionPolicy>(p),  // hand off to the wrap layer, forwarding all arguments
+               std::forward<IdxSet>(c),
+               std::forward<LoopBody>(loop_body));
 }
 
 /*!
@@ -394,11 +409,10 @@ forall_Icount(ExecutionPolicy&& p,
  ******************************************************************************
  */
 template <typename ExecutionPolicy, typename Container, typename LoopBody>
-RAJA_INLINE concepts::
-    enable_if<concepts::
-                  negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
-              type_traits::is_range<Container>>
-    forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
+RAJA_INLINE concepts::enable_if<
+    concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
+    type_traits::is_range<Container>>
+forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
   static_assert(type_traits::is_random_access_range<Container>::value,
                 "Container does not model RandomAccessIterator");
@@ -426,15 +440,15 @@ template <typename ExecutionPolicy,
           typename Iterator,
           typename IndexType,
           typename LoopBody>
-RAJA_INLINE concepts::
-    enable_if<type_traits::is_integral<IndexType>,
-              type_traits::is_iterator<Iterator>,
-              concepts::negate<type_traits::is_integral<Iterator>>>
-    forall_Icount(ExecutionPolicy&& p,
-                  Iterator begin,
-                  Iterator end,
-                  const IndexType icount,
-                  LoopBody&& loop_body)
+RAJA_INLINE concepts::enable_if<
+    type_traits::is_integral<IndexType>,
+    type_traits::is_iterator<Iterator>,
+    concepts::negate<type_traits::is_integral<Iterator>>>
+forall_Icount(ExecutionPolicy&& p,
+              Iterator begin,
+              Iterator end,
+              const IndexType icount,
+              LoopBody&& loop_body)
 {
   static_assert(type_traits::is_random_access_iterator<Iterator>::value,
                 "Iterator pair does not meet requirement of "
@@ -442,7 +456,7 @@ RAJA_INLINE concepts::
 
   auto len = std::distance(begin, end);
   using SpanType = impl::Span<Iterator, decltype(len)>;
-  impl::forall_Icount(std::forward<ExecutionPolicy>(p),
+  wrap::forall_Icount(std::forward<ExecutionPolicy>(p),
                       SpanType{begin, len},
                       icount,
                       std::forward<LoopBody>(loop_body));
@@ -456,13 +470,10 @@ RAJA_INLINE concepts::
  ******************************************************************************
  */
 template <typename ExecutionPolicy, typename Iterator, typename LoopBody>
-RAJA_INLINE concepts::
-    enable_if<type_traits::is_iterator<Iterator>,
-              concepts::negate<type_traits::is_integral<Iterator>>>
-    forall(ExecutionPolicy&& p,
-           Iterator begin,
-           Iterator end,
-           LoopBody&& loop_body)
+RAJA_INLINE concepts::enable_if<
+    type_traits::is_iterator<Iterator>,
+    concepts::negate<type_traits::is_integral<Iterator>>>
+forall(ExecutionPolicy&& p, Iterator begin, Iterator end, LoopBody&& loop_body)
 {
   static_assert(type_traits::is_random_access_iterator<Iterator>::value,
                 "Iterator pair does not meet requirement of "
@@ -644,13 +655,13 @@ template <typename ExecutionPolicy,
           typename ArrayIdxType,
           typename IndexType,
           typename LoopBody>
-RAJA_INLINE concepts::
-    enable_if<type_traits::is_integral<IndexType>,
-              concepts::negate<type_traits::is_iterator<IndexType>>>
-    forall(ExecutionPolicy&& p,
-           const ArrayIdxType* idx,
-           const IndexType len,
-           LoopBody&& loop_body)
+RAJA_INLINE concepts::enable_if<
+    type_traits::is_integral<IndexType>,
+    concepts::negate<type_traits::is_iterator<IndexType>>>
+forall(ExecutionPolicy&& p,
+       const ArrayIdxType* idx,
+       const IndexType len,
+       LoopBody&& loop_body)
 {
   wrap::forall(std::forward<ExecutionPolicy>(p),
                TypedListSegment<ArrayIdxType>(idx, len, Unowned),
@@ -671,18 +682,18 @@ template <typename ExecutionPolicy,
           typename IndexType,
           typename OffsetType,
           typename LoopBody>
-RAJA_INLINE concepts::
-    enable_if<type_traits::is_integral<IndexType>,
-              concepts::negate<type_traits::is_iterator<IndexType>>,
-              type_traits::is_integral<OffsetType>,
-              concepts::negate<type_traits::is_iterator<OffsetType>>,
-              type_traits::is_integral<ArrayIdxType>,
-              concepts::negate<type_traits::is_iterator<ArrayIdxType>>>
-    forall_Icount(ExecutionPolicy&& p,
-                  const ArrayIdxType* idx,
-                  const IndexType len,
-                  const OffsetType icount,
-                  LoopBody&& loop_body)
+RAJA_INLINE concepts::enable_if<
+    type_traits::is_integral<IndexType>,
+    concepts::negate<type_traits::is_iterator<IndexType>>,
+    type_traits::is_integral<OffsetType>,
+    concepts::negate<type_traits::is_iterator<OffsetType>>,
+    type_traits::is_integral<ArrayIdxType>,
+    concepts::negate<type_traits::is_iterator<ArrayIdxType>>>
+forall_Icount(ExecutionPolicy&& p,
+              const ArrayIdxType* idx,
+              const IndexType len,
+              const OffsetType icount,
+              LoopBody&& loop_body)
 {
   // turn into an iterator
   forall_Icount(std::forward<ExecutionPolicy>(p),
@@ -714,7 +725,7 @@ RAJA_INLINE void forall_Icount(Args&&... args)
   forall_Icount(ExecutionPolicy(), std::forward<Args>(args)...);
 }
 
-namespace impl
+namespace detail
 {
 
 template <typename T, typename ExecutionPolicy, typename LoopBody>
@@ -722,7 +733,9 @@ RAJA_INLINE void CallForall::operator()(T const& segment,
                                         ExecutionPolicy,
                                         LoopBody body) const
 {
-  forall(ExecutionPolicy(), segment, body);
+  // this is only called inside a region, use impl
+  using policy::sequential::forall_impl;
+  forall_impl(ExecutionPolicy(), segment, body);
 }
 
 constexpr CallForallIcount::CallForallIcount(int s) : start(s) {}
@@ -732,7 +745,8 @@ RAJA_INLINE void CallForallIcount::operator()(T const& segment,
                                               ExecutionPolicy,
                                               LoopBody body) const
 {
-  forall_Icount(ExecutionPolicy(), segment, start, body);
+  // go through wrap to unwrap icount
+  wrap::forall_Icount(ExecutionPolicy(), segment, start, body);
 }
 
-}  // closing brace for impl namespace
+}  // closing brace for detail namespace
diff --git a/include/RAJA/pattern/forallN.hpp b/include/RAJA/pattern/forallN.hpp
index f6f85b9c34..ad337d1098 100644
--- a/include/RAJA/pattern/forallN.hpp
+++ b/include/RAJA/pattern/forallN.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forallN_generic_HPP
-#define RAJA_forallN_generic_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,45 +19,23 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forallN_generic_HPP
+#define RAJA_forallN_generic_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/internal/ForallNPolicy.hpp"
 #include "RAJA/internal/LegacyCompatibility.hpp"
 #include "RAJA/util/defines.hpp"
+#include "RAJA/util/Operators.hpp"
 
 #include "RAJA/policy/PolicyBase.hpp"
+#include "RAJA/policy/sequential/forall.hpp"
 
-#ifdef RAJA_ENABLE_CUDA
+#if defined(RAJA_ENABLE_CUDA)
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #endif
 
@@ -103,7 +78,7 @@ struct ForallN_Executor<maybe_cuda, POLICY_INIT, POLICY_REST...> {
   RAJA_INLINE void operator()(BODY const &body) const
   {
     ForallN_PeelOuter<build_device, NextExec, BODY> outer(next_exec, body);
-    RAJA::impl::forall(POLICY_I(), static_cast<TYPE_I>(is_i), outer);
+    wrap::forall(POLICY_I(), static_cast<TYPE_I>(is_i), outer);
   }
 };
 
@@ -233,12 +208,12 @@ struct type_repeater {
 
 template <typename POLICY,
           typename... Indices,
-          size_t... Range,
-          size_t... Unspecified,
+          camp::idx_t... Range,
+          camp::idx_t... Unspecified,
           typename BODY,
           typename... Ts>
-RAJA_INLINE void forallN_impl(VarOps::index_sequence<Range...>,
-                              VarOps::index_sequence<Unspecified...>,
+RAJA_INLINE void forallN_impl(camp::idx_seq<Range...>,
+                              camp::idx_seq<Unspecified...>,
                               BODY &&body,
                               const Ts &... args)
 {
@@ -255,28 +230,23 @@ RAJA_INLINE void forallN_impl(VarOps::index_sequence<Range...>,
 
 template <typename POLICY,
           typename... Indices,
-          size_t... I0s,
-          size_t... I1s,
+          camp::idx_t... I0s,  // positions in args that are passed on first
+          camp::idx_t... I1s,  // positions in args that are passed on after them
           typename... Ts>
-RAJA_INLINE void fun_unpacker(VarOps::index_sequence<I0s...>,
-                              VarOps::index_sequence<I1s...>,
+RAJA_INLINE void fun_unpacker(camp::idx_seq<I0s...>,  // tag arguments: only their types are used
+                              camp::idx_seq<I1s...>,
                               Ts &&... args)
 {
   forallN_impl<POLICY, Indices...>(
-      VarOps::make_index_sequence<sizeof...(args) - 1>(),
-      VarOps::make_index_sequence<sizeof...(args) - 1 - sizeof...(Indices)>(),
-      VarOps::get_arg_at<I0s>::value(VarOps::forward<Ts>(args)...)...,
-      VarOps::get_arg_at<I1s>::value(VarOps::forward<Ts>(args)...)...);
+      camp::make_idx_seq_t<sizeof...(args) - 1>(),  // sized to all arguments but one
+      camp::make_idx_seq_t<sizeof...(args) - 1 - sizeof...(Indices)>(),  // minus the Indices count
+      VarOps::get_arg_at<I0s>::value(camp::forward<Ts>(args)...)...,  // arguments selected by I0s
+      VarOps::get_arg_at<I1s>::value(camp::forward<Ts>(args)...)...);  // then those selected by I1s
 }
 
 template <typename POLICY, typename... Indices, typename... Ts>
 RAJA_INLINE void forallN(Ts &&... args)
 {
-#ifdef RAJA_ENABLE_CUDA
-  // this call should be moved into a cuda file
-  // but must be made before loop_body is copied
-  beforeCudaKernelLaunch();
-#endif
 
 #if defined(RAJA_ENABLE_CHAI)
   chai::ArrayManager *rm = chai::ArrayManager::getInstance();
@@ -285,17 +255,13 @@ RAJA_INLINE void forallN(Ts &&... args)
 #endif
 
   fun_unpacker<POLICY, Indices...>(
-      VarOps::index_sequence<sizeof...(args) - 1>{},
-      VarOps::make_index_sequence<sizeof...(args) - 1>{},
-      VarOps::forward<Ts>(args)...);
+      camp::idx_seq<sizeof...(args) - 1>{},
+      camp::make_idx_seq_t<sizeof...(args) - 1>{},
+      camp::forward<Ts>(args)...);
 
 #if defined(RAJA_ENABLE_CHAI)
   rm->setExecutionSpace(chai::NONE);
 #endif
-
-#ifdef RAJA_ENABLE_CUDA
-  afterCudaKernelLaunch();
-#endif
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/pattern/nested.hpp b/include/RAJA/pattern/nested.hpp
new file mode 100644
index 0000000000..173b9c2e5e
--- /dev/null
+++ b/include/RAJA/pattern/nested.hpp
@@ -0,0 +1,240 @@
+#ifndef RAJA_pattern_nested_HPP
+#define RAJA_pattern_nested_HPP
+
+
+#include "RAJA/config.hpp"
+#include "RAJA/util/defines.hpp"
+#include "RAJA/util/types.hpp"
+#include "RAJA/policy/cuda.hpp"
+
+#include "RAJA/pattern/nested/internal.hpp"
+
+#include "camp/camp.hpp"
+#include "camp/concepts.hpp"
+#include "camp/tuple.hpp"
+
+#include <iostream>
+#include <type_traits>
+
+namespace RAJA
+{
+namespace nested
+{
+
+
+template <camp::idx_t ArgumentId, typename Pol = camp::nil, typename... Rest>
+struct For : public internal::ForList,  // loop-level policy: ties argument slot ArgumentId to policy Pol
+             public internal::ForTraitBase<ArgumentId, Pol> {
+  using as_for_list = camp::list<For>;  // a plain For contributes exactly itself to the for-list
+  // TODO: add static_assert for valid policy in Pol
+  const Pol pol;  // policy instance held by value; Executor reads it as fp.pol
+  For() : pol{} {}  // value-initialise the policy
+  For(const Pol &p) : pol{p} {}  // copy a caller-supplied policy
+};
+
+template <camp::idx_t ArgumentId,  // For<> variant that also names the index type of its loop level
+          typename Pol,
+          typename IndexType,
+          typename... Rest>
+struct TypedFor : public internal::TypedForBase,
+                  public For<ArgumentId, Pol, Rest...> {
+  using Base = For<ArgumentId, Pol, Rest...>;
+  using Self = TypedFor<ArgumentId, Pol, IndexType, Rest...>;
+  using index_type = IndexType;  // hides the camp::nil default declared in ForTraitBase
+  using as_for_list = camp::list<Self>;  // list the typed policy itself, not its For base
+  // TODO: add static_assert for valid policy in Pol
+  using Base::Base;  // inherit For's constructors
+};
+
+template <typename... Policies>  // a nested policy is simply a camp::tuple of the given policy types
+using Policy = camp::tuple<Policies...>;
+
+template <typename PolicyTuple, typename SegmentTuple, typename Fn>
+struct LoopData {  // state shared by all levels of one nested loop: policies, segments, body, indices
+  constexpr static size_t n_policies = camp::tuple_size<PolicyTuple>::value;
+  const PolicyTuple &pt;  // held by reference: the policy tuple must outlive this object
+  SegmentTuple st;  // own copy of the segments (the Tile executor/wrapper overwrite entries)
+  const typename std::remove_reference<Fn>::type f;  // copy of the loop body
+  using index_tuple_t = internal::index_tuple_from_policies_and_segments<
+      typename PolicyTuple::TList,
+      typename SegmentTuple::TList>;
+  index_tuple_t index_tuple;  // current indices; the innermost Wrapper invokes f with this tuple
+  LoopData(PolicyTuple const &p, SegmentTuple const &s, Fn const &fn)
+      : pt(p), st(s), f(fn)
+  {
+  }
+  template <camp::idx_t Idx, typename IndexT>
+  RAJA_HOST_DEVICE void assign_index(IndexT const &i)  // store i as the Idx-th current index
+  {
+    camp::get<Idx>(index_tuple) =
+        camp::tuple_element_t<Idx, decltype(index_tuple)>{i};  // brace-construct the element's declared type from i
+  }
+};
+
+
+template <typename Policy>  // forward declaration; the primary template is defined later in this header
+struct Executor;
+
+template <camp::idx_t Index, typename BaseWrapper>
+struct GenericWrapper {  // common base of the per-level wrappers: owns a copy of the wrapped wrapper
+  using data_type = camp::decay<typename BaseWrapper::data_type>;
+  GenericWrapper(BaseWrapper const &w) : bw{w} {}  // wrap (copy) an existing wrapper
+  GenericWrapper(data_type &d) : bw{d} {}  // build the wrapped wrapper directly from loop data
+  BaseWrapper bw;  // the wrapped callable, invoked by the derived operator()
+};
+
+template <camp::idx_t Index, typename BaseWrapper>
+struct ForWrapper : GenericWrapper<Index, BaseWrapper> {  // loop body passed to forall_impl for one For level
+  using Base = GenericWrapper<Index, BaseWrapper>;
+  using Base::Base;  // inherit both GenericWrapper constructors
+  template <typename InIndexType>
+  void operator()(InIndexType i)
+  {
+    Base::bw.data.template assign_index<Index>(i);  // record this level's current index
+    Base::bw();  // then run the wrapped wrapper
+  }
+};
+
+template <typename T>
+struct NestedPrivatizer {  // holds a private copy of the loop data plus a wrapper rebuilt on that copy
+  using data_type = typename T::data_type;
+  using value_type = camp::decay<T>;
+  using reference_type = value_type &;
+  data_type data;  // own copy of the loop data; declared before priv so it is initialised first
+  value_type priv;  // wrapper constructed from the private data member above
+  NestedPrivatizer(const T &o) : data{o.bw.data}, priv{value_type{data}} {}
+  reference_type get_priv() { return priv; }  // access the private wrapper
+};
+
+
+/**
+ * @brief nested counterpart of internal::thread_privatize: overload for ForWrapper returning a NestedPrivatizer built from item
+ */
+template <camp::idx_t Index, typename BW>
+auto thread_privatize(const nested::ForWrapper<Index, BW> &item)
+    -> NestedPrivatizer<nested::ForWrapper<Index, BW>>
+{
+  return NestedPrivatizer<nested::ForWrapper<Index, BW>>{item};
+}
+
+template <typename ForType>
+struct Executor {  // default executor: runs one For level by dispatching to forall_impl
+  static_assert(std::is_base_of<internal::ForBase, ForType>::value,
+                "Only For-based policies should get here");
+  template <typename WrappedBody>
+  void operator()(ForType const &fp, WrappedBody const &wrap)
+  {
+    using ::RAJA::policy::sequential::forall_impl;  // NOTE(review): presumably other back ends are found via ADL on fp.pol — confirm
+    forall_impl(fp.pol,
+                 camp::get<ForType::index_val>(wrap.data.st),  // segment belonging to this level
+                 ForWrapper<ForType::index_val, WrappedBody>{wrap});  // body: set index, then run wrap
+  }
+};
+
+
+
+template <typename ExecPolicy, typename... Fors>
+struct Collapse : public internal::ForList, public internal::CollapseBase {  // groups several For levels under one ExecPolicy
+  using as_for_list = camp::list<Fors...>;  // contributes every grouped For policy to the for-list
+  const ExecPolicy pol;  // execution policy instance held by value
+  Collapse() : pol{} {}
+  Collapse(ExecPolicy const &ep) : pol{ep} {}
+};
+
+
+// Executor for a two-level Collapse under seq_exec: both loops are run inline.
+// This is for demonstration only... can be removed eventually
+//
+template <typename FT0, typename FT1>
+struct Executor<Collapse<seq_exec, FT0, FT1>> {
+  static_assert(std::is_base_of<internal::ForBase, FT0>::value,
+                "Only For-based policies should get here");
+  static_assert(std::is_base_of<internal::ForBase, FT1>::value,
+                "Only For-based policies should get here");
+  template <typename WrappedBody>
+  void operator()(Collapse<seq_exec, FT0, FT1> const &, WrappedBody const &wrap)
+  {
+    auto b0 = std::begin(camp::get<FT0::index_val>(wrap.data.st));  // outer segment range
+    auto b1 = std::begin(camp::get<FT1::index_val>(wrap.data.st));  // inner segment range
+
+    auto e0 = std::end(camp::get<FT0::index_val>(wrap.data.st));
+    auto e1 = std::end(camp::get<FT1::index_val>(wrap.data.st));
+
+    // Skip a level
+    for (auto i0 = b0; i0 < e0; ++i0) {
+      wrap.data.template assign_index<FT0::index_val>(*i0);  // publish the outer index
+      for (auto i1 = b1; i1 < e1; ++i1) {
+        wrap.data.template assign_index<FT1::index_val>(*i1);  // publish the inner index
+        wrap();  // run the wrapped wrapper once per (i0, i1) pair
+      }
+    }
+  }
+};
+
+
+
+template <int idx, int n_policies, typename Data, bool Own = false>  // Own is not referenced in this template
+struct Wrapper {  // callable for policy entry idx: executes it with the next level's wrapper as body
+  using Next = Wrapper<idx + 1, n_policies, Data>;
+  using data_type = typename std::remove_reference<Data>::type;
+  Data &data;  // shared loop state, referenced rather than owned
+  explicit Wrapper(Data &d) : data{d} {}
+  void operator()() const
+  {
+    auto const &pol = camp::get<idx>(data.pt);  // policy entry for this level
+    Executor<internal::remove_all_t<decltype(pol)>> e{};  // strip const& so the Executor for the bare policy type is chosen
+    Next next_wrapper{data};
+    e(pol, next_wrapper);
+  }
+};
+
+// Innermost level (idx == n_policies): the recursion ends here and the body is executed
+template <int n_policies, typename Data, bool Own>
+struct Wrapper<n_policies, n_policies, Data, Own> {
+  using data_type = typename std::remove_reference<Data>::type;
+  Data &data;  // shared loop state, referenced rather than owned
+  explicit Wrapper(Data &d) : data{d} {}
+  void operator()() const { camp::invoke(data.index_tuple, data.f); }  // invoke f with the current index tuple
+};
+
+template <typename Data>  // build the outermost Wrapper (level 0 of Data::n_policies) referring to d
+auto make_base_wrapper(Data &d) -> Wrapper<0, Data::n_policies, Data>
+{
+  return Wrapper<0, Data::n_policies, Data>(d);
+}
+
+template <typename Pol, typename SegmentTuple, typename Body>
+RAJA_INLINE void forall(const Pol &p, const SegmentTuple &st, const Body &b)  // run body b over segments st as directed by nested policy p
+{
+#if defined(RAJA_ENABLE_CHAI)
+  chai::ArrayManager *rm = chai::ArrayManager::getInstance();
+  using EP = typename std::decay<Pol>::type;  // was decay<POLICY>, a name not declared in this template
+  rm->setExecutionSpace(detail::get_space<EP>::value);
+#endif
+  using fors = internal::get_for_policies<typename Pol::TList>;  // every For policy reachable from p
+  // TODO: ensure no duplicate indices in For<>s
+  // TODO: ensure no gaps in For<>s
+  // TODO: test that all policy members model the Executor policy concept
+  // TODO: add a static_assert for functors which cannot be invoked with
+  //       index_tuple
+  static_assert(camp::tuple_size<SegmentTuple>::value
+                    == camp::size<fors>::value,
+                "policy and segment index counts do not match");
+  auto data = LoopData<Pol, SegmentTuple, Body>{p, st, b};  // copies st and b, references p
+  auto ld = make_base_wrapper(data);  // level-0 wrapper over data
+  // std::cout << typeid(ld).name() << std::endl
+  //           << typeid(data.index_tuple).name() << std::endl;
+  ld();  // execute the whole loop nest
+
+#if defined(RAJA_ENABLE_CHAI)
+  rm->setExecutionSpace(chai::NONE);  // reset the execution space once the nest has run
+#endif
+}
+
+}  // end namespace nested
+}  // end namespace RAJA
+
+
+#include "RAJA/pattern/nested/tile.hpp"
+
+#endif /* RAJA_pattern_nested_HPP */
diff --git a/include/RAJA/pattern/nested/internal.hpp b/include/RAJA/pattern/nested/internal.hpp
new file mode 100644
index 0000000000..7982442d42
--- /dev/null
+++ b/include/RAJA/pattern/nested/internal.hpp
@@ -0,0 +1,118 @@
+#ifndef RAJA_pattern_nested_internal_HPP
+#define RAJA_pattern_nested_internal_HPP
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/config.hpp"
+#include "RAJA/util/defines.hpp"
+#include "RAJA/util/types.hpp"
+#include "RAJA/policy/cuda.hpp"
+
+#include "camp/camp.hpp"
+#include "camp/concepts.hpp"
+#include "camp/tuple.hpp"
+
+#include <type_traits>
+
+namespace RAJA
+{
+namespace nested
+{
+
+namespace internal
+{
+
+template <typename T>  // T with any reference removed, then top-level const/volatile removed
+using remove_all_t =
+    typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+
+// Empty tag bases; the type traits below classify policy types by inheritance from them
+struct ForList {  // tag tested by has_for_list
+};
+struct ForBase {  // tag tested by is_for_policy
+};
+struct TypedForBase : public ForBase {  // tag tested by is_typed_for_policy and evaluate_policy
+};
+struct CollapseBase {  // tag with no trait defined in this header
+};
+template <camp::idx_t ArgumentId, typename Pol>
+struct ForTraitBase : public ForBase {  // compile-time traits exposed by a For-style policy
+  constexpr static camp::idx_t index_val = ArgumentId;  // the argument slot as a value
+  using index = camp::num<ArgumentId>;  // the same slot as a camp::num type
+  using index_type = camp::nil;  // default to invalid type
+  using policy_type = Pol;
+  using type = ForTraitBase;  // make camp::value compatible
+};
+
+using is_for_policy = typename camp::bind_front<std::is_base_of, ForBase>::type;  // std::is_base_of with ForBase bound first
+using is_typed_for_policy =
+    typename camp::bind_front<std::is_base_of, TypedForBase>::type;  // std::is_base_of with TypedForBase bound first
+
+using has_for_list = typename camp::bind_front<std::is_base_of, ForList>::type;  // std::is_base_of with ForList bound first
+
+template <typename T>  // the as_for_list member type of T
+using get_for_list = typename T::as_for_list;
+
+template <typename Seq>  // keep the has_for_list entries of Seq, take each one's as_for_list, flatten the result
+using get_for_policies = typename camp::flatten<typename camp::transform<
+    get_for_list,
+    typename camp::filter_l<has_for_list, Seq>::type>::type>::type;
+
+template <typename T>  // NOTE(review): T is unused and no ::type is taken from bind_front, unlike is_for_policy — confirm intent
+using is_nil_type =
+    camp::bind_front<camp::concepts::metalib::is_same, camp::nil>;
+
+template <typename Index, typename ForPol>
+struct index_matches {  // predicate: is ForPol's argument slot equal to Index?
+  using type = camp::num<Index::value == ForPol::index::value>;  // camp::num of the comparison result
+};
+
+template <typename IndexTypes,  // accumulation step: append to Current the index type chosen for slot Index
+          typename ForPolicies,
+          typename Current,
+          typename Index>
+struct evaluate_policy {
+  using ForPolicy = typename camp::find_if_l<  // the entry of ForPolicies selected by index_matches for Index
+      typename camp::bind_front<index_matches, Index>::type,
+      ForPolicies>::type;
+  using type = typename camp::append<
+      Current,
+      camp::if_<typename std::is_base_of<TypedForBase, ForPolicy>::type,
+                typename ForPolicy::index_type,  // typed policy: use its declared index type
+                typename camp::at<IndexTypes,
+                                  typename ForPolicy::index>::type>>::type;  // otherwise: the IndexTypes entry at the policy's index
+};
+
+template <typename Policies, typename IndexTypes>  // accumulate evaluate_policy over the index sequence of IndexTypes, starting from an empty list
+using get_for_index_types = typename camp::accumulate_l<
+    typename camp::bind_front<evaluate_policy,
+                              IndexTypes,
+                              get_for_policies<Policies>>::type,
+    camp::list<>,
+    camp::as_list<camp::idx_seq_from_t<IndexTypes>>>::type;
+
+template <typename Iterator>
+struct iterable_value_type_getter {  // value type of an iterable, taken from its nested iterator::value_type
+  using type = typename Iterator::iterator::value_type;
+};
+template <>
+struct iterable_value_type_getter<IndexSet> {  // IndexSet is special-cased to Index_type
+  // TODO: when static indexset drops, specialize properly
+  using type = Index_type;
+};
+
+template <typename Segments>  // apply iterable_value_type_getter to every type in Segments
+using value_type_list_from_segments =
+    typename camp::transform<iterable_value_type_getter, Segments>::type;
+
+template <typename Policies, typename Segments>  // camp::tuple instantiated with the types from get_for_index_types
+using index_tuple_from_policies_and_segments = typename camp::apply_l<
+    camp::lambda<camp::tuple>,
+    get_for_index_types<Policies,
+                        value_type_list_from_segments<Segments>>>::type;
+
+}  // end namespace internal
+}  // end namespace nested
+}  // end namespace RAJA
+
+
+#endif /* RAJA_pattern_nested_internal_HPP */
diff --git a/include/RAJA/pattern/nested/tile.hpp b/include/RAJA/pattern/nested/tile.hpp
new file mode 100644
index 0000000000..374bee3d70
--- /dev/null
+++ b/include/RAJA/pattern/nested/tile.hpp
@@ -0,0 +1,176 @@
+#ifndef RAJA_pattern_nested_tile_HPP
+#define RAJA_pattern_nested_tile_HPP
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/config.hpp"
+#include "RAJA/util/defines.hpp"
+#include "RAJA/util/types.hpp"
+
+#include "camp/camp.hpp"
+#include "camp/concepts.hpp"
+#include "camp/tuple.hpp"
+
+#include <iostream>
+#include <type_traits>
+
+namespace RAJA
+{
+namespace nested
+{
+
+
+template <camp::idx_t Index, typename TilePolicy, typename ExecPolicy>
+struct Tile {  // policy entry: tile segment Index according to tpol, loop over the tiles with epol
+  const TilePolicy tpol;  // queried for get_chunk_size() by the Tile executor
+  const ExecPolicy epol;  // passed to forall_impl for the loop over tiles
+  Tile(TilePolicy const &tp = TilePolicy{}, ExecPolicy const &ep = ExecPolicy{})
+      : tpol{tp}, epol{ep}
+  {
+  }
+};
+
+///! tag for a tiling loop with a compile-time chunk size; named tile_s rather than tile_static to avoid the MSVC keyword
+template <camp::idx_t chunk_size_>
+struct tile_s {
+  static constexpr camp::idx_t chunk_size = chunk_size_;
+
+  tile_s() {}
+  constexpr camp::idx_t get_chunk_size() const { return chunk_size; }  // always the template argument
+};
+
+///! tag for a tiling loop with a run-time chunk size (default_chunk_size unless one is passed in)
+template <camp::idx_t default_chunk_size>
+struct tile {
+  camp::idx_t chunk_size;  // chunk size chosen at construction
+
+  tile(camp::idx_t chunk_size_ = default_chunk_size) : chunk_size{chunk_size_}
+  {
+  }
+  camp::idx_t get_chunk_size() const { return chunk_size; }
+};
+
+template <camp::idx_t Index, typename BaseWrapper>
+struct TileWrapper : GenericWrapper<Index, BaseWrapper> {  // loop body passed to forall_impl for one Tile level
+  using Base = GenericWrapper<Index, BaseWrapper>;
+  using Base::Base;  // inherit both GenericWrapper constructors
+  template <typename InSegmentType>
+  void operator()(InSegmentType s)
+  {
+    camp::get<Index>(Base::bw.data.st) = s;  // replace this level's segment with the current tile
+    Base::bw();  // then run the wrapped wrapper on that tile
+  }
+};
+
+/**
+ * @brief tile counterpart of internal::thread_privatize: overload for TileWrapper returning a NestedPrivatizer built from item
+ */
+template <camp::idx_t Index, typename BW>
+auto thread_privatize(const nested::TileWrapper<Index, BW> &item)
+    -> NestedPrivatizer<nested::TileWrapper<Index, BW>>
+{
+  return NestedPrivatizer<nested::TileWrapper<Index, BW>>{item};
+}
+
+template <typename Iterable>
+struct IterableTiler {  // presents an iterable as a sequence of tiles of block_size elements
+  using value_type = camp::decay<Iterable>;
+
+  class iterator
+  {
+    // NOTE: this must be held by value for NVCC support, *even on the host*
+    const IterableTiler itiler;
+    const Index_type block_id;  // index of the tile this iterator designates
+
+  public:
+    using value_type = camp::decay<Iterable>;
+    using difference_type = camp::idx_t;
+    using pointer = value_type *;
+    using reference = value_type &;
+    using iterator_category = std::random_access_iterator_tag;
+
+    constexpr iterator(IterableTiler const &itiler_, Index_type block_id_)
+        : itiler{itiler_}, block_id{block_id_}
+    {
+    }
+
+    value_type operator*()  // the tile itself, produced by value
+    {
+      auto start = block_id * itiler.block_size;
+      return itiler.it.slice(start, itiler.block_size);  // NOTE(review): assumes slice() clamps a short final tile — confirm
+    }
+
+    inline difference_type operator-(const iterator &rhs) const  // distance in tiles
+    {
+      return static_cast<difference_type>(block_id)
+             - static_cast<difference_type>(rhs.block_id);
+    }
+
+    inline iterator operator-(const difference_type &rhs) const  // step back rhs tiles (not clamped)
+    {
+      return iterator(itiler, block_id - rhs);
+    }
+
+    inline iterator operator+(const difference_type &rhs) const  // step forward, clamped to end()
+    {
+      return iterator(itiler,
+                      block_id + rhs >= itiler.num_blocks ? itiler.num_blocks
+                                                          : block_id + rhs);
+    }
+
+    inline value_type operator[](difference_type rhs) const
+    {
+      return *((*this) + rhs);
+    }
+
+    inline bool operator!=(const iterator &rhs) const  // takes an iterator: IterableTiler has no block_id
+    {
+      return block_id != rhs.block_id;
+    }
+
+    inline bool operator<(const iterator &rhs) const  // takes an iterator: IterableTiler has no block_id
+    {
+      return block_id < rhs.block_id;
+    }
+  };
+
+  IterableTiler(const Iterable &it_, camp::idx_t block_size_)
+      : it{it_}, block_size{block_size_}
+  {
+    using std::begin;
+    using std::end;
+    using std::distance;
+    dist = distance(begin(it), end(it));  // number of elements in the iterable
+    num_blocks = dist / block_size;  // whole tiles ...
+    if (dist % block_size) num_blocks += 1;  // ... plus one for a partial tail
+  }
+
+  iterator begin() { return iterator(*this, 0); }
+
+  iterator end() { return iterator(*this, num_blocks); }
+
+  value_type it;  // copy of the iterable being tiled
+  camp::idx_t block_size;  // requested elements per tile
+  camp::idx_t num_blocks;  // ceil(dist / block_size)
+  camp::idx_t dist;  // element count of it
+};
+
+template <typename TPol, typename EPol, camp::idx_t Index>
+struct Executor<Tile<Index, TPol, EPol>> {  // executor for a Tile entry: loops over tiles of segment Index
+  using TileType = Tile<Index, TPol, EPol>;
+  template <typename WrappedBody>
+  void operator()(TileType const &fp, WrappedBody const &wrap)
+  {
+    auto const &st = camp::get<Index>(wrap.data.st);  // the full segment for this level
+    IterableTiler<decltype(st)> tiled_iterable(st, fp.tpol.get_chunk_size());  // copies st and splits it into chunks
+    using ::RAJA::policy::sequential::forall_impl;
+    forall_impl(fp.epol, tiled_iterable, TileWrapper<Index, WrappedBody>{wrap});  // TileWrapper overwrites the segment per tile
+    // Set range back to original values
+    camp::get<Index>(wrap.data.st) = tiled_iterable.it;
+  }
+};
+
+
+}  // end namespace nested
+}  // end namespace RAJA
+
+#endif /* RAJA_pattern_nested_tile_HPP */
diff --git a/include/RAJA/pattern/permute.hpp b/include/RAJA/pattern/permute.hpp
index 49e0ffbb25..08d49b9183 100644
--- a/include/RAJA/pattern/permute.hpp
+++ b/include/RAJA/pattern/permute.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forallN_permute_HPP
-#define RAJA_forallN_permute_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,38 +19,15 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forallN_permute_HPP
+#define RAJA_forallN_permute_HPP
+
 #include "RAJA/config.hpp"
+#include "camp/camp.hpp"
 
 namespace RAJA
 {
@@ -81,9 +55,9 @@ struct Permute {
 template <typename Range, typename PERM, typename BODY>
 struct ForallN_Permute_Functor_impl;
 
-template <size_t... Range, size_t... PermInts, typename BODY>
-struct ForallN_Permute_Functor_impl<VarOps::index_sequence<Range...>,
-                                    VarOps::index_sequence<PermInts...>,
+template <camp::idx_t... Range, camp::idx_t... PermInts, typename BODY>
+struct ForallN_Permute_Functor_impl<camp::idx_seq<Range...>,
+                                    camp::idx_seq<PermInts...>,
                                     BODY> {
   RAJA_INLINE
   constexpr explicit ForallN_Permute_Functor_impl(BODY const &b) : body(b) {}
@@ -109,7 +83,7 @@ struct ForallN_Permute_Functor_impl<VarOps::index_sequence<Range...>,
 };
 template <typename PERM, typename BODY>
 using ForallN_Permute_Functor =
-    ForallN_Permute_Functor_impl<VarOps::make_index_sequence<PERM::size>,
+    ForallN_Permute_Functor_impl<camp::idx_seq_from_t<PERM>,
                                  PERM,
                                  BODY>;
 
diff --git a/include/RAJA/pattern/reduce.hpp b/include/RAJA/pattern/reduce.hpp
index 055ae08d0f..c2656171c1 100644
--- a/include/RAJA/pattern/reduce.hpp
+++ b/include/RAJA/pattern/reduce.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_reducers_HPP
-#define RAJA_reducers_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_reduce_HPP
+#define RAJA_reduce_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/defines.hpp"
diff --git a/include/RAJA/pattern/scan.hpp b/include/RAJA/pattern/scan.hpp
index e758b814ff..b51f58d8eb 100644
--- a/include/RAJA/pattern/scan.hpp
+++ b/include/RAJA/pattern/scan.hpp
@@ -8,11 +8,8 @@
 ******************************************************************************
 */
 
-#ifndef RAJA_scan_HPP
-#define RAJA_scan_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,39 +19,16 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_scan_HPP
+#define RAJA_scan_HPP
+
 #include "RAJA/config.hpp"
-#include "RAJA/util/concepts.hpp"
+#include "camp/concepts.hpp"
+#include "camp/helpers.hpp"
 
 #include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/util/Operators.hpp"
@@ -69,14 +43,11 @@ namespace detail
 {
 
 template <typename Iter>
-using IterVal =
-    typename std::remove_const<typename std::remove_reference<decltype(
-        *RAJA::concepts::val<Iter>())>::type>::type;
+using IterVal = camp::decay<decltype(*camp::val<Iter>())>;
 
 template <typename Container>
 using ContainerVal =
-    typename std::remove_const<typename std::remove_reference<decltype(
-        *std::begin(RAJA::concepts::val<Container>()))>::type>::type;
+    camp::decay<decltype(*camp::val<camp::iterator_from<Container>>())>;
 
 }  // end namespace detail
 
diff --git a/include/RAJA/pattern/tile.hpp b/include/RAJA/pattern/tile.hpp
index a603cdacea..285db2ed8b 100644
--- a/include/RAJA/pattern/tile.hpp
+++ b/include/RAJA/pattern/tile.hpp
@@ -9,11 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forallN_tile_HPP
-#define RAJA_forallN_tile_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -23,37 +20,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forallN_tile_HPP
+#define RAJA_forallN_tile_HPP
+
 #include <type_traits>
 
 #include "RAJA/config.hpp"
diff --git a/include/RAJA/policy/MultiPolicy.hpp b/include/RAJA/policy/MultiPolicy.hpp
index 69cd533816..56756d499c 100644
--- a/include/RAJA/policy/MultiPolicy.hpp
+++ b/include/RAJA/policy/MultiPolicy.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_MultiPolicy_HPP
-#define RAJA_MultiPolicy_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,42 +19,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_MultiPolicy_HPP
+#define RAJA_MultiPolicy_HPP
+
 #include <tuple>
 
 #include "RAJA/config.hpp"
 #include "RAJA/internal/LegacyCompatibility.hpp"
-#include "RAJA/policy/fwd.hpp"
 
 #include "RAJA/policy/PolicyBase.hpp"
 
@@ -72,6 +44,10 @@ template <size_t index, size_t size, typename Policy, typename... rest>
 struct policy_invoker;
 }
 
+namespace policy
+{
+namespace multi
+{
 
 /// MultiPolicy - Meta-policy for choosing between a compile-time list of
 /// policies at runtime
@@ -105,6 +81,27 @@ class MultiPolicy
           _policies;
 };
 
+/// forall_impl - MultiPolicy overload: the policy to run is selected at runtime
+/// from a compile-time list of policies; build one with make_multi_policy()
+/// \param p MultiPolicy to use for selection (taken by value)
+/// \param iter iterable of items to supply to body
+/// \param body functor, will receive each value produced by iterable iter
+template <typename Iterable,
+          typename Body,
+          typename Selector,
+          typename... Policies>
+RAJA_INLINE void forall_impl(MultiPolicy<Selector, Policies...> p,
+                        Iterable &&iter,
+                        Body &&body)
+{
+  p.invoke(iter, body);  // iter and body are handed on as lvalues
+}
+
+}  // end namespace multi
+}  // end namespace policy
+
+using policy::multi::MultiPolicy;
+
 namespace detail
 {
 template <size_t... Indices, typename... Policies, typename Selector>
@@ -148,34 +145,6 @@ auto make_multi_policy(std::tuple<Policies...> policies, Selector s)
       VarOps::make_index_sequence<sizeof...(Policies)>{}, s, policies);
 }
 
-namespace wrap
-{
-
-template <typename ExecutionPolicy, typename Container, typename LoopBody>
-RAJA_INLINE concepts::
-    enable_if<concepts::
-                  negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
-              type_traits::is_range<Container>>
-    forall(ExecutionPolicy &&, Container &&, LoopBody &&);
-
-/// forall - MultiPolicy specialization, select at runtime from a
-/// compile-time list of policies, build with make_multi_policy()
-/// \param p MultiPolicy to use for selection
-/// \param iter iterable of items to supply to body
-/// \param body functor, will receive each value produced by iterable iter
-template <typename Iterable,
-          typename Body,
-          typename Selector,
-          typename... Policies>
-RAJA_INLINE void forall(MultiPolicy<Selector, Policies...> p,
-                        Iterable &&iter,
-                        Body &&body)
-{
-  p.invoke(iter, body);
-}
-
-}  // closing brace for namespace wrap
-
 namespace detail
 {
 
@@ -191,7 +160,8 @@ struct policy_invoker : public policy_invoker<index - 1, size, rest...> {
   void invoke(int offset, Iterable &&iter, Body &&body)
   {
     if (offset == size - index - 1) {
-      RAJA::wrap::forall(_p, iter, body);
+      using policy::multi::forall_impl;
+      forall_impl(_p, iter, body);
     } else {
       NextInvoker::invoke(offset, iter, body);
     }
@@ -206,7 +176,8 @@ struct policy_invoker<0, size, Policy, rest...> {
   void invoke(int offset, Iterable &&iter, Body &&body)
   {
     if (offset == size - 1) {
-      RAJA::wrap::forall(_p, iter, body);
+      using policy::multi::forall_impl;
+      forall_impl(_p, iter, body);
     } else {
       throw std::runtime_error("unknown offset invoked");
     }
diff --git a/include/RAJA/policy/PolicyBase.hpp b/include/RAJA/policy/PolicyBase.hpp
index 55bd31607e..c192eafd1d 100644
--- a/include/RAJA/policy/PolicyBase.hpp
+++ b/include/RAJA/policy/PolicyBase.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_POLICYBASE_HPP
-#define RAJA_POLICYBASE_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_POLICYBASE_HPP
+#define RAJA_POLICYBASE_HPP
+
 #include <cstddef>
 #include "RAJA/util/concepts.hpp"
 
@@ -104,25 +77,25 @@ struct platform_of {
 
 template <typename PolicyType, RAJA::Policy P_>
 struct policy_is
-    : concepts::bool_<policy_of<concepts::types::decay_t<PolicyType>>::value
+    : camp::num<policy_of<camp::decay<PolicyType>>::value
                       == P_> {
 };
 
 template <typename PolicyType, RAJA::Pattern P_>
 struct pattern_is
-    : concepts::bool_<pattern_of<concepts::types::decay_t<PolicyType>>::value
+    : camp::num<pattern_of<camp::decay<PolicyType>>::value
                       == P_> {
 };
 
 template <typename PolicyType, RAJA::Launch L_>
 struct launch_is
-    : concepts::bool_<launch_of<concepts::types::decay_t<PolicyType>>::value
+    : camp::num<launch_of<camp::decay<PolicyType>>::value
                       == L_> {
 };
 
 template <typename PolicyType, RAJA::Platform P_>
 struct platform_is
-    : concepts::bool_<platform_of<concepts::types::decay_t<PolicyType>>::value
+    : camp::num<platform_of<camp::decay<PolicyType>>::value
                       == P_> {
 };
 
@@ -162,11 +135,11 @@ namespace concepts
 template <typename Pol>
 struct ExecutionPolicy
     : DefineConcept(
-          has_type<::RAJA::Policy>(types::decay_t<decltype(Pol::policy)>()),
-          has_type<::RAJA::Pattern>(types::decay_t<decltype(Pol::pattern)>()),
-          has_type<::RAJA::Launch>(types::decay_t<decltype(Pol::launch)>()),
+          has_type<::RAJA::Policy>(camp::decay<decltype(Pol::policy)>()),
+          has_type<::RAJA::Pattern>(camp::decay<decltype(Pol::pattern)>()),
+          has_type<::RAJA::Launch>(camp::decay<decltype(Pol::launch)>()),
           has_type<::RAJA::Platform>(
-              types::decay_t<decltype(Pol::platform)>())) {
+              camp::decay<decltype(Pol::platform)>())) {
 };
 
 }  // end namespace concepts
diff --git a/include/RAJA/policy/atomic_auto.hpp b/include/RAJA/policy/atomic_auto.hpp
index 9ece1f3dc7..4dcf3114ca 100644
--- a/include/RAJA/policy/atomic_auto.hpp
+++ b/include/RAJA/policy/atomic_auto.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_policy_atomic_auto_HPP
-#define RAJA_policy_atomic_auto_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_policy_atomic_auto_HPP
+#define RAJA_policy_atomic_auto_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/defines.hpp"
 
diff --git a/include/RAJA/policy/atomic_builtin.hpp b/include/RAJA/policy/atomic_builtin.hpp
index 7802e3e0ea..ab56f63e76 100644
--- a/include/RAJA/policy/atomic_builtin.hpp
+++ b/include/RAJA/policy/atomic_builtin.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_policy_atomic_builtin_HPP
-#define RAJA_policy_atomic_builtin_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_policy_atomic_builtin_HPP
+#define RAJA_policy_atomic_builtin_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/TypeConvert.hpp"
 #include "RAJA/util/defines.hpp"
diff --git a/include/RAJA/policy/cuda.hpp b/include/RAJA/policy/cuda.hpp
index d5e0e58723..13fca14ec3 100644
--- a/include/RAJA/policy/cuda.hpp
+++ b/include/RAJA/policy/cuda.hpp
@@ -10,15 +10,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_cuda_HPP
-#define RAJA_cuda_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_CUDA)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -28,37 +21,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_cuda_HPP
+#define RAJA_cuda_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
 #include <cuda.h>
 #include <cuda_runtime.h>
 
@@ -71,6 +44,7 @@
 #endif
 
 #include "RAJA/policy/cuda/forallN.hpp"
+#include "RAJA/policy/cuda/nested.hpp"
 
 #endif  // closing endif for if defined(RAJA_ENABLE_CUDA)
 
diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index 33c3c1da93..48f2bd99c2 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -9,15 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_MemUtils_CUDA_HPP
-#define RAJA_MemUtils_CUDA_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_CUDA)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -27,412 +20,254 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_MemUtils_CUDA_HPP
+#define RAJA_MemUtils_CUDA_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
 #include "RAJA/util/types.hpp"
 
+#include "RAJA/util/basic_mempool.hpp"
+
+#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
+#include "RAJA/util/mutex.hpp"
+
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <unordered_map>
+#include <type_traits>
+
 namespace RAJA
 {
-/*!
- * \def RAJA_CUDA_MAX_NUM_BLOCKS
- * Maximum number of blocks that RAJA will launch
- */
-#define RAJA_CUDA_MAX_NUM_BLOCKS (1024 * 16)
 
-/*!
- * \def RAJA_CUDA_REDUCE_BLOCK_LENGTH
- * Size of reduction memory block for each reducer object (value based on
- * rough estimate of "worst case" maximum number of blocks)
- */
-#define RAJA_CUDA_REDUCE_BLOCK_LENGTH RAJA_CUDA_MAX_NUM_BLOCKS
+namespace cuda
+{
 
-/*!
- * \def RAJA_CUDA_REDUCE_TALLY_LENGTH
- * Reduction Tallies are computed into a small block to minimize memory motion
- * Set to Max Number of Reduction Variables
- */
-#define RAJA_CUDA_REDUCE_TALLY_LENGTH RAJA_MAX_REDUCE_VARS
+//! Allocator for pinned memory for use in basic_mempool
+struct PinnedAllocator {
 
-/*!
- * \def RAJA_CUDA_REDUCE_VAR_MAXSIZE
- * Size in bytes used in CudaReductionDummyDataType for array allocation to
- * accommodate the template type used in reductions.
- *
- * Note: Includes the size of the index variable for Loc reductions.
- */
-#define RAJA_CUDA_REDUCE_VAR_MAXSIZE 16
+  // allocates nbytes with cudaHostAlloc(cudaHostAllocMapped); returns a valid pointer on success, nullptr on failure
+  void* malloc(size_t nbytes)
+  {
+    void* ptr = nullptr;  // never hand back an indeterminate pointer if the call below fails
+    cudaErrchk(cudaHostAlloc(&ptr, nbytes, cudaHostAllocMapped));
+    return ptr;
+  }
 
-/*!
- * \brief Type used to keep track of the grid size on the device
- */
-typedef unsigned int GridSizeType;
+  // releases ptr with cudaFreeHost; always returns true, errors are left to cudaErrchk
+  bool free(void* ptr)
+  {
+    cudaErrchk(cudaFreeHost(ptr));
+    return true;
+  }
 
-/*!
- ******************************************************************************
- *
- * \brief Type representing a single typed value for a cuda reduction.
- *
- * Enough space for a double value and an index value.
- *
- ******************************************************************************
- */
-struct RAJA_ALIGNED_ATTR(RAJA_CUDA_REDUCE_VAR_MAXSIZE)
-    CudaReductionDummyDataType {
-  unsigned char data[RAJA_CUDA_REDUCE_VAR_MAXSIZE];
 };
 
-/*!
- ******************************************************************************
- *
- * \brief Type representing a memory block for a cuda reduction.
- *
- ******************************************************************************
- */
-struct RAJA_ALIGNED_ATTR(DATA_ALIGN) CudaReductionDummyBlockType {
-  CudaReductionDummyDataType values[RAJA_CUDA_REDUCE_BLOCK_LENGTH];
-};
+//! Allocator for device memory for use in basic_mempool
+struct DeviceAllocator {
 
-/*!
- ******************************************************************************
- *
- * \brief Type representing enough memory to hold a slot in the tally block.
- *
- ******************************************************************************
- */
-struct CudaReductionDummyTallyType {
-  CudaReductionDummyDataType dummy_val;
-  GridSizeType dummy_retiredBlocks;
-};
+  // allocates nbytes with cudaMalloc; returns a valid pointer on success, nullptr on failure
+  void* malloc(size_t nbytes)
+  {
+    void* ptr = nullptr;  // never hand back an indeterminate pointer if the call below fails
+    cudaErrchk(cudaMalloc(&ptr, nbytes));
+    return ptr;
+  }
 
-/*!
- ******************************************************************************
- *
- * \brief Type used to simplify typed memory block use in cuda reductions.
- *
- * Must fit within the dummy block type (checked in static assert in the
- * reduction classes).
- *
- ******************************************************************************
- */
-template <typename T>
-struct CudaReductionBlockType {
-  T values[RAJA_CUDA_REDUCE_BLOCK_LENGTH];
-};
+  // releases ptr with cudaFree; always returns true, errors are left to cudaErrchk
+  bool free(void* ptr)
+  {
+    cudaErrchk(cudaFree(ptr));
+    return true;
+  }
 
-/*!
- ******************************************************************************
- *
- * \brief Type used to simplify typed memory block use in cuda Loc reductions.
- *
- * Must fit within the dummy block type (checked in static assert in the
- * reduction classes).
- *
- ******************************************************************************
- */
-template <typename T>
-struct CudaReductionLocBlockType {
-  T values[RAJA_CUDA_REDUCE_BLOCK_LENGTH];
-  Index_type indices[RAJA_CUDA_REDUCE_BLOCK_LENGTH];
 };
 
-/*!
- ******************************************************************************
- *
- * \brief Type used to simplify hold value and location in cuda Loc reductions.
- *
- * Must fit within the dummy type (checked in static assert in the
- * reduction classes).
- *
- ******************************************************************************
- */
-template <typename T>
-struct CudaReductionLocType {
-  T val;
-  Index_type idx;
-};
+//! Allocator for pre-zeroed device memory for use in basic_mempool
+//  Note: Memory must be zero when returned to mempool
+struct DeviceZeroedAllocator {
+
+  // allocates nbytes with cudaMalloc and zero-fills them; returns a valid pointer on success, nullptr on failure
+  void* malloc(size_t nbytes)
+  {
+    void* ptr = nullptr;  // keeps the cudaMemset argument and the return value defined if cudaMalloc fails
+    cudaErrchk(cudaMalloc(&ptr, nbytes));
+    cudaErrchk(cudaMemset(ptr, 0, nbytes));
+    return ptr;
+  }
+
+  // releases ptr with cudaFree; always returns true, errors are left to cudaErrchk
+  bool free(void* ptr)
+  {
+    cudaErrchk(cudaFree(ptr));
+    return true;
+  }
 
-/*!
- ******************************************************************************
- *
- * \brief Type used to simplify hold tally value in cuda reductions.
- *
- * Must fit within the dummy tally type (checked in static assert in the
- * reduction classes).
- *
- * Note: Retired blocks is used to count the number of blocks that finished
- * and wrote their portion of the reduction to the memory block.
- *
- ******************************************************************************
- */
-template <typename T>
-struct CudaReductionTallyType {
-  T tally;
-  GridSizeType retiredBlocks;
 };
 
-/*!
- ******************************************************************************
- *
- * \brief Type used to simplify hold tally value in cuda atomic reductions.
- *
- * Must fit within the dummy tally type (checked in static assert in the
- * reduction classes).
- *
- ******************************************************************************
- */
-template <typename T>
-struct CudaReductionTallyTypeAtomic {
-  T tally;
+using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
+using device_zeroed_mempool_type = basic_mempool::MemPool<DeviceZeroedAllocator>;
+using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
+
+namespace detail
+{
+
+//! struct containing data necessary to coordinate kernel launches with reducers
+struct cudaInfo {
+  dim3         gridDim  = 0;  // grid dimensions of the current launch (see currentGridDim())
+  dim3         blockDim = 0;  // block dimensions of the current launch (see currentBlockDim())
+  cudaStream_t stream   = 0;  // stream of the current launch (see currentStream())
+  bool         setup_reducers = false;  // value reported by setupReducers()
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+  cudaInfo*    thread_states = nullptr;  // NOTE(review): not used in this header; presumably per-thread states - confirm
+  omp::mutex   lock;  // taken by synchronize()/launch() before touching g_stream_info_map
+#endif
 };
 
-/*!
- ******************************************************************************
- *
- * \brief Type used to simplify hold tally value in cuda Loc reductions.
- *
- * Must fit within the dummy tally type (checked in static assert in the
- * reduction classes).
- *
- * Note: Retired blocks is used to count the number of blocks that finished
- * and wrote their portion of the reduction to the memory block.
- *
- ******************************************************************************
- */
-template <typename T>
-struct CudaReductionLocTallyType {
-  CudaReductionLocType<T> tally;
-  GridSizeType retiredBlocks;
+//! class that sets a referenced value on construction and restores the previous value at destruction
+template < typename T >
+class SetterResetter {
+public:
+  SetterResetter(T& val, T new_val)
+    : m_val(val), m_old_val(val)  // remember the value val held before the change
+  {
+    m_val = new_val;  // writes through the reference, so the caller's variable changes
+  }
+  SetterResetter(const SetterResetter&) = delete;  // a copy would restore the old value a second time
+  ~SetterResetter()
+  {
+    m_val = m_old_val;  // put back the value saved in the constructor
+  }
+private:
+  T& m_val;      // reference to the caller's variable
+  T  m_old_val;  // copy of the value at construction time
+};
 
+extern cudaInfo g_status;
 
-/*!
- ******************************************************************************
- *
- * \brief Get the number of active cuda reducer objects.
- *
- * \return int number of active cuda reducer objects.
- *
- ******************************************************************************
- */
-int getCudaReducerActiveCount();
+extern cudaInfo tl_status;
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+#pragma omp threadprivate(tl_status)
+#endif
 
-/*!
- ******************************************************************************
- *
- * \brief Get the number of active cuda memblocks.
- *
- * \return int number of active cuda memblocks.
- *
- * note: getCudaMemblockUsedCount() is the number of active non-atomic reducers
- *
- ******************************************************************************
- */
-int getCudaMemblockUsedCount();
-
-/*!
- ******************************************************************************
- *
- * \brief Get a valid reduction id, or complain and exit if no valid id is
- *        available.
- *
- * \return int the next available valid reduction id.
- *
- ******************************************************************************
- */
-int getCudaReductionId();
+extern std::unordered_map<cudaStream_t, bool> g_stream_info_map;
 
-/*!
- ******************************************************************************
- *
- * \brief Release given reduction id and make inactive.
- *
- ******************************************************************************
- */
-void releaseCudaReductionId(int id);
+}  // closing brace for detail namespace
 
-/*!
- ******************************************************************************
- *
- * \brief Get tally block for reducer object with given id.
- *
- * \param[out] host_tally pointer to host tally cache slot.
- * \param[out] device_tally pointer to device tally slot.
- *
- * NOTE: Tally Block size will be:
- *
- *          sizeof(CudaReductionDummyTallyType) * RAJA_MAX_REDUCE_VARS
- *
- *       For each reducer object, we want a chunk of device memory that
- *       holds the reduced value and a small number of anciliary variables.
- *
- ******************************************************************************
- */
-void getCudaReductionTallyBlock(int id, void** host_tally, void** device_tally);
+//! Ensure all streams in use are synchronized wrt raja kernel launches; syncs the whole device if any is not
+RAJA_INLINE
+void synchronize()
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+  lock_guard<omp::mutex> lock(detail::g_status.lock);  // held for the whole function, including the device sync
+#endif
+  bool synchronize = false;  // becomes true once any stream is found marked unsynchronized
+  for (auto& val : detail::g_stream_info_map) {
+    if (!val.second) {  // false is the value launch() stores for a stream
+      synchronize = true;
+      val.second = true;  // marked synchronized here, before the device sync below runs
+    }
+  }
+  if (synchronize) {
+    cudaErrchk(cudaDeviceSynchronize());  // one device-wide sync covers every stream marked above
+  }
+}
 
-/*!
- ******************************************************************************
- *
- * \brief Release tally block for reducer object with given id.
- *
- ******************************************************************************
- */
-void releaseCudaReductionTallyBlock(int id);
+//! Ensure stream is synchronized wrt raja kernel launches; aborts if stream was never recorded by launch()
+RAJA_INLINE
+void synchronize(cudaStream_t stream)
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+  lock_guard<omp::mutex> lock(detail::g_status.lock);  // held for the whole function, including the stream sync
+#endif
+  auto iter = detail::g_stream_info_map.find(stream);
+  if (iter != detail::g_stream_info_map.end() ) {
+    if (!iter->second) {  // false is the value launch() stores for a stream
+      iter->second = true;  // marked synchronized before the sync call below
+      cudaErrchk(cudaStreamSynchronize(stream));
+    }
+  } else {
+    fprintf(stderr, "Cannot synchronize unknown stream.\n");
+    std::abort();  // a stream missing from the map is treated as a fatal error
+  }
+}
 
-/*!
- ******************************************************************************
- *
- * \brief Sets up state variales before the loop body is copied and the kernel
- *        is launched.
- *
- ******************************************************************************
- */
-void beforeCudaKernelLaunch();
+//! Record that stream is asynchronous: mark it unsynchronized, registering it on first use
+RAJA_INLINE
+void launch(cudaStream_t stream)
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+  lock_guard<omp::mutex> lock(detail::g_status.lock);  // guards g_stream_info_map
+#endif
+  auto iter = detail::g_stream_info_map.find(stream);
+  if (iter != detail::g_stream_info_map.end()) {
+    iter->second = false;  // known stream: flag it as needing synchronization
+  } else {
+    detail::g_stream_info_map.emplace(stream, false);  // first use: add it, already flagged
+  }
+}
 
-/*!
- ******************************************************************************
- *
- * \brief Resets state variables after kernel launch.
- *
- ******************************************************************************
- */
-void afterCudaKernelLaunch();
+//! Pass the result of cudaPeekAtLastError() (last runtime error, not cleared) to cudaErrchk
+RAJA_INLINE
+void peekAtLastError()
+{
+  cudaErrchk(cudaPeekAtLastError());
+}
 
-/*!
- ******************************************************************************
- *
- * \brief Updates host tally cache for read by reduction variable with id and
- * an asynchronous reduction policy.
- *
- ******************************************************************************
- */
-void beforeCudaReadTallyBlockAsync(int id);
+//! query whether reducers in this thread should setup for device execution now (reads detail::tl_status)
+RAJA_INLINE
+bool setupReducers()
+{
+  return detail::tl_status.setup_reducers;
+}
 
-/*!
- ******************************************************************************
- *
- * \brief Updates host tally cache for read by reduction variable with id and
- * a synchronous reduction policy.
- *
- ******************************************************************************
- */
-void beforeCudaReadTallyBlockSync(int id);
+//! get gridDim of current launch, as stored in detail::tl_status
+RAJA_INLINE
+dim3 currentGridDim()
+{
+  return detail::tl_status.gridDim;
+}
 
-/*!
- ******************************************************************************
- *
- * \brief Updates host tally cache for read by reduction variable with id and
- * templated on Async from the reduction policy.
- *
- ******************************************************************************
- */
-template <bool Async>
-void beforeCudaReadTallyBlock(int id)
+//! get blockDim of current launch
+RAJA_INLINE
+dim3 currentBlockDim()
 {
-  if (Async) {
-    beforeCudaReadTallyBlockAsync(id);
-  } else {
-    beforeCudaReadTallyBlockSync(id);
-  }
+  return detail::tl_status.blockDim;
 }
 
-/*!
- ******************************************************************************
- *
- * \brief  Earmark amount of device shared memory and get byte offset into
- *         device shared memory.
- *
- * \return int Byte offset into dynamic shared memory.
- *
- * \param[in] reductionBlockDim Dimensions of blocks expected by this
- *                              reduction variable.
- * \param[in] size Size of shared memory in bytes for each thread.
- *
- ******************************************************************************
- */
-int getCudaSharedmemOffset(int id, dim3 reductionBlockDim, int size);
+//! get stream for current launch
+RAJA_INLINE
+cudaStream_t currentStream()
+{
+  return detail::tl_status.stream;
+}
 
-/*!
- ******************************************************************************
- *
- * \brief  Get the amount in bytes of shared memory required for the current
- *         kernel launch and checks the launch parameters.
- *
- * \param[in] launchGridDim GridDim kernel launch parameter.
- * \param[in] launchBlockDim BlockDim kernel launch parameter.
- *
- ******************************************************************************
- */
-int getCudaSharedmemAmount(dim3 launchGridDim, dim3 launchBlockDim);
+//! create copy of loop_body that is set up for device execution
+template < typename LOOP_BODY >
+RAJA_INLINE
+typename std::remove_reference<LOOP_BODY>::type make_launch_body(
+  dim3 gridDim, dim3 blockDim, size_t dynamic_smem, cudaStream_t stream,
+  LOOP_BODY&& loop_body)
+{
+  detail::SetterResetter<bool> setup_reducers_srer(
+                                        detail::tl_status.setup_reducers, true);
 
-/*!
- ******************************************************************************
- *
- * \brief  Free managed memory block used in RAJA-Cuda reductions.
- *
- ******************************************************************************
- */
-void freeCudaReductionTallyBlock();
+  detail::tl_status.stream   = stream;
+  detail::tl_status.gridDim  = gridDim;
+  detail::tl_status.blockDim = blockDim;
 
-/*!
- ******************************************************************************
- *
- * \brief  Get device memory block for RAJA-CUDA reduction variable  with
- *         given id.
- *
- *         Allocates data block if it isn't allocated already.
- *
- * \param[out] device_memblock Pointer to device memory block.
- *
- * NOTE: Total Block size will be:
- *
- *          sizeof(CudaReductionDummyDataType) *
- *            RAJA_MAX_REDUCE_VARS * RAJA_CUDA_REDUCE_BLOCK_LENGTH
- *
- *       For each reducer object, we want a chunk of device memory that
- *       holds RAJA_CUDA_REDUCE_BLOCK_LENGTH slots for the reduction
- *       value for each thread block.
- *
- ******************************************************************************
- */
-void getCudaReductionMemBlock(int id, void** device_memblock);
+  return {loop_body};
+}
 
-/*!
- ******************************************************************************
- *
- * \brief  Free device memory blocks used in RAJA-Cuda reductions.
- *
- ******************************************************************************
- */
-void freeCudaReductionMemBlock();
+}  // closing brace for cuda namespace
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/policy/cuda/atomic.hpp b/include/RAJA/policy/cuda/atomic.hpp
index 5fc3d86e63..a589ad6648 100644
--- a/include/RAJA/policy/cuda/atomic.hpp
+++ b/include/RAJA/policy/cuda/atomic.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_policy_cuda_atomic_HPP
-#define RAJA_policy_cuda_atomic_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_policy_cuda_atomic_HPP
+#define RAJA_policy_cuda_atomic_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/TypeConvert.hpp"
diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp
index 0c1f6ae14d..c4bc95f029 100644
--- a/include/RAJA/policy/cuda/forall.hpp
+++ b/include/RAJA/policy/cuda/forall.hpp
@@ -12,17 +12,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forall_cuda_HPP
-#define RAJA_forall_cuda_HPP
-
-#include "RAJA/config.hpp"
-#include "RAJA/pattern/forall.hpp"
-
-
-#if defined(RAJA_ENABLE_CUDA)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -32,37 +23,19 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forall_cuda_HPP
+#define RAJA_forall_cuda_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
+#include "RAJA/pattern/forall.hpp"
+
 #include "RAJA/util/defines.hpp"
 #include "RAJA/util/types.hpp"
 
@@ -74,22 +47,35 @@
 
 #include "RAJA/index/IndexSet.hpp"
 
+#include <algorithm>
+
 namespace RAJA
 {
 
+namespace policy
+{
+namespace cuda
+{
+
 namespace impl
 {
-//
-//////////////////////////////////////////////////////////////////////
-//
-// CUDA kernel templates.
-//
-//////////////////////////////////////////////////////////////////////
-//
 
-// INTERNAL namespace to encapsulate helper functions
-namespace INTERNAL
+/*!
+ ******************************************************************************
+ *
+ * \brief calculate gridDim from length of iteration and blockDim
+ *
+ ******************************************************************************
+ */
+RAJA_INLINE
+dim3 getGridDim(size_t len, dim3 blockDim)
 {
+  size_t block_size = blockDim.x * blockDim.y * blockDim.z;
+
+  size_t gridSize = (len + block_size-1) / block_size;
+
+  return gridSize;
+}
 
 /*!
  ******************************************************************************
@@ -133,6 +119,14 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D()
   return numThreads;
 }
 
+//
+//////////////////////////////////////////////////////////////////////
+//
+// CUDA kernel templates.
+//
+//////////////////////////////////////////////////////////////////////
+//
+
 /*!
  ******************************************************************************
  *
@@ -140,44 +134,22 @@ __device__ __forceinline__ unsigned int getGlobalNumThreads_3D_3D()
  *
  ******************************************************************************
  */
-template <typename Iterator, typename LOOP_BODY, typename IndexType>
+template <size_t BlockSize, typename Iterator, typename LOOP_BODY, typename IndexType>
+__launch_bounds__ (BlockSize, 1)
 __global__ void forall_cuda_kernel(LOOP_BODY loop_body,
                                    const Iterator idx,
                                    IndexType length)
 {
-  auto body = loop_body;
+  using RAJA::internal::thread_privatize;
+  auto privatizer = thread_privatize(loop_body);
+  auto body = privatizer.get_priv();
   auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D());
   if (ii < length) {
     body(idx[ii]);
   }
 }
 
-/*!
- ******************************************************************************
- *
- * \brief  CUDA kernal forall_Icount template for indiraction array.
- *
- *         NOTE: lambda loop body requires two args (icount, index).
- *
- ******************************************************************************
- */
-template <typename Iterator,
-          typename LoopBody,
-          typename IndexType,
-          typename IndexType2>
-__global__ void forall_Icount_cuda_kernel(LoopBody loop_body,
-                                          const Iterator idx,
-                                          IndexType length,
-                                          IndexType2 icount)
-{
-  auto body = loop_body;
-  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D());
-  if (ii < length) {
-    body(static_cast<IndexType>(ii + icount), idx[ii]);
-  }
-}
-
-}  // end INTERNAL namespace for helper functions
+}  // end impl namespace
 
 //
 ////////////////////////////////////////////////////////////////////////
@@ -188,86 +160,37 @@ __global__ void forall_Icount_cuda_kernel(LoopBody loop_body,
 //
 
 template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async>
-RAJA_INLINE void forall(cuda_exec<BlockSize, Async>,
+RAJA_INLINE void forall_impl(cuda_exec<BlockSize, Async>,
                         Iterable&& iter,
                         LoopBody&& loop_body)
 {
-  beforeCudaKernelLaunch();
-
-  auto body = loop_body;
+  auto begin = std::begin(iter);
+  auto end   = std::end(iter);
 
-  auto first_begin = std::begin(iter);
-  auto final_end = std::end(iter);
-  auto total_len = std::distance(first_begin, final_end);
-  auto max_step_size = (getCudaMemblockUsedCount() > 0)
-                           ? BlockSize * RAJA_CUDA_MAX_NUM_BLOCKS
-                           : total_len;
-
-  for (decltype(total_len) step_size, offset = 0; offset < total_len;
-       offset += step_size) {
-
-    step_size = RAJA_MIN(total_len - offset, max_step_size);
-
-    auto begin = first_begin + offset;
-    auto end = begin + step_size;
-
-    auto len = std::distance(begin, end);
-    auto gridSize = RAJA_DIVIDE_CEILING_INT(len, BlockSize);
-
-    INTERNAL::
-        forall_cuda_kernel<<<RAJA_CUDA_LAUNCH_PARAMS(gridSize, BlockSize)>>>(
-            body, std::move(begin), len);
-  }
-
-  RAJA_CUDA_CHECK_AND_SYNC(Async);
-
-  afterCudaKernelLaunch();
-}
-
-
-template <typename Iterable,
-          typename IndexType,
-          typename LoopBody,
-          size_t BlockSize,
-          bool Async>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(cuda_exec<BlockSize, Async>,
-              Iterable&& iter,
-              IndexType icount,
-              LoopBody&& loop_body)
-{
-  beforeCudaKernelLaunch();
+  auto len = std::distance(begin, end);
 
-  auto body = loop_body;
+  if (len > 0 && BlockSize > 0) {
 
-  auto first_begin = std::begin(iter);
-  auto final_end = std::end(iter);
-  auto total_len = std::distance(first_begin, final_end);
+    auto gridSize = impl::getGridDim(len, BlockSize);
 
-  auto max_step_size = (getCudaMemblockUsedCount() > 0)
-                           ? BlockSize * RAJA_CUDA_MAX_NUM_BLOCKS
-                           : total_len;
+    RAJA_FT_BEGIN;
 
-  for (decltype(total_len) step_size, offset = 0; offset < total_len;
-       offset += step_size) {
+    cudaStream_t stream = 0;
 
-    step_size = RAJA_MIN(total_len - offset, max_step_size);
+    impl::forall_cuda_kernel<BlockSize><<<gridSize, BlockSize, 0, stream>>>(
+        RAJA::cuda::make_launch_body(gridSize, BlockSize, 0, stream,
+                               std::forward<LoopBody>(loop_body)),
+        std::move(begin), len);
+    RAJA::cuda::peekAtLastError();
 
-    auto begin = first_begin + offset;
-    auto end = begin + step_size;
+    RAJA::cuda::launch(stream);
+    if (!Async) RAJA::cuda::synchronize(stream);
 
-    auto len = std::distance(begin, end);
-    auto gridSize = RAJA_DIVIDE_CEILING_INT(len, BlockSize);
-
-    INTERNAL::forall_Icount_cuda_kernel<<<RAJA_CUDA_LAUNCH_PARAMS(gridSize,
-                                                                  BlockSize)>>>(
-        body, std::move(begin), len, static_cast<IndexType>(icount + offset));
+    RAJA_FT_END;
   }
+}
 
-  RAJA_CUDA_CHECK_AND_SYNC(Async);
 
-  afterCudaKernelLaunch();
-}
 
 //
 //////////////////////////////////////////////////////////////////////
@@ -291,56 +214,24 @@ template <typename LoopBody,
           size_t BlockSize,
           bool Async,
           typename... SegmentTypes>
-RAJA_INLINE void forall(ExecPolicy<seq_segit, cuda_exec<BlockSize, Async>>,
+RAJA_INLINE void forall_impl(ExecPolicy<seq_segit, cuda_exec<BlockSize, Async>>,
                         const StaticIndexSet<SegmentTypes...>& iset,
                         LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
   for (int isi = 0; isi < num_seg; ++isi) {
     iset.segmentCall(isi,
-                     CallForall(),
-                     cuda_exec<BlockSize, Async>(),
+                     detail::CallForall(),
+                     cuda_exec<BlockSize, true>(),
                      loop_body);
   }  // iterate over segments of index set
 
-  RAJA_CUDA_CHECK_AND_SYNC(Async);
+  if (!Async) RAJA::cuda::synchronize();
 }
 
+}  // closing brace for cuda namespace
 
-/*!
- ******************************************************************************
- *
- * \brief  Sequential iteration over segments of index set and
- *         CUDA execution for segments.
- *
- *         This method passes index count to segment iteration.
- *
- *         NOTE: lambda loop body requires two args (icount, index).
- *
- ******************************************************************************
- */
-template <typename LoopBody,
-          size_t BlockSize,
-          bool Async,
-          typename... SegmentTypes>
-RAJA_INLINE void forall_Icount(
-    ExecPolicy<seq_segit, cuda_exec<BlockSize, Async>>,
-    const StaticIndexSet<SegmentTypes...>& iset,
-    LoopBody&& loop_body)
-{
-  auto num_seg = iset.getNumSegments();
-  for (decltype(num_seg) isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(isi,
-                     CallForallIcount(iset.getStartingIcount(isi)),
-                     cuda_exec<BlockSize>(),
-                     loop_body);
-
-  }  // iterate over segments of index set
-
-  RAJA_CUDA_CHECK_AND_SYNC(Async);
-}
-
-}  // closing brace for impl namespace
+}  // closing brace for policy namespace
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/policy/cuda/forallN.hpp b/include/RAJA/policy/cuda/forallN.hpp
index 5789eaba08..79dbbe6303 100644
--- a/include/RAJA/policy/cuda/forallN.hpp
+++ b/include/RAJA/policy/cuda/forallN.hpp
@@ -9,15 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forallN_cuda_HPP
-#define RAJA_forallN_cuda_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_CUDA)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -27,37 +20,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forallN_cuda_HPP
+#define RAJA_forallN_cuda_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
 #include <cassert>
 #include <climits>
 
@@ -211,16 +184,23 @@ struct ForallN_Executor<device,
 
   template <typename BODY, typename... CARGS>
   RAJA_INLINE void callLauncher(CudaDim const &dims,
-                                BODY body,
+                                BODY loop_body,
                                 CARGS const &... cargs) const
   {
     if (numBlocks(dims) > 0 && numThreads(dims) > 0) {
-      cudaLauncherN<<<RAJA_CUDA_LAUNCH_PARAMS(dims.num_blocks,
-                                              dims.num_threads)>>>(body,
-                                                                   cargs...);
-    }
 
-    RAJA_CUDA_CHECK_AND_SYNC(true);
+      bool Async = true;
+      cudaStream_t stream = 0;
+
+      cudaLauncherN<<<dims.num_blocks, dims.num_threads, 0, stream>>>(
+          RAJA::cuda::make_launch_body(dims.num_blocks, dims.num_threads, 0, stream,
+                                 std::move(loop_body)),
+          cargs...);
+      RAJA::cuda::peekAtLastError();
+
+      RAJA::cuda::launch(stream);
+      if (!Async) RAJA::cuda::synchronize(stream);
+    }
   }
 };
 
@@ -234,17 +214,25 @@ struct ForallN_Executor<device, ForallN_PolicyPair<CudaPolicy<CuARG0>, ISET0>> {
   }
 
   template <typename BODY>
-  RAJA_INLINE void operator()(BODY body) const
+  RAJA_INLINE void operator()(BODY loop_body) const
   {
     CudaDim dims;
     auto c0 = make_cuda_iter_wrapper(CuARG0(dims, iset0), std::begin(iset0));
 
     if (numBlocks(dims) > 0 && numThreads(dims) > 0) {
-      cudaLauncherN<<<RAJA_CUDA_LAUNCH_PARAMS(dims.num_blocks,
-                                              dims.num_threads)>>>(body, c0);
-    }
 
-    RAJA_CUDA_CHECK_AND_SYNC(true);
+      bool Async = true;
+      cudaStream_t stream = 0;
+
+      cudaLauncherN<<<dims.num_blocks, dims.num_threads, 0, stream>>>(
+          RAJA::cuda::make_launch_body(dims.num_blocks, dims.num_threads, 0, stream,
+                                 std::move(loop_body)),
+          c0);
+      RAJA::cuda::peekAtLastError();
+
+      RAJA::cuda::launch(stream);
+      if (!Async) RAJA::cuda::synchronize(stream);
+    }
   }
 };
 
diff --git a/include/RAJA/policy/cuda/fwd.hpp b/include/RAJA/policy/cuda/fwd.hpp
deleted file mode 100644
index bc69f96230..0000000000
--- a/include/RAJA/policy/cuda/fwd.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-//
-// Produced at the Lawrence Livermore National Laboratory
-//
-// LLNL-CODE-689114
-//
-// All rights reserved.
-//
-// This file is part of RAJA.
-//
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file containing RAJA segment template methods for
- *          execution via CUDA kernel launch.
- *
- *          These methods should work on any platform that supports
- *          CUDA devices.
- *
- ******************************************************************************
- */
-
-#ifndef RAJA_forward_cuda_HXX
-#define RAJA_forward_cuda_HXX
-
-#include "RAJA/config.hpp"
-
-
-#if defined(RAJA_ENABLE_CUDA)
-
-#include "RAJA/policy/cuda/policy.hpp"
-
-namespace RAJA
-{
-
-namespace impl
-{
-
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async>
-RAJA_INLINE void forall(cuda_exec<BlockSize, Async>, Iterable&&, LoopBody&&);
-
-
-template <typename Iterable,
-          typename IndexType,
-          typename LoopBody,
-          size_t BlockSize,
-          bool Async>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(cuda_exec<BlockSize, Async>, Iterable&&, IndexType, LoopBody&&);
-
-
-template <typename LoopBody,
-          size_t BlockSize,
-          bool Async,
-          typename... SegmentTypes>
-RAJA_INLINE void forall(ExecPolicy<seq_segit, cuda_exec<BlockSize, Async>>,
-                        const StaticIndexSet<SegmentTypes...>&,
-                        LoopBody&&);
-
-
-template <typename LoopBody,
-          size_t BlockSize,
-          bool Async,
-          typename... SegmentTypes>
-RAJA_INLINE void forall_Icount(
-    ExecPolicy<seq_segit, cuda_exec<BlockSize, Async>>,
-    const StaticIndexSet<SegmentTypes...>&,
-    LoopBody&&);
-
-}  // closing brace for impl namespace
-
-}  // closing brace for RAJA namespace
-
-#endif  // closing endif for RAJA_ENABLE_CUDA guard
-
-#endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/cuda/nested.hpp b/include/RAJA/policy/cuda/nested.hpp
new file mode 100644
index 0000000000..866cb7886d
--- /dev/null
+++ b/include/RAJA/policy/cuda/nested.hpp
@@ -0,0 +1,150 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file containing constructs used to run nested
+ *          traversals on GPU with CUDA.
+ *
+ ******************************************************************************
+ */
+
+#ifndef RAJA_policy_cuda_nested_HPP
+#define RAJA_policy_cuda_nested_HPP
+
+#include "RAJA/config.hpp"
+#include "RAJA/pattern/nested.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/LICENSE.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cassert>
+#include <climits>
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/config.hpp"
+#include "RAJA/util/defines.hpp"
+#include "RAJA/util/types.hpp"
+
+#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+#include "RAJA/policy/cuda/policy.hpp"
+
+#include "RAJA/internal/ForallNPolicy.hpp"
+#include "RAJA/internal/LegacyCompatibility.hpp"
+
+
+namespace RAJA
+{
+namespace nested
+{
+
+
+template <template <camp::idx_t, typename...> class ForTypeIn,
+          std::size_t block_size,
+          camp::idx_t Index,
+          typename... Rest>
+struct Executor<ForTypeIn<Index, cuda_exec<block_size>, Rest...>> {
+  using ForType = ForTypeIn<Index, cuda_exec<block_size>, Rest...>;
+  static_assert(std::is_base_of<internal::ForBase, ForType>::value,
+                "Only For-based policies should get here");
+  template <typename BaseWrapper>
+  struct ForWrapper {
+    // Explicitly unwrap the data from the wrapper
+    ForWrapper(BaseWrapper const &w) : data(w.data) {}
+    using data_type = typename BaseWrapper::data_type;
+    data_type data;
+    template <typename InIndexType>
+    RAJA_DEVICE void operator()(InIndexType i)
+    {
+      data.template assign_index<ForType::index_val>(i);
+      camp::invoke(data.index_tuple, data.f);
+    }
+  };
+  template <typename WrappedBody>
+  void operator()(ForType const &fp, WrappedBody const &wrap)
+  {
+
+    using ::RAJA::policy::sequential::forall_impl;
+    forall_impl(fp.pol,
+                camp::get<ForType::index_val>(wrap.data.st),
+                ForWrapper<WrappedBody>{wrap});
+  }
+};
+
+
+struct cuda_collapse_exec{};
+
+template <typename FT0, typename FT1>
+struct Executor<Collapse<cuda_collapse_exec, FT0, FT1>> {
+
+  // TODO, check that FT0 and FT1 are cuda policies
+
+  template <typename WrappedBody>
+  void operator()(Collapse<cuda_collapse_exec, FT0, FT1> const &, WrappedBody const &wrap)
+  {
+    auto b0 = std::begin(camp::get<FT0::index_val>(wrap.data.st));
+    auto b1 = std::begin(camp::get<FT1::index_val>(wrap.data.st));
+
+    auto e0 = std::end(camp::get<FT0::index_val>(wrap.data.st));
+    auto e1 = std::end(camp::get<FT1::index_val>(wrap.data.st));
+
+    // Skip a level
+    for (auto i0 = b0; i0 < e0; ++i0) {
+      wrap.data.template assign_index<FT0::index_val>(*i0);
+      for (auto i1 = b1; i1 < e1; ++i1) {
+        wrap.data.template assign_index<FT1::index_val>(*i1);
+        wrap();
+      }
+    }
+  }
+};
+
+
+
+}  //namespace nested
+}  // namespace RAJA
+
+#endif  // closing endif for RAJA_ENABLE_CUDA guard
+
+#endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 69a1db9bb2..424955830e 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_policy_cuda_HPP
-#define RAJA_policy_cuda_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,39 +19,18 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_policy_cuda_HPP
+#define RAJA_policy_cuda_HPP
+
 #include "RAJA/config.hpp"
+// config.hpp must precede the RAJA_ENABLE_CUDA test, as in the sibling cuda headers
+#if defined(RAJA_ENABLE_CUDA)
 #include "RAJA/policy/PolicyBase.hpp"
+#include "RAJA/pattern/reduce.hpp"
 
 namespace RAJA
 {
@@ -132,7 +108,12 @@ template <>
 struct get_launch<false> {
   static constexpr RAJA::Launch value = RAJA::Launch::sync;
 };
-}
+} // end namespace detail
+
+namespace policy
+{
+namespace cuda
+{
 
 template <size_t BLOCK_SIZE, bool Async = false>
 struct cuda_exec
@@ -156,7 +137,7 @@ struct cuda_exec
 ///////////////////////////////////////////////////////////////////////
 ///
 
-template <size_t BLOCK_SIZE, bool Async = false>
+template <size_t BLOCK_SIZE, bool Async = false, bool maybe_atomic = false>
 struct cuda_reduce
     : public RAJA::
           make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
@@ -167,35 +148,46 @@ struct cuda_reduce
 };
 
 template <size_t BLOCK_SIZE>
-using cuda_reduce_async = cuda_reduce<BLOCK_SIZE, true>;
+using cuda_reduce_async = cuda_reduce<BLOCK_SIZE, true, false>;
 
-template <size_t BLOCK_SIZE, bool Async = false>
-struct cuda_reduce_atomic
+template <size_t BLOCK_SIZE>
+using cuda_reduce_atomic = cuda_reduce<BLOCK_SIZE, false, true>;
+
+template <size_t BLOCK_SIZE>
+using cuda_reduce_atomic_async = cuda_reduce<BLOCK_SIZE, true, true>;
+
+
+template <typename POL>
+struct CudaPolicy
     : public RAJA::
           make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
-                                                RAJA::Pattern::reduce,
-                                                detail::get_launch<Async>::
-                                                    value,
+                                                RAJA::Pattern::forall,
+                                                RAJA::Launch::undefined,
                                                 RAJA::Platform::cuda> {
 };
 
-template <size_t BLOCK_SIZE>
-using cuda_reduce_atomic_async = cuda_reduce_atomic<BLOCK_SIZE, true>;
-
 //
 // Operations in the included files are parametrized using the following
 // values for CUDA warp size and max block size.
 //
-const int WARP_SIZE = 32;
-const int RAJA_CUDA_MAX_BLOCK_SIZE = 2048;
-
-/*!
- * \def RAJA_CUDA_LAUNCH_PARAMS(gridSize, blockSize)
- * Macro that generates kernel launch parameters.
- */
-#define RAJA_CUDA_LAUNCH_PARAMS(gridSize, blockSize) \
-  gridSize, blockSize, getCudaSharedmemAmount(gridSize, blockSize)
-
+constexpr const int WARP_SIZE = 32;
+constexpr const int MAX_BLOCK_SIZE = 1024;
+constexpr const int MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
+static_assert(WARP_SIZE >= MAX_WARPS,
+      "RAJA Assumption Broken: WARP_SIZE < MAX_WARPS");
+static_assert(MAX_BLOCK_SIZE % WARP_SIZE == 0,
+      "RAJA Assumption Broken: MAX_BLOCK_SIZE not "
+      "a multiple of WARP_SIZE");
+
+} // end namespace cuda
+} // end namespace policy
+
+using policy::cuda::cuda_exec;
+using policy::cuda::cuda_reduce;
+using policy::cuda::cuda_reduce_async;
+using policy::cuda::cuda_reduce_atomic;
+using policy::cuda::cuda_reduce_atomic_async;
+using policy::cuda::CudaPolicy;
 
 /*!
  * \brief Struct that contains two CUDA dim3's that represent the number of
@@ -219,15 +211,6 @@ struct CudaDim {
   }
 };
 
-template <typename POL>
-struct CudaPolicy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
-                                                RAJA::Pattern::forall,
-                                                RAJA::Launch::undefined,
-                                                RAJA::Platform::cuda> {
-};
-
 template <typename POL, typename IDX>
 struct CudaIndexPair : public POL {
   template <typename IS>
@@ -366,4 +349,5 @@ using cuda_block_z_exec = CudaPolicy<CudaBlock<Dim3z>>;
 
 }  // closing brace for RAJA namespace
 
+#endif // RAJA_ENABLE_CUDA
 #endif
diff --git a/include/RAJA/policy/cuda/raja_cudaerrchk.hpp b/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
index 86736dddc5..7dfe987f32 100644
--- a/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
+++ b/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
@@ -10,15 +10,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_raja_cudaerrchk_HPP
-#define RAJA_raja_cudaerrchk_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_CUDA)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -28,37 +21,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_raja_cudaerrchk_HPP
+#define RAJA_raja_cudaerrchk_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
 #include <iostream>
 #include <string>
 
@@ -95,16 +68,6 @@ inline void cudaAssert(cudaError_t code,
   }
 }
 
-/*!
- * \def RAJA_CUDA_CHECK_AND_SYNC(Async)
- * Macro that checks for errors and synchronizes based on paramater Async.
- */
-#define RAJA_CUDA_CHECK_AND_SYNC(Async)  \
-  cudaErrchk(cudaPeekAtLastError());     \
-  if (!Async) {                          \
-    cudaErrchk(cudaDeviceSynchronize()); \
-  }
-
 }  // closing brace for RAJA namespace
 
 #endif  // closing endif for if defined(RAJA_ENABLE_CUDA)
diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp
index 99f9f5f784..c93e7d48a7 100644
--- a/include/RAJA/policy/cuda/reduce.hpp
+++ b/include/RAJA/policy/cuda/reduce.hpp
@@ -11,15 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_reduce_cuda_HPP
-#define RAJA_reduce_cuda_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_CUDA)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -29,1587 +22,962 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_cuda_reduce_HPP
+#define RAJA_cuda_reduce_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
 #include "RAJA/util/types.hpp"
 
+#include "RAJA/util/basic_mempool.hpp"
+
+#include "RAJA/util/SoAArray.hpp"
+
+#include "RAJA/util/SoAPtr.hpp"
+
+#include "RAJA/util/mutex.hpp"
+
+#include "RAJA/pattern/detail/reduce.hpp"
+
 #include "RAJA/pattern/reduce.hpp"
 
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+
 #include "RAJA/policy/cuda/policy.hpp"
+
+#include "RAJA/policy/cuda/atomic.hpp"
+
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 
 #include <cuda.h>
 
+#include <type_traits>
+
 namespace RAJA
 {
 
-// HIDDEN namespace to encapsulate helper functions
-namespace HIDDEN
+namespace reduce
 {
-/*!
- ******************************************************************************
- *
- * \brief Method to shuffle 32b registers in sum reduction for arbitrary type.
- *
- ******************************************************************************
- */
+
+namespace cuda
+{
+//! atomic operator version of Combiner object
+template <typename Combiner>
+struct atomic;
+
 template <typename T>
-__device__ __forceinline__ T shfl_xor(T var, int laneMask)
+struct atomic<sum<T>>
 {
-  const int int_sizeof_T = (sizeof(T) + sizeof(int) - 1) / sizeof(int);
-  union {
-    T var;
-    int arr[int_sizeof_T];
-  } Tunion;
-  Tunion.var = var;
-
-  for (int i = 0; i < int_sizeof_T; ++i) {
-    Tunion.arr[i] = __shfl_xor(Tunion.arr[i], laneMask);
+  RAJA_DEVICE RAJA_INLINE
+  void operator()(T &val, const T v)
+  {
+    RAJA::atomic::atomicAdd<T>(RAJA::atomic::cuda_atomic{}, &val, v);
   }
-  return Tunion.var;
-}
+};
 
-}  // end HIDDEN namespace for helper functions
+template <typename T>
+struct atomic<min<T>>
+{
+  RAJA_DEVICE RAJA_INLINE
+  void operator()(T &val, const T v)
+  {
+    RAJA::atomic::atomicMin<T>(RAJA::atomic::cuda_atomic{}, &val, v);
+  }
+};
 
-//
-//////////////////////////////////////////////////////////////////////
-//
-// Reduction classes.
-//
-//////////////////////////////////////////////////////////////////////
-//
+template <typename T>
+struct atomic<max<T>>
+{
+  RAJA_DEVICE RAJA_INLINE
+  void operator()(T &val, const T v)
+  {
+    RAJA::atomic::atomicMax<T>(RAJA::atomic::cuda_atomic{}, &val, v);
+  }
+};
+
+template < typename T >
+struct cuda_atomic_available {  // float, double, or a 4- or 8-byte integral type
+  static constexpr const bool value =
+      std::is_same<T, float>::value || std::is_same<T, double>::value ||
+      (std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8));
+};
+
+} // namespace cuda
+
+} // namespace reduce
+
+namespace cuda
+{
+
+namespace impl
+{
+
+/*!
+ * \brief Union overlaying a T with an array of integers that covers sizeof(T);
+ * each integer is min_integer_type_size..max_integer_type_size bytes, inclusive.
+ */
+template < typename T, size_t min_integer_type_size = 1, size_t max_integer_type_size = sizeof(long long) >
+union AsIntegerArray {
+  // integer_type: first of long long, long, int, short, char whose alignment does not exceed T's and whose size is within max; a type is also taken when the next narrower one is below min
+  static_assert(min_integer_type_size<=max_integer_type_size, "incompatible min and max integer type size");
+  using integer_type =
+    typename std::conditional<((alignof(T)>=alignof(long long) &&
+                                sizeof(long long)<=max_integer_type_size) ||
+                               sizeof(long)<min_integer_type_size),
+      long long,
+      typename std::conditional<((alignof(T)>=alignof(long) &&
+                                  sizeof(long)<=max_integer_type_size) ||
+                                 sizeof(int)<min_integer_type_size),
+        long,
+        typename std::conditional<((alignof(T)>=alignof(int) &&
+                                    sizeof(int)<=max_integer_type_size) ||
+                                   sizeof(short)<min_integer_type_size),
+          int,
+          typename std::conditional<((alignof(T)>=alignof(short) &&
+                                      sizeof(short)<=max_integer_type_size) ||
+                                     sizeof(char)<min_integer_type_size),
+            short,
+            typename std::conditional<((alignof(T)>=alignof(char) &&
+                                        sizeof(char)<=max_integer_type_size)),
+              char,
+              void
+            >::type
+          >::type
+        >::type
+      >::type
+    >::type;
+  static_assert(!std::is_same<integer_type, void>::value,    "could not find a compatible integer type");
+  static_assert(sizeof(integer_type)>=min_integer_type_size, "integer_type smaller than min integer type size");
+  static_assert(sizeof(integer_type)<=max_integer_type_size, "integer_type greater than max integer type size");
+  // number of integer_type elements needed to cover sizeof(T), rounded up
+  constexpr static size_t num_integer_type = (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type);
+  // union members: value and array share the same storage
+  T value;
+  integer_type array[num_integer_type];
+
+  RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_) {};
+
+  RAJA_HOST_DEVICE constexpr size_t array_size() const { return num_integer_type; }
+};
+
+// integer size bounds (bytes) for shfl: cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits
+constexpr const size_t min_shfl_int_type_size = sizeof(int);
+#if (__CUDACC_VER_MAJOR__ >= 9)
+constexpr const size_t max_shfl_int_type_size = sizeof(long long);
+#else
+constexpr const size_t max_shfl_int_type_size = sizeof(int);
+#endif
 
 /*!
  ******************************************************************************
  *
- * \brief  Min reduction class template for use in CUDA kernels.
+ * \brief Method to shuffle 32b registers in sum reduction for arbitrary type.
  *
- *         For usage example, see reducers.hpp.
+ * \note Returns an undefined value if src lane is inactive (divergence).
+ *       Returns this lane's value if src lane is out of bounds or has exited.
  *
  ******************************************************************************
  */
-template <size_t BLOCK_SIZE, bool Async, typename T>
-class ReduceMin<cuda_reduce<BLOCK_SIZE, Async>, T>
+template <typename T>
+RAJA_DEVICE RAJA_INLINE
+T shfl_xor_sync(T var, int laneMask)
 {
-public:
-  /*!
-   * \brief Constructor takes initial reduction value (default constructor
-   * is disabled).
-   *
-   * Note: Constructor only executes on the host.
-   */
+  AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u(var);
 
-  explicit ReduceMin(T init_val)
-  {
-    m_is_copy_host = false;
-    m_myID = getCudaReductionId();
-    getCudaReductionTallyBlock(m_myID,
-                               (void **)&m_tally_host,
-                               (void **)&m_tally_device);
-    m_tally_host->tally = init_val;
+  for (int i = 0; i < u.array_size(); ++i) {
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask);
+#else
+    u.array[i] = ::__shfl_xor(u.array[i], laneMask);
+#endif
   }
+  return u.value;
+}
 
-  /*!
-   * \brief Initialize shared memory on device, request shared memory on host.
-   *
-   * Copy constructor executes on both host and device.
-   * On host requests dynamic shared memory and gets offset into dynamic
-   * shared memory if in forall.
-   * On device initializes dynamic shared memory to appropriate value.
-   */
-  __host__ __device__
-  ReduceMin(const ReduceMin<cuda_reduce<BLOCK_SIZE, Async>, T> &other)
-  {
-    *this = other;
-#if defined(__CUDA_ARCH__)
-    m_is_copy_device = true;
-    m_finish_reduction = !other.m_is_copy_device;
-    extern __shared__ unsigned char sd_block[];
-    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    // initialize shared memory
-    T val = m_tally_device->tally;
-    for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
-      // this descends all the way to 1
-      if (threadId < i) {
-        sd[threadId + i] = val;
-      }
-    }
-    if (threadId < 1) {
-      sd[threadId] = val;
-    }
-    __syncthreads();
+template <typename T>
+RAJA_DEVICE RAJA_INLINE
+T shfl_sync(T var, int srcLane)
+{
+  AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u(var);
+
+  for (int i = 0; i < u.array_size(); ++i) {
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane);
 #else
-    m_is_copy_host = true;
-    m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCK_SIZE, sizeof(T));
+    u.array[i] = ::__shfl(u.array[i], srcLane);
 #endif
   }
+  return u.value;
+}
 
-  /*!
-   * \brief Finish reduction on device and free memory on host.
-   *
-   * Destruction on host releases the device memory chunk for
-   * reduction id and id itself for others to use.
-   * Destruction on device completes the reduction.
-   *
-   * Note: destructor executes on both host and device.
-   */
-  __host__ __device__ ~ReduceMin<cuda_reduce<BLOCK_SIZE, Async>, T>()
-  {
-#if defined(__CUDA_ARCH__)
-    if (m_finish_reduction) {
-      extern __shared__ unsigned char sd_block[];
-      T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+//! reduce values in block into thread 0
+template <typename Combiner, typename T>
+RAJA_DEVICE RAJA_INLINE
+T block_reduce(T val, T identity)
+{
+  int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
-      int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                     + (blockDim.x * blockDim.y) * threadIdx.z;
+  int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                 + (blockDim.x * blockDim.y) * threadIdx.z;
 
-      __syncthreads();
+  int warpId = threadId % policy::cuda::WARP_SIZE;
+  int warpNum = threadId / policy::cuda::WARP_SIZE;
 
-      for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-        if (threadId < i) {
-          sd[threadId] = RAJA_MIN(sd[threadId], sd[threadId + i]);
-        }
-        __syncthreads();
-      }
+  T temp = val;
 
-      for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-        if (threadId < i) {
-          sd[threadId] = RAJA_MIN(sd[threadId], sd[threadId + i]);
-        }
-      }
+  if (numThreads % policy::cuda::WARP_SIZE == 0) {
 
-      if (threadId < 1) {
-        RAJA::atomic::atomicMin<T>(RAJA::atomic::cuda_atomic{}, &m_tally_device->tally, sd[threadId]);
-      }
+    // reduce each warp
+    for (int i = 1; i < policy::cuda::WARP_SIZE ; i *= 2) {
+      T rhs = shfl_xor_sync(temp, i);
+      Combiner{}(temp, rhs);
     }
-#else
-    if (!m_is_copy_host) {
-      releaseCudaReductionTallyBlock(m_myID);
-      releaseCudaReductionId(m_myID);
-    }
-#endif
-  }
-
-  /*!
-   * \brief Operator that returns reduced min value.
-   *
-   * Note: accessor only executes on host.
-   */
-  operator T()
-  {
-    beforeCudaReadTallyBlock<Async>(m_myID);
-    return m_tally_host->tally;
-  }
-
-  /*!
-   * \brief Method that returns reduced min value.
-   *
-   * Note: accessor only executes on host.
-   */
-  T get() { return operator T(); }
-
-  /*!
-   * \brief Method that updates min value.
-   *
-   * Note: only operates on device.
-   */
-  __device__ ReduceMin<cuda_reduce<BLOCK_SIZE, Async>, T> const &min(
-      T val) const
-  {
-    extern __shared__ unsigned char sd_block[];
-    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
 
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
+  } else {
 
-    sd[threadId] = RAJA_MIN(sd[threadId], val);
+    // reduce each warp
+    for (int i = 1; i < policy::cuda::WARP_SIZE ; i *= 2) {
+      int srcLane = threadId ^ i;
+      T rhs = shfl_sync(temp, srcLane);
+      // only add from threads that exist (don't double count own value)
+      if (srcLane < numThreads) {
+        Combiner{}(temp, rhs);
+      }
+    }
 
-    return *this;
   }
 
-private:
-  /*!
-   * \brief Default constructor is declared private and not implemented.
-   */
-  ReduceMin<cuda_reduce<BLOCK_SIZE, Async>, T>();
-
-  /*!
-   * \brief Pointer to host tally block cache slot for this reduction variable.
-   */
-  CudaReductionTallyTypeAtomic<T> *m_tally_host = nullptr;
+  // reduce per warp values
+  if (numThreads > policy::cuda::WARP_SIZE) {
 
-  /*!
-   * \brief Pointer to device tally block slot for this reduction variable.
-   */
-  CudaReductionTallyTypeAtomic<T> *m_tally_device = nullptr;
+    __shared__ RAJA::detail::SoAArray<T, policy::cuda::MAX_WARPS> sd;
 
-  /*!
-   * \brief My cuda reduction variable ID.
-   */
-  int m_myID = -1;
+    // write per warp values to shared memory
+    if (warpId == 0) {
+      sd.set(warpNum, temp);
+    }
 
-  /*!
-   * \brief Byte offset into dynamic shared memory.
-   */
-  int m_smem_offset = -1;
+    __syncthreads();
 
-  /*!
-   * \brief If this variable is a copy or not; only original may release memory
-   *        or perform finalization.
-   */
-  bool m_is_copy_host = false;
-  bool m_is_copy_device = false;
-  bool m_finish_reduction = false;
-
-  // Sanity checks for block size and template type size
-  static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE & (BLOCK_SIZE - 1)));
-  static constexpr bool reasonableRangeCheck =
-      ((BLOCK_SIZE >= 32) && (BLOCK_SIZE <= 1024));
-  static constexpr bool sizeofcheck =
-      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
-       && (sizeof(CudaReductionTallyType<T>)
-           <= sizeof(CudaReductionDummyTallyType))
-       && (sizeof(CudaReductionBlockType<T>)
-           <= sizeof(CudaReductionDummyBlockType)));
-  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,
-                "Error: block sizes must be between 32 and 1024");
-  static_assert(sizeofcheck,
-                "Error: type must be of size <= " RAJA_STRINGIFY_MACRO(
-                    RAJA_CUDA_REDUCE_VAR_MAXSIZE));
-};
+    if (warpNum == 0) {
 
-/*!
- ******************************************************************************
- *
- * \brief  Max reduction class template for use in CUDA kernels.
- *
- *         For usage example, see reducers.hpp.
- *
- ******************************************************************************
- */
-template <size_t BLOCK_SIZE, bool Async, typename T>
-class ReduceMax<cuda_reduce<BLOCK_SIZE, Async>, T>
-{
-public:
-  /*!
-   * \brief Constructor takes initial reduction value (default constructor
-   * is disabled).
-   *
-   * Note: Constructor only executes on the host.
-   */
-  explicit ReduceMax(T init_val)
-  {
-    m_is_copy_host = false;
-    m_myID = getCudaReductionId();
-    getCudaReductionTallyBlock(m_myID,
-                               (void **)&m_tally_host,
-                               (void **)&m_tally_device);
-    m_tally_host->tally = init_val;
-  }
+      // read per warp values
+      if (warpId*policy::cuda::WARP_SIZE < numThreads) {
+        temp = sd.get(warpId);
+      } else {
+        temp = identity;
+      }
 
-  /*!
-   * \brief Initialize shared memory on device, request shared memory on host.
-   *
-   * Copy constructor executes on both host and device.
-   * On host requests dynamic shared memory and gets offset into dynamic
-   * shared memory if in forall.
-   * On device initializes dynamic shared memory to appropriate value.
-   */
-  __host__ __device__
-  ReduceMax(const ReduceMax<cuda_reduce<BLOCK_SIZE, Async>, T> &other)
-  {
-    *this = other;
-#if defined(__CUDA_ARCH__)
-    m_is_copy_device = true;
-    m_finish_reduction = !other.m_is_copy_device;
-    extern __shared__ unsigned char sd_block[];
-    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    // initialize shared memory
-    T val = m_tally_device->tally;
-    for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
-      // this descends all the way to 1
-      if (threadId < i) {
-        sd[threadId + i] = val;
+      for (int i = 1; i < policy::cuda::WARP_SIZE ; i *= 2) {
+        T rhs = shfl_xor_sync(temp, i);
+        Combiner{}(temp, rhs);
       }
     }
-    if (threadId < 1) {
-      sd[threadId] = val;
-    }
 
     __syncthreads();
-#else
-    m_is_copy_host = true;
-    m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCK_SIZE, sizeof(T));
-#endif
+
   }
 
-  /*!
-   * \brief Finish reduction on device and free memory on host.
-   *
-   * Destruction on host releases the device memory chunk for
-   * reduction id and id itself for others to use.
-   * Destruction on device completed the reduction.
-   *
-   * Note: destructor executes on both host and device.
-   */
-  __host__ __device__ ~ReduceMax<cuda_reduce<BLOCK_SIZE, Async>, T>()
-  {
-#if defined(__CUDA_ARCH__)
-    if (m_finish_reduction) {
-      extern __shared__ unsigned char sd_block[];
-      T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+  return temp;
+}
 
-      int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                     + (blockDim.x * blockDim.y) * threadIdx.z;
 
-      __syncthreads();
+//! reduce values in grid into thread 0 of last running block
+//  returns true if the reduced value was put in val
+template <typename Combiner, typename T, typename TempIterator>
+RAJA_DEVICE RAJA_INLINE
+bool grid_reduce(T& val, T identity,
+                 TempIterator device_mem,
+                 unsigned int* device_count)
+{
+  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
+  int numThreads = blockDim.x * blockDim.y * blockDim.z;
+  unsigned int wrap_around = numBlocks - 1;
 
-      for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-        if (threadId < i) {
-          sd[threadId] = RAJA_MAX(sd[threadId], sd[threadId + i]);
-        }
-        __syncthreads();
-      }
+  int blockId = blockIdx.x + gridDim.x * blockIdx.y
+                + (gridDim.x * gridDim.y) * blockIdx.z;
 
-      for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-        if (threadId < i) {
-          sd[threadId] = RAJA_MAX(sd[threadId], sd[threadId + i]);
-        }
-      }
+  int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                 + (blockDim.x * blockDim.y) * threadIdx.z;
 
-      if (threadId < 1) {
-        RAJA::atomic::atomicMax<T>(RAJA::atomic::cuda_atomic{}, &m_tally_device->tally, sd[threadId]);
-      }
-    }
-#else
-    if (!m_is_copy_host) {
-      releaseCudaReductionTallyBlock(m_myID);
-      releaseCudaReductionId(m_myID);
-    }
-#endif
-  }
+  T temp = block_reduce<Combiner>(val, identity);
 
-  /*!
-   * \brief Operator that returns reduced max value.
-   *
-   * Note: accessor only executes on host.
-   */
-  operator T()
-  {
-    beforeCudaReadTallyBlock<Async>(m_myID);
-    return m_tally_host->tally;
+  // one thread per block writes to device_mem
+  bool lastBlock = false;
+  if (threadId == 0) {
+    device_mem.set(blockId, temp);
+    // ensure write visible to all threadblocks
+    __threadfence();
+    // increment counter, (wraps back to zero if old count == wrap_around)
+    unsigned int old_count = ::atomicInc(device_count, wrap_around);
+    lastBlock = (old_count == wrap_around);
   }
 
-  /*!
-   * \brief Method that returns reduced max value.
-   *
-   * Note: accessor only executes on host.
-   */
-  T get() { return operator T(); }
+  // returns non-zero value if any thread passes in a non-zero value
+  lastBlock = __syncthreads_or(lastBlock);
 
-  /*!
-   * \brief Method that updates max value.
-   *
-   * Note: only operates on device.
-   */
-  __device__ ReduceMax<cuda_reduce<BLOCK_SIZE, Async>, T> const &max(
-      T val) const
-  {
-    extern __shared__ unsigned char sd_block[];
-    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+  // last block accumulates values from device_mem
+  if (lastBlock) {
+    temp = identity;
 
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
+    for (int i = threadId; i < numBlocks; i += numThreads) {
+      Combiner{}(temp, device_mem.get(i));
+    }
 
-    sd[threadId] = RAJA_MAX(sd[threadId], val);
+    temp = block_reduce<Combiner>(temp, identity);
 
-    return *this;
+    // one thread returns value
+    if (threadId == 0) {
+      val = temp;
+    }
   }
 
-private:
-  /*!
-   * \brief Default constructor is declared private and not implemented.
-   */
-  ReduceMax<cuda_reduce<BLOCK_SIZE, Async>, T>();
+  return lastBlock && threadId == 0;
+}
 
-  /*!
-   * \brief Pointer to host tally block cache slot for this reduction variable.
-   */
-  CudaReductionTallyTypeAtomic<T> *m_tally_host = nullptr;
 
-  /*!
-   * \brief Pointer to device tally block slot for this reduction variable.
-   */
-  CudaReductionTallyTypeAtomic<T> *m_tally_device = nullptr;
+//! reduce values in grid into thread 0 of last running block
+//  returns true if the reduced value was put in val
+template <typename Combiner, typename T>
+RAJA_DEVICE RAJA_INLINE
+bool grid_reduce_atomic(T& val, T identity,
+                        T* device_mem,
+                        unsigned int* device_count)
+{
+  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
+  unsigned int wrap_around = numBlocks + 1;
+
+  int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                 + (blockDim.x * blockDim.y) * threadIdx.z;
+
+  // thread 0 of the block that wins the CAS on device_count initializes device_mem
+  if (threadId == 0) {
+    unsigned int old_val = ::atomicCAS(device_count, 0u, 1u);
+    if (old_val == 0u) {
+      device_mem[0] = identity;
+      __threadfence();
+      ::atomicAdd(device_count, 1u);
+    }
+  }
 
-  /*!
-   * \brief My cuda reduction variable ID.
-   */
-  int m_myID = -1;
+  T temp = block_reduce<Combiner>(val, identity);
 
-  /*!
-   * \brief Byte offset into dynamic shared memory.
-   */
-  int m_smem_offset = -1;
+  // one thread per block performs atomic on device_mem
+  bool lastBlock = false;
+  if (threadId == 0) {
+    // thread waits for device_mem to be initialized
+    while(static_cast<volatile unsigned int*>(device_count)[0] < 2u);
+    __threadfence();
+    RAJA::reduce::cuda::atomic<Combiner>{}(device_mem[0], temp);
+    __threadfence();
+    // increment counter, (wraps back to zero if old count == wrap_around)
+    unsigned int old_count = ::atomicInc(device_count, wrap_around);
+    lastBlock = (old_count == wrap_around);
 
-  /*!
-   * \brief If this variable is a copy or not; only original may release memory
-   *        or perform finalization.
-   */
-  bool m_is_copy_host = false;
-  bool m_is_copy_device = false;
-  bool m_finish_reduction = false;
-
-  // Sanity checks for block size and template type size
-  static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE & (BLOCK_SIZE - 1)));
-  static constexpr bool reasonableRangeCheck =
-      ((BLOCK_SIZE >= 32) && (BLOCK_SIZE <= 1024));
-  static constexpr bool sizeofcheck =
-      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
-       && (sizeof(CudaReductionTallyType<T>)
-           <= sizeof(CudaReductionDummyTallyType))
-       && (sizeof(CudaReductionBlockType<T>)
-           <= sizeof(CudaReductionDummyBlockType)));
-  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,
-                "Error: block sizes must be between 32 and 1024");
-  static_assert(sizeofcheck,
-                "Error: type must be of size <= " RAJA_STRINGIFY_MACRO(
-                    RAJA_CUDA_REDUCE_VAR_MAXSIZE));
-};
+    // last block gets value from device_mem
+    if (lastBlock) {
+      val = device_mem[0];
+    }
+  }
 
-/*!
- ******************************************************************************
- *
- * \brief  Sum reduction class template for use in CUDA kernel.
- *
- *         For usage example, see reducers.hpp.
- *
- ******************************************************************************
- */
-template <size_t BLOCK_SIZE, bool Async, typename T>
-class ReduceSum<cuda_reduce<BLOCK_SIZE, Async>, T>
+  return lastBlock;
+}
+
+}  // namespace impl
+
+//! Object that manages pinned memory buffers for reduction results
+//  use one per reducer object
+template <typename T>
+class PinnedTally
 {
 public:
-  /*!
-   * \brief Constructor takes initial reduction value (default constructor
-   * is disabled).
-   *
-   * Note: Constructor only executes on the host.
-   */
-  explicit ReduceSum(T init_val)
-  {
-    m_is_copy_host = false;
-    m_myID = getCudaReductionId();
-    getCudaReductionMemBlock(m_myID, (void **)&m_blockdata);
-    getCudaReductionTallyBlock(m_myID,
-                               (void **)&m_tally_host,
-                               (void **)&m_tally_device);
-    m_tally_host->tally = init_val;
-    m_tally_host->retiredBlocks = static_cast<GridSizeType>(0);
-  }
+  //! Object put in Pinned memory with value and pointer to next Node
+  struct Node {
+    Node* next;
+    T value;
+  };
+  //! Object per stream to keep track of pinned memory nodes
+  struct StreamNode {
+    StreamNode* next;
+    cudaStream_t stream;
+    Node* node_list;
+  };
+
+  //! Iterator over streams used by reducer
+  class StreamIterator {
+  public:
+    StreamIterator() = delete;
+
+    StreamIterator(StreamNode* sn)
+      : m_sn(sn)
+    {
+    }
 
-  /*!
-   * \brief Initialize shared memory on device, request shared memory on host.
-   *
-   * Copy constructor executes on both host and device.
-   * On host requests dynamic shared memory and gets offset into dynamic
-   * shared memory if in forall.
-   * On device initializes dynamic shared memory to appropriate value.
-   */
-  __host__ __device__
-  ReduceSum(const ReduceSum<cuda_reduce<BLOCK_SIZE, Async>, T> &other)
-  {
-    *this = other;
-#if defined(__CUDA_ARCH__)
-    m_is_copy_device = true;
-    m_finish_reduction = !other.m_is_copy_device;
-    extern __shared__ unsigned char sd_block[];
-    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    // initialize shared memory
-    T val = static_cast<T>(0);
-    for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
-      // this descends all the way to 1
-      if (threadId < i) {
-        sd[threadId + i] = val;
-      }
+    const StreamIterator& operator++()
+    {
+      m_sn = m_sn->next;
+      return *this;
     }
-    if (threadId < 1) {
-      sd[threadId] = val;
+
+    StreamIterator operator++(int)
+    {
+      StreamIterator ret = *this;
+      this->operator++();
+      return ret;
     }
 
-    __syncthreads();
-#else
-    m_is_copy_host = true;
-    m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCK_SIZE, sizeof(T));
-#endif
-  }
+    cudaStream_t& operator*()
+    {
+      return m_sn->stream;
+    }
 
-  /*!
-   * \brief Finish reduction on device and free memory on host.
-   *
-   * Destruction on host releases the device memory chunk for
-   * reduction id and id itself for others to use.
-   * Destruction on device completes the reduction.
-   *
-   * Note: destructor executes on both host and device.
-   */
-  __host__ __device__ ~ReduceSum<cuda_reduce<BLOCK_SIZE, Async>, T>()
-  {
-#if defined(__CUDA_ARCH__)
-    if (m_finish_reduction) {
-      extern __shared__ unsigned char sd_block[];
-      T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+    bool operator==(const StreamIterator& rhs) const
+    {
+      return m_sn == rhs.m_sn;
+    }
 
-      int blockId = blockIdx.x + blockIdx.y * gridDim.x
-                    + gridDim.x * gridDim.y * blockIdx.z;
+    bool operator!=(const StreamIterator& rhs) const
+    {
+      return !this->operator==(rhs);
+    }
 
-      int blocks = gridDim.x * gridDim.y * gridDim.z;
+  private:
+    StreamNode* m_sn;
+  };
 
-      int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                     + (blockDim.x * blockDim.y) * threadIdx.z;
+  //! Iterator over all values generated by reducer
+  class StreamNodeIterator {
+  public:
+    StreamNodeIterator() = delete;
 
-      __syncthreads();
+    StreamNodeIterator(StreamNode* sn, Node* n)
+      : m_sn(sn), m_n(n)
+    {
+    }
 
-      for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-        if (threadId < i) {
-          sd[threadId] += sd[threadId + i];
-        }
-        __syncthreads();
+    const StreamNodeIterator& operator++()
+    {
+      if (m_n->next) {
+        m_n = m_n->next;
+      } else if (m_sn->next) {
+        m_sn = m_sn->next;
+        m_n = m_sn->node_list;
+      } else {
+        m_sn = nullptr;
+        m_n = nullptr;
       }
+      return *this;
+    }
 
-      T temp;
-      if (threadId < WARP_SIZE) {
-        temp = sd[threadId];
-        for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-          temp += HIDDEN::shfl_xor<T>(temp, i);
-        }
-      }
+    StreamNodeIterator operator++(int)
+    {
+      StreamNodeIterator ret = *this;
+      this->operator++();
+      return ret;
+    }
 
-      bool lastBlock = false;
-      if (threadId < 1) {
-        // write data to global memory block
-        m_blockdata->values[blockId] = temp;
-        // ensure write visible to all threadblocks
-        __threadfence();
-        // increment counter, (wraps back to zero at second parameter)
-        unsigned int oldBlockCount =
-            ::atomicInc((unsigned int *)&m_tally_device->retiredBlocks,
-                      (blocks - 1));
-        lastBlock = (oldBlockCount == (blocks - 1));
-      }
+    T& operator*()
+    {
+      return m_n->value;
+    }
 
-      // returns non-zero value if any thread passes in a non-zero value
-      lastBlock = __syncthreads_or(lastBlock);
-
-      if (lastBlock) {
-        T temp = static_cast<T>(0);
-
-        int threads = blockDim.x * blockDim.y * blockDim.z;
-        for (int i = threadId; i < blocks; i += threads) {
-          temp += m_blockdata->values[i];
-        }
-        // any unused slots were initialized in copy constructor
-        sd[threadId] = temp;
-        __syncthreads();
-
-        for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-          if (threadId < i) {
-            sd[threadId] += sd[threadId + i];
-          }
-          __syncthreads();
-        }
-
-        if (threadId < WARP_SIZE) {
-          temp = sd[threadId];
-          for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-            temp += HIDDEN::shfl_xor<T>(temp, i);
-          }
-        }
-
-        if (threadId < 1) {
-          // add reduction to tally
-          m_tally_device->tally += temp;
-        }
-      }
-    }
-#else
-    if (!m_is_copy_host) {
-      releaseCudaReductionTallyBlock(m_myID);
-      releaseCudaReductionId(m_myID);
+    bool operator==(const StreamNodeIterator& rhs) const
+    {
+      return m_n == rhs.m_n;
     }
-#endif
-  }
 
-  /*!
-   * \brief Operator that returns reduced sum value.
-   *
-   * Note: accessor only executes on host.
-   */
-  operator T()
-  {
-    beforeCudaReadTallyBlock<Async>(m_myID);
-    return m_tally_host->tally;
-  }
+    bool operator!=(const StreamNodeIterator& rhs) const
+    {
+      return !this->operator==(rhs);
+    }
 
-  /*!
-   * \brief Method that returns reduced sum value.
-   *
-   * Note: accessor only executes on host.
-   */
-  T get() { return operator T(); }
+  private:
+    StreamNode* m_sn;
+    Node* m_n;
+  };
 
-  /*!
-   * \brief Operator that adds value to sum.
-   *
-   * Note: only operates on device.
-   */
-  __device__ ReduceSum<cuda_reduce<BLOCK_SIZE, Async>, T> const &operator+=(
-      T val) const
+  PinnedTally()
+    : stream_list(nullptr)
   {
-    extern __shared__ unsigned char sd_block[];
-    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
 
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    sd[threadId] += val;
-
-
-    return *this;
   }
 
-private:
-  /*!
-   * \brief Default constructor is declared private and not implemented.
-   */
-  ReduceSum<cuda_reduce<BLOCK_SIZE, Async>, T>();
-
-  /*!
-   * \brief Pointer to host tally block cache slot for this reduction variable.
-   */
-  CudaReductionTallyType<T> *m_tally_host = nullptr;
-
-  /*!
-   * \brief Pointer to device data block for this reduction variable.
-   */
-  CudaReductionBlockType<T> *m_blockdata = nullptr;
+  PinnedTally(const PinnedTally&) = delete;
 
-  /*!
-   * \brief Pointer to device tally block slot for this reduction variable.
-   */
-  CudaReductionTallyType<T> *m_tally_device = nullptr;
-
-  /*!
-   * \brief My cuda reduction variable ID.
-   */
-  int m_myID = -1;
+  //! get begin iterator over streams
+  StreamIterator streamBegin()
+  {
+    return{stream_list};
+  }
 
-  /*!
-   * \brief Byte offset into dynamic shared memory.
-   */
-  int m_smem_offset = -1;
+  //! get end iterator over streams
+  StreamIterator streamEnd()
+  {
+    return{nullptr};
+  }
 
-  /*!
-   * \brief If this variable is a copy or not; only original may release memory
-   *        or perform finalization.
-   */
-  bool m_is_copy_host = false;
-  bool m_is_copy_device = false;
-  bool m_finish_reduction = false;
-
-  // Sanity checks for block size and template type size
-  static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE & (BLOCK_SIZE - 1)));
-  static constexpr bool reasonableRangeCheck =
-      ((BLOCK_SIZE >= 32) && (BLOCK_SIZE <= 1024));
-  static constexpr bool sizeofcheck =
-      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
-       && (sizeof(CudaReductionTallyType<T>)
-           <= sizeof(CudaReductionDummyTallyType))
-       && (sizeof(CudaReductionBlockType<T>)
-           <= sizeof(CudaReductionDummyBlockType)));
-  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,
-                "Error: block sizes must be between 32 and 1024");
-  static_assert(sizeofcheck,
-                "Error: type must be of size <= " RAJA_STRINGIFY_MACRO(
-                    RAJA_CUDA_REDUCE_VAR_MAXSIZE));
-};
+  //! get begin iterator over values
+  StreamNodeIterator begin()
+  {
+    return{stream_list, stream_list ? stream_list->node_list : nullptr};
+  }
 
-/*!
- ******************************************************************************
- *
- * \brief  Sum reduction Atomic Non-Deterministic Variant class template
- *         for use in CUDA kernel.
- *
- *         For usage example, see reducers.hpp.
- *
- ******************************************************************************
- */
-template <size_t BLOCK_SIZE, bool Async, typename T>
-class ReduceSum<cuda_reduce_atomic<BLOCK_SIZE, Async>, T>
-{
-public:
-  /*!
-   * \brief Constructor takes initial reduction value (default constructor
-   * is disabled).
-   *
-   * Note: Constructor only executes on the host.
-   */
-  explicit ReduceSum(T init_val)
+  //! get end iterator over values
+  StreamNodeIterator end()
   {
-    m_is_copy_host = false;
-    m_myID = getCudaReductionId();
-    getCudaReductionTallyBlock(m_myID,
-                               (void **)&m_tally_host,
-                               (void **)&m_tally_device);
-    m_tally_host->tally = init_val;
+    return{nullptr, nullptr};
   }
 
-  /*!
-   * \brief Initialize shared memory on device, request shared memory on host.
-   *
-   * Copy constructor executes on both host and device.
-   * On host requests dynamic shared memory and gets offset into dynamic
-   * shared memory if in forall.
-   * On device initializes dynamic shared memory to appropriate value.
-   */
-  __host__ __device__
-  ReduceSum(const ReduceSum<cuda_reduce_atomic<BLOCK_SIZE, Async>, T> &other)
+  //! get new value for use in stream
+  T* new_value(cudaStream_t stream)
   {
-    *this = other;
-#if defined(__CUDA_ARCH__)
-    m_is_copy_device = true;
-    m_finish_reduction = !other.m_is_copy_device;
-    extern __shared__ unsigned char sd_block[];
-    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    // initialize shared memory
-    T val = static_cast<T>(0);
-    for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
-      // this descends all the way to 1
-      if (threadId < i) {
-        sd[threadId + i] = val;
-      }
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+    StreamNode* sn = stream_list;
+    while(sn) {
+      if (sn->stream == stream) break;
+      sn = sn->next;
     }
-    if (threadId < 1) {
-      sd[threadId] = val;
+    if (!sn) {
+      sn = (StreamNode*)malloc(sizeof(StreamNode));
+      sn->next = stream_list;
+      sn->stream = stream;
+      sn->node_list = nullptr;
+      stream_list = sn;
     }
-
-    __syncthreads();
-#else
-    m_is_copy_host = true;
-    m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCK_SIZE, sizeof(T));
-#endif
+    Node* n = cuda::pinned_mempool_type::getInstance().template malloc<Node>(1);
+    n->next = sn->node_list;
+    sn->node_list = n;
+    return &n->value;
   }
 
-  /*!
-   * \brief Finish reduction on device and free memory on host.
-   *
-   * Destruction on host releases the device memory chunk for
-   * reduction id and id itself for others to use.
-   * Destruction on device completes the reduction.
-   *
-   * Note: destructor executes on both host and device.
-   */
-  __host__ __device__ ~ReduceSum<cuda_reduce_atomic<BLOCK_SIZE, Async>, T>()
+  //! synchronize all streams used
+  void synchronize_streams()
   {
-#if defined(__CUDA_ARCH__)
-    if (m_finish_reduction) {
-      extern __shared__ unsigned char sd_block[];
-      T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-
-      int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                     + (blockDim.x * blockDim.y) * threadIdx.z;
-
-      T temp = 0;
-      __syncthreads();
-
-      for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-        if (threadId < i) {
-          sd[threadId] += sd[threadId + i];
-        }
-        __syncthreads();
-      }
-
-      if (threadId < WARP_SIZE) {
-        temp = sd[threadId];
-        for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-          temp += HIDDEN::shfl_xor<T>(temp, i);
-        }
-      }
-
-      // one thread adds to tally
-      if (threadId == 0) {
-        RAJA::atomic::atomicAdd<T>(RAJA::atomic::cuda_atomic{}, &(m_tally_device->tally), temp);
-      }
-    }
-#else
-    if (!m_is_copy_host) {
-      releaseCudaReductionTallyBlock(m_myID);
-      releaseCudaReductionId(m_myID);
+    auto end = streamEnd();
+    for(auto s = streamBegin(); s != end; ++s) {
+      synchronize(*s);
     }
-#endif
   }
 
-  /*!
-   * \brief Operator that returns reduced sum value.
-   *
-   * Note: accessor only executes on host.
-   */
-  operator T()
+  //! free all values used in all streams
+  void free_list()
   {
-    beforeCudaReadTallyBlock<Async>(m_myID);
-    return m_tally_host->tally;
+    while (stream_list) {
+      StreamNode* s = stream_list;
+      while (s->node_list) {
+        Node* n = s->node_list;
+        s->node_list = n->next;
+        cuda::pinned_mempool_type::getInstance().free(n);
+      }
+      stream_list = s->next;
+      free(s);
+    }
   }
 
-  /*!
-   * \brief Operator that returns reduced sum value.
-   *
-   * Note: accessor only executes on host.
-   */
-  T get() { return operator T(); }
-
-  /*!
-   * \brief Operator that adds value to sum.
-   *
-   * Note: only operates on device.
-   */
-  __device__ ReduceSum<cuda_reduce_atomic<BLOCK_SIZE, Async>, T> const &
-  operator+=(T val) const
+  ~PinnedTally()
   {
-    extern __shared__ unsigned char sd_block[];
-    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    sd[threadId] += val;
-
-    return *this;
+    free_list();
   }
 
-private:
-  /*!
-   * \brief Default constructor is declared private and not implemented.
-   */
-  ReduceSum<cuda_reduce_atomic<BLOCK_SIZE, Async>, T>();
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+  omp::mutex m_mutex;
+#endif
 
-  /*!
-   * \brief Pointer to host tally block cache slot for this reduction variable.
-   */
-  CudaReductionTallyTypeAtomic<T> *m_tally_host = nullptr;
+private:
+  StreamNode* stream_list;
+};
 
-  /*!
-   * \brief Pointer to device tally block slot for this reduction variable.
-   */
-  CudaReductionTallyTypeAtomic<T> *m_tally_device = nullptr;
+//
+//////////////////////////////////////////////////////////////////////
+//
+// Reduction classes.
+//
+//////////////////////////////////////////////////////////////////////
+//
 
-  /*!
-   * \brief My cuda reduction variable ID.
-   */
-  int m_myID = -1;
+//! Reduction data for Cuda Offload -- stores value, identity, device buffer, and device counter
+template <bool Async, typename Combiner, typename T>
+struct Reduce_Data {
 
-  /*!
-   * \brief Byte offset into dynamic shared memory.
-   */
-  int m_smem_offset = -1;
+  mutable T value;
+  T identity;
+  unsigned int *device_count;
+  RAJA::detail::SoAPtr<T, device_mempool_type> device;
+  bool own_device_ptr;
 
-  /*!
-   * \brief If this variable is a copy or not; only original may release memory
-   *        or perform finalization.
-   */
-  bool m_is_copy_host = false;
-  bool m_is_copy_device = false;
-  bool m_finish_reduction = false;
-
-  // Sanity checks for block size and template type size
-  static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE & (BLOCK_SIZE - 1)));
-  static constexpr bool reasonableRangeCheck =
-      ((BLOCK_SIZE >= 32) && (BLOCK_SIZE <= 1024));
-  static constexpr bool sizeofcheck =
-      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
-       && (sizeof(CudaReductionTallyType<T>)
-           <= sizeof(CudaReductionDummyTallyType))
-       && (sizeof(CudaReductionBlockType<T>)
-           <= sizeof(CudaReductionDummyBlockType)));
-  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,
-                "Error: block sizes must be between 32 and 1024");
-  static_assert(sizeofcheck,
-                "Error: type must be of size <= " RAJA_STRINGIFY_MACRO(
-                    RAJA_CUDA_REDUCE_VAR_MAXSIZE));
-};
+  //! disallow default constructor
+  Reduce_Data() = delete;
 
-/*!
- ******************************************************************************
- *
- * \brief  Min-loc reducer class template for use in a CUDA execution.
- *
- *         For usage example, see reducers.hpp.
- *
- ******************************************************************************
- */
-template <size_t BLOCK_SIZE, bool Async, typename T>
-class ReduceMinLoc<cuda_reduce<BLOCK_SIZE, Async>, T>
-{
-public:
-  /*!
-   * \brief Constructor takes initial reduction value (default constructor
-   * is disabled).
+  /*! \brief create from an initial value and the reduction identity
    *
-   * Note: Constructor only executes on the host.
+   *  allocates nothing; device buffers are set up later by setupForDevice()
    */
-  explicit ReduceMinLoc(T init_val, Index_type init_loc)
+  explicit Reduce_Data(T initValue, T identity_)
+      : value{initValue},
+        identity{identity_},
+        device_count{nullptr},
+        device{},
+        own_device_ptr{false}
   {
-    m_is_copy_host = false;
-    m_myID = getCudaReductionId();
-    getCudaReductionMemBlock(m_myID, (void **)&m_blockdata);
-    getCudaReductionTallyBlock(m_myID,
-                               (void **)&m_tally_host,
-                               (void **)&m_tally_device);
-    m_tally_host->tally.val = init_val;
-    m_tally_host->tally.idx = init_loc;
-    m_tally_host->retiredBlocks = static_cast<GridSizeType>(0);
   }
 
-  /*!
-   * \brief Initialize shared memory on device, request shared memory on host.
-   *
-   * Copy constructor executes on both host and device.
-   * On host requests dynamic shared memory and gets offset into dynamic
-   * shared memory if in forall.
-   * On device initializes dynamic shared memory to appropriate value.
-   */
-  __host__ __device__
-  ReduceMinLoc(const ReduceMinLoc<cuda_reduce<BLOCK_SIZE, Async>, T> &other)
+  RAJA_HOST_DEVICE
+  Reduce_Data(const Reduce_Data &other)
+      : value{other.identity},
+        identity{other.identity},
+        device_count{other.device_count},
+        device{other.device},
+        own_device_ptr{false}
   {
-    *this = other;
-#if defined(__CUDA_ARCH__)
-    m_is_copy_device = true;
-    m_finish_reduction = !other.m_is_copy_device;
-    extern __shared__ unsigned char sd_block[];
-    T *sd_val = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-    Index_type *sd_idx = reinterpret_cast<Index_type *>(
-        &sd_block[m_smem_offset + sizeof(T) * BLOCK_SIZE]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    // initialize shared memory
-    T val = m_tally_device->tally.val;
-    Index_type idx = m_tally_device->tally.idx;
-    for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
-      // this descends all the way to 1
-      if (threadId < i) {
-        sd_val[threadId + i] = val;
-        sd_idx[threadId + i] = idx;
-      }
-    }
-    if (threadId < 1) {
-      sd_val[threadId] = val;
-      sd_idx[threadId] = idx;
-    }
-
-    __syncthreads();
-#else
-    m_is_copy_host = true;
-    m_smem_offset = getCudaSharedmemOffset(m_myID,
-                                           BLOCK_SIZE,
-                                           (sizeof(T) + sizeof(Index_type)));
-#endif
   }
 
-  /*!
-   * \brief Finish reduction on device and free memory on host.
-   *
-   * Destruction on host releases the device memory chunk for
-   * reduction id and id itself for others to use.
-   * Destruction on device completes the reduction.
-   *
-   * Note: destructor executes on both host and device.
-   */
-  __host__ __device__ ~ReduceMinLoc<cuda_reduce<BLOCK_SIZE, Async>, T>()
+  RAJA_DEVICE
+  void grid_reduce(T* output)
   {
-#if defined(__CUDA_ARCH__)
-    if (m_finish_reduction) {
-      extern __shared__ unsigned char sd_block[];
-      T *sd_val = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-      Index_type *sd_idx = reinterpret_cast<Index_type *>(
-          &sd_block[m_smem_offset + sizeof(T) * BLOCK_SIZE]);
-
-      int blockId = blockIdx.x + blockIdx.y * gridDim.x
-                    + gridDim.x * gridDim.y * blockIdx.z;
-
-      int blocks = gridDim.x * gridDim.y * gridDim.z;
-
-      int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                     + (blockDim.x * blockDim.y) * threadIdx.z;
-
-      __syncthreads();
-
-      for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-        if (threadId < i) {
-          RAJA_MINLOC_UNSTRUCTURED(sd_val[threadId],
-                                   sd_idx[threadId],
-                                   sd_val[threadId],
-                                   sd_idx[threadId],
-                                   sd_val[threadId + i],
-                                   sd_idx[threadId + i]);
-        }
-        __syncthreads();
-      }
+    T temp = value;
 
-      for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-        if (threadId < i) {
-          RAJA_MINLOC_UNSTRUCTURED(sd_val[threadId],
-                                   sd_idx[threadId],
-                                   sd_val[threadId],
-                                   sd_idx[threadId],
-                                   sd_val[threadId + i],
-                                   sd_idx[threadId + i]);
-        }
-      }
-
-      bool lastBlock = false;
-      if (threadId < 1) {
-        m_blockdata->values[blockId] = sd_val[threadId];
-        m_blockdata->indices[blockId] = sd_idx[threadId];
-
-        __threadfence();
-        unsigned int oldBlockCount =
-            ::atomicInc((unsigned int *)&m_tally_device->retiredBlocks,
-                      (blocks - 1));
-        lastBlock = (oldBlockCount == (blocks - 1));
-      }
-      lastBlock = __syncthreads_or(lastBlock);
-
-      if (lastBlock) {
-        CudaReductionLocType<T> lmin{sd_val[0], sd_idx[0]};
-
-        int threads = blockDim.x * blockDim.y * blockDim.z;
-        for (int i = threadId; i < blocks; i += threads) {
-          RAJA_MINLOC_UNSTRUCTURED(lmin.val,
-                                   lmin.idx,
-                                   lmin.val,
-                                   lmin.idx,
-                                   m_blockdata->values[i],
-                                   m_blockdata->indices[i]);
-        }
-        sd_val[threadId] = lmin.val;
-        sd_idx[threadId] = lmin.idx;
-        __syncthreads();
-
-        for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-          if (threadId < i) {
-            RAJA_MINLOC_UNSTRUCTURED(sd_val[threadId],
-                                     sd_idx[threadId],
-                                     sd_val[threadId],
-                                     sd_idx[threadId],
-                                     sd_val[threadId + i],
-                                     sd_idx[threadId + i]);
-          }
-          __syncthreads();
-        }
-
-        for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-          if (threadId < i) {
-            RAJA_MINLOC_UNSTRUCTURED(sd_val[threadId],
-                                     sd_idx[threadId],
-                                     sd_val[threadId],
-                                     sd_idx[threadId],
-                                     sd_val[threadId + i],
-                                     sd_idx[threadId + i]);
-          }
-        }
-
-        if (threadId < 1) {
-          RAJA_MINLOC_UNSTRUCTURED(m_tally_device->tally.val,
-                                   m_tally_device->tally.idx,
-                                   m_tally_device->tally.val,
-                                   m_tally_device->tally.idx,
-                                   sd_val[threadId],
-                                   sd_idx[threadId]);
-        }
-      }
+    if (impl::grid_reduce<Combiner>(temp, identity, device, device_count)) {
+      *output = temp;
     }
-#else
-    if (!m_is_copy_host) {
-      releaseCudaReductionTallyBlock(m_myID);
-      releaseCudaReductionId(m_myID);
-    }
-#endif
   }
 
-  /*!
-   * \brief Operator that returns reduced min value.
-   *
-   * Note: accessor only executes on host.
-   */
-  operator T()
+  //! check and setup for device
+  //  allocate the device buffer (one slot per block) and the device counter
+  bool setupForDevice()
   {
-    beforeCudaReadTallyBlock<Async>(m_myID);
-    return m_tally_host->tally.val;
+    bool act = !device.allocated() && setupReducers();
+    if (act) {
+      dim3 gridDim = currentGridDim();
+      size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
+      device.allocate(numBlocks);
+      device_count = device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
+      own_device_ptr = true;
+    }
+    return act;
   }
 
-  /*!
-   * \brief Method that returns reduced min value.
-   *
-   * Note: accessor only executes on host.
-   */
-  T get() { return operator T(); }
-
-  /*!
-   * \brief Method that returns index value corresponding to the reduced min.
-   *
-   * Note: accessor only executes on host.
-   */
-  Index_type getLoc()
+  //! if own resources teardown device setup
+  //  free device pointers
+  bool teardownForDevice()
   {
-    beforeCudaReadTallyBlock<Async>(m_myID);
-    return m_tally_host->tally.idx;
+    bool act = own_device_ptr;
+    if(act) {
+      device.deallocate();
+      device_zeroed_mempool_type::getInstance().free(device_count);  device_count = nullptr;
+      own_device_ptr = false;
+    }
+    return act;
   }
+};
 
-  /*!
-   * \brief Method that updates min and index values.
-   *
-   * Note: only operates on device.
-   */
-  __device__ ReduceMinLoc<cuda_reduce<BLOCK_SIZE, Async>, T> const &minloc(
-      T val,
-      Index_type idx) const
-  {
-    extern __shared__ unsigned char sd_block[];
-    T *sd_val = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-    Index_type *sd_idx = reinterpret_cast<Index_type *>(
-        &sd_block[m_smem_offset + sizeof(T) * BLOCK_SIZE]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
 
-    RAJA_MINLOC_UNSTRUCTURED(sd_val[threadId],
-                             sd_idx[threadId],
-                             sd_val[threadId],
-                             sd_idx[threadId],
-                             val,
-                             idx);
+//! Reduction data for Cuda Offload (atomic variant) -- stores value, identity, device pointers
+template <bool Async, typename Combiner, typename T>
+struct ReduceAtomic_Data {
 
-    return *this;
-  }
+  mutable T value;
+  T identity;
+  unsigned int* device_count;
+  T* device;
+  bool own_device_ptr;
 
-private:
-  /*!
-   * \brief Default constructor is declared private and not implemented.
-   */
-  ReduceMinLoc<cuda_reduce<BLOCK_SIZE, Async>, T>();
+  //! disallow default constructor
+  ReduceAtomic_Data() = delete;
 
-  /*!
-   * \brief Pointer to host tally block cache slot for this reduction variable.
+  /*! \brief create from an initial value and an identity value
+   *
+   *  device pointers start out null; setupForDevice() allocates them
    */
-  CudaReductionLocTallyType<T> *m_tally_host = nullptr;
+  explicit ReduceAtomic_Data(T initValue, T identity_)
+      : value{initValue},
+        identity{identity_},
+        device_count{nullptr},
+        device{nullptr},
+        own_device_ptr{false}
+  {
+  }
 
-  /*!
-   * \brief Pointer to device data block for this reduction variable.
-   */
-  CudaReductionLocBlockType<T> *m_blockdata = nullptr;
+  RAJA_HOST_DEVICE
+  ReduceAtomic_Data(const ReduceAtomic_Data &other)
+      : value{other.identity},
+        identity{other.identity},
+        device_count{other.device_count},
+        device{other.device},
+        own_device_ptr{false}
+  {
+  }
 
-  /*!
-   * \brief Pointer to device tally block slot for this reduction variable.
-   */
-  CudaReductionLocTallyType<T> *m_tally_device = nullptr;
+  RAJA_DEVICE
+  void grid_reduce(T* output)
+  {
+    T temp = value;
 
-  /*!
-   * \brief My cuda reduction variable ID.
-   */
-  int m_myID = -1;
+    if (impl::grid_reduce_atomic<Combiner>(temp, identity, device,
+                                           device_count)) {
+      *output = temp;
+    }
+  }
 
-  /*!
-   * \brief Byte offset into dynamic shared memory.
-   */
-  int m_smem_offset = -1;
+  //! check and setup for device
+  //  allocate the device value and the device counter; takes ownership of both
+  bool setupForDevice()
+  {
+    bool act = !device && setupReducers();
+    if (act) {
+      device = device_mempool_type::getInstance().template malloc<T>(1);
+      device_count = device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
+      own_device_ptr = true;
+    }
+    return act;
+  }
 
-  /*!
-   * \brief If this variable is a copy or not; only original may release memory
-   *        or perform finalization.
-   */
-  bool m_is_copy_host = false;
-  bool m_is_copy_device = false;
-  bool m_finish_reduction = false;
-
-  // Sanity checks for block size and template type size
-  static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE & (BLOCK_SIZE - 1)));
-  static constexpr bool reasonableRangeCheck =
-      ((BLOCK_SIZE >= 32) && (BLOCK_SIZE <= 1024));
-  static constexpr bool sizeofcheck =
-      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
-       && (sizeof(CudaReductionLocTallyType<T>)
-           <= sizeof(CudaReductionDummyTallyType))
-       && (sizeof(CudaReductionLocBlockType<T>)
-           <= sizeof(CudaReductionDummyBlockType)));
-  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,
-                "Error: block sizes must be between 32 and 1024");
-  static_assert(sizeofcheck,
-                "Error: type must be of size <= " RAJA_STRINGIFY_MACRO(
-                    RAJA_CUDA_REDUCE_VAR_MAXSIZE));
+  //! if own resources teardown device setup
+  //  free device pointers
+  bool teardownForDevice()
+  {
+    bool act = own_device_ptr;
+    if(act) {
+      device_mempool_type::getInstance().free(device);  device = nullptr;
+      device_zeroed_mempool_type::getInstance().free(device_count);  device_count = nullptr;
+      own_device_ptr = false;
+    }
+    return act;
+  }
 };
 
-/*!
- ******************************************************************************
- *
- * \brief  Max-loc reducer class template for use in a CUDA execution.
- *
- *         For usage example, see reducers.hpp.
- *
- ******************************************************************************
- */
-template <size_t BLOCK_SIZE, bool Async, typename T>
-class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE, Async>, T>
-{
-public:
-  /*!
-   * \brief Constructor takes initial reduction value (default constructor
-   * is disabled).
-   *
-   * Note: Constructor only executes on the host.
-   */
-  explicit ReduceMaxLoc(T init_val, Index_type init_loc)
+//! Cuda Reduction entity -- generalize on reduction, and type
+template <bool Async, typename Combiner, typename T, bool maybe_atomic>
+struct Reduce {
+  Reduce() = delete;
+
+  //! create a reduce object
+  //  the original object's parent is itself
+  explicit Reduce(T init_val, T identity_ = Combiner::identity())
+      : parent{this},
+        tally_or_val_ptr{new PinnedTally<T>},
+        val(init_val, identity_)
   {
-    m_is_copy_host = false;
-    m_myID = getCudaReductionId();
-    getCudaReductionMemBlock(m_myID, (void **)&m_blockdata);
-    getCudaReductionTallyBlock(m_myID,
-                               (void **)&m_tally_host,
-                               (void **)&m_tally_device);
-    m_tally_host->tally.val = init_val;
-    m_tally_host->tally.idx = init_loc;
-    m_tally_host->retiredBlocks = static_cast<GridSizeType>(0);
   }
 
-  /*!
-   * \brief Initialize shared memory on device, request shared memory on host.
-   *
-   * Copy constructor executes on both host and device.
-   * On host requests dynamic shared memory and gets offset into dynamic
-   * shared memory if in forall.
-   * On device initializes dynamic shared memory to appropriate value.
-   */
-  __host__ __device__
-  ReduceMaxLoc(const ReduceMaxLoc<cuda_reduce<BLOCK_SIZE, Async>, T> &other)
-  {
-    *this = other;
-#if defined(__CUDA_ARCH__)
-    m_is_copy_device = true;
-    m_finish_reduction = !other.m_is_copy_device;
-    extern __shared__ unsigned char sd_block[];
-    T *sd_val = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-    Index_type *sd_idx = reinterpret_cast<Index_type *>(
-        &sd_block[m_smem_offset + sizeof(T) * BLOCK_SIZE]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    // initialize shared memory
-    T val = m_tally_device->tally.val;
-    Index_type idx = m_tally_device->tally.idx;
-    for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
-      // this descends all the way to 1
-      if (threadId < i) {
-        sd_val[threadId + i] = val;
-        sd_idx[threadId + i] = idx;
+  //! copy and on host attempt to setup for device
+  RAJA_HOST_DEVICE
+  Reduce(const Reduce & other)
+#if !defined(__CUDA_ARCH__)
+      : parent{other.parent},
+#else
+      : parent{&other},
+#endif
+        tally_or_val_ptr{other.tally_or_val_ptr},
+        val(other.val)
+  {
+#if !defined(__CUDA_ARCH__)
+    if (parent) {
+      if (val.setupForDevice()) {
+        tally_or_val_ptr.val_ptr = tally_or_val_ptr.list->new_value(currentStream());
+        parent = nullptr;
       }
     }
-    if (threadId < 1) {
-      sd_val[threadId] = val;
-      sd_idx[threadId] = idx;
-    }
-
-    __syncthreads();
-#else
-    m_is_copy_host = true;
-    m_smem_offset = getCudaSharedmemOffset(m_myID,
-                                           BLOCK_SIZE,
-                                           (sizeof(T) + sizeof(Index_type)));
 #endif
   }
 
-  /*!
-   * \brief Finish reduction on device and free memory on host.
-   *
-   * Destruction on host releases the global memory block chunk for
-   * reduction id and id itself for others to use.
-   * Destruction on device completes the reduction.
-   *
-   * Note: destructor executes on both host and device.
-   */
-  __host__ __device__ ~ReduceMaxLoc<cuda_reduce<BLOCK_SIZE, Async>, T>()
+  //! apply reduction upon destruction and cleanup resources owned by this copy
+  //  on device store in pinned buffer on host
+  RAJA_HOST_DEVICE
+  ~Reduce()
   {
-#if defined(__CUDA_ARCH__)
-    if (m_finish_reduction) {
-      extern __shared__ unsigned char sd_block[];
-      T *sd_val = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-      Index_type *sd_idx = reinterpret_cast<Index_type *>(
-          &sd_block[m_smem_offset + sizeof(T) * BLOCK_SIZE]);
-
-      int blockId = blockIdx.x + blockIdx.y * gridDim.x
-                    + gridDim.x * gridDim.y * blockIdx.z;
-
-      int blocks = gridDim.x * gridDim.y * gridDim.z;
-
-      int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                     + (blockDim.x * blockDim.y) * threadIdx.z;
-
-      __syncthreads();
-
-      for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-        if (threadId < i) {
-          RAJA_MAXLOC_UNSTRUCTURED(sd_val[threadId],
-                                   sd_idx[threadId],
-                                   sd_val[threadId],
-                                   sd_idx[threadId],
-                                   sd_val[threadId + i],
-                                   sd_idx[threadId + i]);
-        }
-        __syncthreads();
-      }
-
-      for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-        if (threadId < i) {
-          RAJA_MAXLOC_UNSTRUCTURED(sd_val[threadId],
-                                   sd_idx[threadId],
-                                   sd_val[threadId],
-                                   sd_idx[threadId],
-                                   sd_val[threadId + i],
-                                   sd_idx[threadId + i]);
-        }
-      }
-
-      bool lastBlock = false;
-      if (threadId < 1) {
-        m_blockdata->values[blockId] = sd_val[threadId];
-        m_blockdata->indices[blockId] = sd_idx[threadId];
-
-        __threadfence();
-        unsigned int oldBlockCount =
-            ::atomicInc((unsigned int *)&m_tally_device->retiredBlocks,
-                      (blocks - 1));
-        lastBlock = (oldBlockCount == (blocks - 1));
+#if !defined(__CUDA_ARCH__)
+    if (parent == this) {
+      delete tally_or_val_ptr.list; tally_or_val_ptr.list = nullptr;
+    } else if (parent) {
+      if (val.value != val.identity) {
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+        lock_guard<omp::mutex> lock(tally_or_val_ptr.list->m_mutex);
+#endif
+        parent->combine(val.value);
       }
-      lastBlock = __syncthreads_or(lastBlock);
-
-      if (lastBlock) {
-        CudaReductionLocType<T> lmax{sd_val[0], sd_idx[0]};
-
-        int threads = blockDim.x * blockDim.y * blockDim.z;
-        for (int i = threadId; i < blocks; i += threads) {
-          RAJA_MAXLOC_UNSTRUCTURED(lmax.val,
-                                   lmax.idx,
-                                   lmax.val,
-                                   lmax.idx,
-                                   m_blockdata->values[i],
-                                   m_blockdata->indices[i]);
-        }
-        sd_val[threadId] = lmax.val;
-        sd_idx[threadId] = lmax.idx;
-        __syncthreads();
-
-        for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-          if (threadId < i) {
-            RAJA_MAXLOC_UNSTRUCTURED(sd_val[threadId],
-                                     sd_idx[threadId],
-                                     sd_val[threadId],
-                                     sd_idx[threadId],
-                                     sd_val[threadId + i],
-                                     sd_idx[threadId + i]);
-          }
-          __syncthreads();
-        }
-
-        for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
-          if (threadId < i) {
-            RAJA_MAXLOC_UNSTRUCTURED(sd_val[threadId],
-                                     sd_idx[threadId],
-                                     sd_val[threadId],
-                                     sd_idx[threadId],
-                                     sd_val[threadId + i],
-                                     sd_idx[threadId + i]);
-          }
-        }
-
-        if (threadId < 1) {
-          RAJA_MAXLOC_UNSTRUCTURED(m_tally_device->tally.val,
-                                   m_tally_device->tally.idx,
-                                   m_tally_device->tally.val,
-                                   m_tally_device->tally.idx,
-                                   sd_val[threadId],
-                                   sd_idx[threadId]);
-        }
+    } else {
+      if (val.teardownForDevice()) {
+        tally_or_val_ptr.val_ptr = nullptr;
       }
     }
 #else
-    if (!m_is_copy_host) {
-      releaseCudaReductionTallyBlock(m_myID);
-      releaseCudaReductionId(m_myID);
+    if (!parent->parent) {
+      val.grid_reduce(tally_or_val_ptr.val_ptr);
+    } else {
+      parent->combine(val.value);
     }
 #endif
   }
 
-  /*!
-   * \brief Operator that returns reduced min value.
-   *
-   * Note: accessor only executes on host.
-   */
+  //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    beforeCudaReadTallyBlock<Async>(m_myID);
-    return m_tally_host->tally.val;
+    auto n = tally_or_val_ptr.list->begin();
+    auto end = tally_or_val_ptr.list->end();
+    if (n != end) {
+      tally_or_val_ptr.list->synchronize_streams();
+      for ( ; n != end; ++n) {
+        Combiner{}(val.value, *n);
+      }
+      tally_or_val_ptr.list->free_list();
+    }
+    return val.value;
   }
-
-  /*!
-   * \brief Method that returns reduced min value.
-   *
-   * Note: accessor only executes on host.
-   */
+  //! alias for operator T()
   T get() { return operator T(); }
 
-  /*!
-   * \brief Method that returns index value corresponding to the reduced max.
-   *
-   * Note: accessor only executes on host.
-   */
-  Index_type getLoc()
+  //! apply reduction (const version) -- still combines internal values
+  RAJA_HOST_DEVICE
+  void combine(T other) const
   {
-    beforeCudaReadTallyBlock<Async>(m_myID);
-    return m_tally_host->tally.idx;
+    Combiner{}(val.value,other);
   }
 
   /*!
-   * \brief Method that updates max and index values.
-   *
-   * Note: only operates on device.
+   *  \return reference to the local value
    */
-  __device__ ReduceMaxLoc<cuda_reduce<BLOCK_SIZE, Async>, T> const &maxloc(
-      T val,
-      Index_type idx) const
+  T& local() const { return val.value; }
+
+  T get_combined() const { return val.value; }
+
+private:
+  const Reduce* parent;
+
+  //! union to hold either pointer to PinnedTally or pointer to value
+  //  only use list before setup for device and only use val_ptr after
+  union tally_u {
+    PinnedTally<T>* list;
+    T *val_ptr;
+    constexpr tally_u(PinnedTally<T>* l) : list(l) {};
+    constexpr tally_u(T *v_ptr) : val_ptr(v_ptr) {};
+  };
+
+  tally_u tally_or_val_ptr;
+
+  //! cuda reduction data storage class and folding algorithm
+  using reduce_data_type = typename std::conditional<maybe_atomic && RAJA::reduce::cuda::cuda_atomic_available<T>::value,
+                            cuda::ReduceAtomic_Data<Async, Combiner, T>,
+                            cuda::Reduce_Data<Async, Combiner, T>>::type;
+
+  //! storage for reduction data
+  reduce_data_type val;
+};
+
+}  // end namespace cuda
+
+//! specialization of ReduceSum for cuda_reduce
+template <size_t BLOCK_SIZE, bool Async, bool maybe_atomic, typename T>
+struct ReduceSum<cuda_reduce<BLOCK_SIZE, Async, maybe_atomic>, T>
+    : public cuda::Reduce<Async, RAJA::reduce::sum<T>, T, maybe_atomic> {
+  using Base = cuda::Reduce<Async, RAJA::reduce::sum<T>, T, maybe_atomic>;
+  using Base::Base;
+  //! enable operator+= for ReduceSum -- alias for combine()
+  RAJA_HOST_DEVICE
+  const ReduceSum &operator+=(T rhs) const
+  {
+    this->combine(rhs);
+    return *this;
+  }
+};
+
+//! specialization of ReduceMin for cuda_reduce
+template <size_t BLOCK_SIZE, bool Async, bool maybe_atomic, typename T>
+struct ReduceMin<cuda_reduce<BLOCK_SIZE, Async, maybe_atomic>, T>
+    : public cuda::Reduce<Async, RAJA::reduce::min<T>, T, maybe_atomic> {
+  using Base = cuda::Reduce<Async, RAJA::reduce::min<T>, T, maybe_atomic>;
+  using Base::Base;
+  //! enable min() for ReduceMin -- alias for combine()
+  RAJA_HOST_DEVICE
+  const ReduceMin &min(T rhs) const
+  {
+    this->combine(rhs);
+    return *this;
+  }
+};
+
+//! specialization of ReduceMax for cuda_reduce
+template <size_t BLOCK_SIZE, bool Async, bool maybe_atomic, typename T>
+struct ReduceMax<cuda_reduce<BLOCK_SIZE, Async, maybe_atomic>, T>
+    : public cuda::Reduce<Async, RAJA::reduce::max<T>, T, maybe_atomic> {
+  using Base = cuda::Reduce<Async, RAJA::reduce::max<T>, T, maybe_atomic>;
+  using Base::Base;
+  //! enable max() for ReduceMax -- alias for combine()
+  RAJA_HOST_DEVICE
+  const ReduceMax &max(T rhs) const
+  {
+    this->combine(rhs);
+    return *this;
+  }
+};
+
+//! specialization of ReduceMinLoc for cuda_reduce
+template <size_t BLOCK_SIZE, bool Async, bool maybe_atomic, typename T>
+struct ReduceMinLoc<cuda_reduce<BLOCK_SIZE, Async, maybe_atomic>, T>
+    : public cuda::Reduce<Async, RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T>>, RAJA::reduce::detail::ValueLoc<T>, maybe_atomic> {
+  using value_type = RAJA::reduce::detail::ValueLoc<T>;
+  using Base = cuda::Reduce<Async, RAJA::reduce::min<value_type>, value_type, maybe_atomic>;
+  using Base::Base;
+
+  //! constructor requires an initial value and an initial index for the reducer
+  explicit ReduceMinLoc(T init_val, Index_type init_idx)
+      : Base(value_type(init_val, init_idx))
+  {
+  }
+  //! reducer function; updates the current instance's state
+  RAJA_HOST_DEVICE
+  const ReduceMinLoc &minloc(T rhs, Index_type loc) const
   {
-    extern __shared__ unsigned char sd_block[];
-    T *sd_val = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
-    Index_type *sd_idx = reinterpret_cast<Index_type *>(
-        &sd_block[m_smem_offset + sizeof(T) * BLOCK_SIZE]);
-
-    int threadId = threadIdx.x + blockDim.x * threadIdx.y
-                   + (blockDim.x * blockDim.y) * threadIdx.z;
-
-    RAJA_MAXLOC_UNSTRUCTURED(sd_val[threadId],
-                             sd_idx[threadId],
-                             sd_val[threadId],
-                             sd_idx[threadId],
-                             val,
-                             idx);
+    this->combine(value_type(rhs, loc));
     return *this;
   }
 
-private:
-  /*!
-   * \brief Default constructor is declared private and not implemented.
-   */
-  ReduceMaxLoc<cuda_reduce<BLOCK_SIZE, Async>, T>();
+  //! Get the index (location) of the calculated reduced value
+  Index_type getLoc() { return Base::get().getLoc(); }
 
-  /*!
-   * \brief Pointer to host tally block cache slot for this reduction variable.
-   */
-  CudaReductionLocTallyType<T> *m_tally_host = nullptr;
+  //! Get the calculated reduced value
+  operator T() { return Base::get(); }
 
-  /*!
-   * \brief Pointer to device data block for this reduction variable.
-   */
-  CudaReductionLocBlockType<T> *m_blockdata = nullptr;
+  //! Get the calculated reduced value
+  T get() { return Base::get(); }
+};
 
-  /*!
-   * \brief Pointer to device tally block slot for this reduction variable.
-   */
-  CudaReductionLocTallyType<T> *m_tally_device = nullptr;
+//! specialization of ReduceMaxLoc for cuda_reduce
+template <size_t BLOCK_SIZE, bool Async, bool maybe_atomic, typename T>
+struct ReduceMaxLoc<cuda_reduce<BLOCK_SIZE, Async, maybe_atomic>, T>
+    : public cuda::Reduce<Async, RAJA::reduce::max<RAJA::reduce::detail::ValueLoc<T, false>>, RAJA::reduce::detail::ValueLoc<T, false>, maybe_atomic> {
+  using value_type = RAJA::reduce::detail::ValueLoc<T, false>;
+  using Base = cuda::Reduce<Async, RAJA::reduce::max<value_type>, value_type, maybe_atomic>;
+  using Base::Base;
 
-  /*!
-   * \brief My cuda reduction variable ID.
-   */
-  int m_myID = -1;
+  //! constructor requires an initial value and an initial index for the reducer
+  explicit ReduceMaxLoc(T init_val, Index_type init_idx)
+      : Base(value_type(init_val, init_idx))
+  {
+  }
+  //! reducer function; updates the current instance's state
+  RAJA_HOST_DEVICE
+  const ReduceMaxLoc &maxloc(T rhs, Index_type loc) const
+  {
+    this->combine(value_type(rhs, loc));
+    return *this;
+  }
 
-  /*!
-   * \brief Byte offset into dynamic shared memory.
-   */
-  int m_smem_offset = -1;
+  //! Get the index (location) of the calculated reduced value
+  Index_type getLoc() { return Base::get().getLoc(); }
 
-  /*!
-   * \brief If this variable is a copy or not; only original may release memory
-   *        or perform finalization.
-   */
-  bool m_is_copy_host = false;
-  bool m_is_copy_device = false;
-  bool m_finish_reduction = false;
-
-  // Sanity checks for block size and template type size
-  static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE & (BLOCK_SIZE - 1)));
-  static constexpr bool reasonableRangeCheck =
-      ((BLOCK_SIZE >= 32) && (BLOCK_SIZE <= 1024));
-  static constexpr bool sizeofcheck =
-      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
-       && (sizeof(CudaReductionLocTallyType<T>)
-           <= sizeof(CudaReductionDummyTallyType))
-       && (sizeof(CudaReductionLocBlockType<T>)
-           <= sizeof(CudaReductionDummyBlockType)));
-  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,
-                "Error: block sizes must be between 32 and 1024");
-  static_assert(sizeofcheck,
-                "Error: type must be of size <= " RAJA_STRINGIFY_MACRO(
-                    RAJA_CUDA_REDUCE_VAR_MAXSIZE));
+  //! Get the calculated reduced value
+  operator T() { return Base::get(); }
+
+  //! Get the calculated reduced value
+  T get() { return Base::get(); }
 };
 
 }  // closing brace for RAJA namespace
diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp
index 05612557b5..0555e95c16 100644
--- a/include/RAJA/policy/cuda/scan.hpp
+++ b/include/RAJA/policy/cuda/scan.hpp
@@ -8,15 +8,8 @@
 ******************************************************************************
 */
 
-#ifndef RAJA_scan_cuda_HPP
-#define RAJA_scan_cuda_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_CUDA)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -26,39 +19,21 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_scan_cuda_HPP
+#define RAJA_scan_cuda_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
 #include "RAJA/policy/cuda/policy.hpp"
 
+#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+
 #include <iterator>
 #include <type_traits>
 
@@ -68,6 +43,7 @@
 #else
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
+#include <thrust/system/cuda/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/scan.h>
 #endif
@@ -79,14 +55,6 @@ namespace impl
 namespace scan
 {
 
-#if defined(RAJA_ENABLE_CUB)
-RAJA_INLINE::cub::CachingDeviceAllocator& getAllocator()
-{
-  static ::cub::CachingDeviceAllocator allocator(true);
-  return allocator;
-}
-#endif
-
 /*!
         \brief explicit inclusive inplace scan given range, function, and
    initial value
@@ -97,25 +65,26 @@ void inclusive_inplace(const ::RAJA::cuda_exec<BLOCK_SIZE, Async>&,
                        InputIter end,
                        Function binary_op)
 {
+  cudaStream_t stream = 0;
 #if defined(RAJA_ENABLE_CUB)
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::InclusiveScan(
-      d_temp_storage, temp_storage_bytes, begin, begin, binary_op, len));
+      d_temp_storage, temp_storage_bytes, begin, begin, binary_op, len, stream));
   // Allocate temporary storage
-  cudaErrchk(
-      getAllocator().DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+  d_temp_storage = cuda::device_mempool_type::getInstance().malloc<unsigned char>(temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::InclusiveScan(
-      d_temp_storage, temp_storage_bytes, begin, begin, binary_op, len));
+      d_temp_storage, temp_storage_bytes, begin, begin, binary_op, len, stream));
   // Free temporary storage
-  cudaErrchk(getAllocator().DeviceFree(d_temp_storage));
+  cuda::device_mempool_type::getInstance().free(d_temp_storage);
 #else
-  ::thrust::inclusive_scan(::thrust::device, begin, end, begin, binary_op);
+  ::thrust::inclusive_scan(::thrust::cuda::par.on(stream), begin, end, begin, binary_op);
 #endif
-  RAJA_CUDA_CHECK_AND_SYNC(Async);
+  cuda::launch(stream);
+  if (!Async) cuda::synchronize(stream);
 }
 
 /*!
@@ -133,26 +102,27 @@ void exclusive_inplace(const ::RAJA::cuda_exec<BLOCK_SIZE, Async>&,
                        Function binary_op,
                        T init)
 {
+  cudaStream_t stream = 0;
 #if defined(RAJA_ENABLE_CUB)
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(
-      d_temp_storage, temp_storage_bytes, begin, begin, binary_op, init, len));
+      d_temp_storage, temp_storage_bytes, begin, begin, binary_op, init, len, stream));
   // Allocate temporary storage
-  cudaErrchk(
-      getAllocator().DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+  d_temp_storage = cuda::device_mempool_type::getInstance().malloc<unsigned char>(temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(
-      d_temp_storage, temp_storage_bytes, begin, begin, binary_op, init, len));
+      d_temp_storage, temp_storage_bytes, begin, begin, binary_op, init, len, stream));
   // Free temporary storage
-  cudaErrchk(getAllocator().DeviceFree(d_temp_storage));
+  cuda::device_mempool_type::getInstance().free(d_temp_storage);
 #else
   ::thrust::exclusive_scan(
-      ::thrust::device, begin, end, begin, init, binary_op);
+      ::thrust::cuda::par.on(stream), begin, end, begin, init, binary_op);
 #endif
-  RAJA_CUDA_CHECK_AND_SYNC(Async);
+  cuda::launch(stream);
+  if (!Async) cuda::synchronize(stream);
 }
 
 /*!
@@ -170,25 +140,26 @@ void inclusive(const ::RAJA::cuda_exec<BLOCK_SIZE, Async>&,
                OutputIter out,
                Function binary_op)
 {
+  cudaStream_t stream = 0;
 #if defined(RAJA_ENABLE_CUB)
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::InclusiveScan(
-      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len));
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
   // Allocate temporary storage
-  cudaErrchk(
-      getAllocator().DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+  d_temp_storage = cuda::device_mempool_type::getInstance().malloc<unsigned char>(temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::InclusiveScan(
-      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len));
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
   // Free temporary storage
-  cudaErrchk(getAllocator().DeviceFree(d_temp_storage));
+  cuda::device_mempool_type::getInstance().free(d_temp_storage);
 #else
-  ::thrust::inclusive_scan(::thrust::device, begin, end, out, binary_op);
+  ::thrust::inclusive_scan(::thrust::cuda::par.on(stream), begin, end, out, binary_op);
 #endif
-  RAJA_CUDA_CHECK_AND_SYNC(Async);
+  cuda::launch(stream);
+  if (!Async) cuda::synchronize(stream);
 }
 
 /*!
@@ -208,25 +179,26 @@ void exclusive(const ::RAJA::cuda_exec<BLOCK_SIZE, Async>&,
                Function binary_op,
                T init)
 {
+  cudaStream_t stream = 0;
 #if defined(RAJA_ENABLE_CUB)
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(
-      d_temp_storage, temp_storage_bytes, begin, out, binary_op, init, len));
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, init, len, stream));
   // Allocate temporary storage
-  cudaErrchk(
-      getAllocator().DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+  d_temp_storage = cuda::device_mempool_type::getInstance().malloc<unsigned char>(temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(
-      d_temp_storage, temp_storage_bytes, begin, out, binary_op, init, len));
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, init, len, stream));
   // Free temporary storage
-  cudaErrchk(getAllocator().DeviceFree(d_temp_storage));
+  cuda::device_mempool_type::getInstance().free(d_temp_storage);
 #else
-  ::thrust::exclusive_scan(::thrust::device, begin, end, out, init, binary_op);
+  ::thrust::exclusive_scan(::thrust::cuda::par.on(stream), begin, end, out, init, binary_op);
 #endif
-  RAJA_CUDA_CHECK_AND_SYNC(Async);
+  cuda::launch(stream);
+  if (!Async) cuda::synchronize(stream);
 }
 
 }  // closing brace for scan namespace
diff --git a/include/RAJA/policy/loop.hpp b/include/RAJA/policy/loop.hpp
index 7e403d9ba3..1b7eaa568e 100644
--- a/include/RAJA/policy/loop.hpp
+++ b/include/RAJA/policy/loop.hpp
@@ -10,11 +10,8 @@
 ******************************************************************************
 */
 
-#ifndef RAJA_loop_HPP
-#define RAJA_loop_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -24,37 +21,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_loop_HPP
+#define RAJA_loop_HPP
+
 #include "RAJA/policy/loop/atomic.hpp"
 #include "RAJA/policy/loop/forall.hpp"
 #include "RAJA/policy/loop/policy.hpp"
diff --git a/include/RAJA/policy/loop/atomic.hpp b/include/RAJA/policy/loop/atomic.hpp
index fd97b16877..c8727928d0 100644
--- a/include/RAJA/policy/loop/atomic.hpp
+++ b/include/RAJA/policy/loop/atomic.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_policy_loop_atomic_HPP
-#define RAJA_policy_loop_atomic_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_policy_loop_atomic_HPP
+#define RAJA_policy_loop_atomic_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/defines.hpp"
 
diff --git a/include/RAJA/policy/loop/forall.hpp b/include/RAJA/policy/loop/forall.hpp
index 9cd9f37712..bbb0702b7e 100644
--- a/include/RAJA/policy/loop/forall.hpp
+++ b/include/RAJA/policy/loop/forall.hpp
@@ -11,11 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forall_loop_HPP
-#define RAJA_forall_loop_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -25,37 +22,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONLOOP
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forall_loop_HPP
+#define RAJA_forall_loop_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/types.hpp"
@@ -71,10 +44,10 @@ using RAJA::concepts::enable_if;
 
 namespace RAJA
 {
-
-namespace impl
+namespace policy
+{
+namespace loop
 {
-
 
 //
 //////////////////////////////////////////////////////////////////////
@@ -87,29 +60,18 @@ namespace impl
 //
 
 template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const loop_exec &, Iterable &&iter, Func &&body)
+RAJA_INLINE void forall_impl(const loop_exec &, Iterable &&iter, Func &&body)  // loop_exec overload: plain for-loop calling body once per element of iter
 {
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    body(*(begin + i));
-  }
-}
+  RAJA_EXTRACT_BED_IT(iter);  // macro defined outside this file; must introduce begin_it and distance_it, which the loop below uses
 
-template <typename Iterable, typename Func, typename IndexType>
-RAJA_INLINE concepts::enable_if<type_traits::is_integral<IndexType>>
-forall_Icount(const loop_exec &, Iterable &&iter, IndexType icount, Func &&body)
-{
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    body(static_cast<IndexType>(i + icount), *(begin + i));
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {  // index type follows distance_it, so the comparison never mixes types
+    body(*(begin_it + i));  // begin_it + i requires the iterator to support offset arithmetic
   }
 }
 
-}  // closing brace for impl namespace
+}  // closing brace for loop namespace
+
+}  // closing brace for policy namespace
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/policy/loop/fwd.hpp b/include/RAJA/policy/loop/fwd.hpp
deleted file mode 100644
index 6fce3a904a..0000000000
--- a/include/RAJA/policy/loop/fwd.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-//
-// Produced at the Lawrence Livermore National Laboratory
-//
-// LLNL-CODE-689114
-//
-// All rights reserved.
-//
-// This file is part of RAJA.
-//
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONLOOP
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file containing RAJA segment template methods for
- *          execution via CUDA kernel launch.
- *
- *          These methods should work on any platform that supports
- *          CUDA devices.
- *
- ******************************************************************************
- */
-
-#ifndef RAJA_forward_loop_HXX
-#define RAJA_forward_loop_HXX
-
-#include <type_traits>
-
-#include "RAJA/config.hpp"
-
-#include "RAJA/policy/loop/policy.hpp"
-
-namespace RAJA
-{
-
-namespace impl
-{
-
-template <typename Func>
-RAJA_INLINE void forall(const loop_exec &,
-                        const PolicyBase &,
-                        const RangeSegment &iter,
-                        Func &&loop_body);
-
-template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const loop_exec &,
-                        const PolicyBase &,
-                        Iterable &&iter,
-                        Func &&loop_body);
-
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const loop_exec &,
-              const PolicyBase &,
-              Iterable &&iter,
-              IndexType icount,
-              Func &&loop_body);
-
-}  // closing brace for impl namespace
-
-}  // closing brace for RAJA namespace
-
-#endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/loop/policy.hpp b/include/RAJA/policy/loop/policy.hpp
index a62dcad878..f68236b79e 100644
--- a/include/RAJA/policy/loop/policy.hpp
+++ b/include/RAJA/policy/loop/policy.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef policy_loop_HPP
-#define policy_loop_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,43 +19,23 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef policy_loop_HPP
+#define policy_loop_HPP
+
 #include "RAJA/policy/PolicyBase.hpp"
 
 #include "RAJA/policy/sequential/policy.hpp"
 
 namespace RAJA
 {
+namespace policy
+{
+namespace loop
+{
 
 //
 //////////////////////////////////////////////////////////////////////
@@ -73,9 +50,9 @@ namespace RAJA
 ///
 
 struct loop_exec : make_policy_pattern_launch_platform_t<Policy::loop,
-                                                        Pattern::forall,
-                                                        Launch::undefined,
-                                                        Platform::host> {
+                                                         Pattern::forall,
+                                                         Launch::undefined,
+                                                         Platform::host> {  // intentionally empty body: the struct adds no members of its own
 };
 
 ///
@@ -92,6 +69,14 @@ using loop_segit = loop_exec;
 ///
 using loop_reduce = seq_reduce;
 
+}  // end namespace loop
+
+}  // end namespace policy
+
+using policy::loop::loop_exec;  // these using-declarations re-export the policy::loop names at RAJA scope
+using policy::loop::loop_segit;
+using policy::loop::loop_reduce;
+
 }  // closing brace for RAJA namespace
 
 #endif
diff --git a/include/RAJA/policy/loop/scan.hpp b/include/RAJA/policy/loop/scan.hpp
index 648d846aa8..3271ea4335 100644
--- a/include/RAJA/policy/loop/scan.hpp
+++ b/include/RAJA/policy/loop/scan.hpp
@@ -8,11 +8,8 @@
 ******************************************************************************
 */
 
-#ifndef RAJA_scan_loop_HPP
-#define RAJA_scan_loop_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONLOOP
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_scan_loop_HPP
+#define RAJA_scan_loop_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/defines.hpp"
diff --git a/include/RAJA/policy/openmp.hpp b/include/RAJA/policy/openmp.hpp
index 635ce02982..94604b8a64 100644
--- a/include/RAJA/policy/openmp.hpp
+++ b/include/RAJA/policy/openmp.hpp
@@ -10,15 +10,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_openmp_HPP
-#define RAJA_openmp_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_OPENMP)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -28,38 +21,18 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 
+#ifndef RAJA_openmp_HPP
+#define RAJA_openmp_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_OPENMP)
+
 #include <omp.h>
 #include <iostream>
 #include <thread>
diff --git a/include/RAJA/policy/openmp/atomic.hpp b/include/RAJA/policy/openmp/atomic.hpp
index eec00a9fa6..6c4bbda5a7 100644
--- a/include/RAJA/policy/openmp/atomic.hpp
+++ b/include/RAJA/policy/openmp/atomic.hpp
@@ -8,18 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_policy_openmp_atomic_HPP
-#define RAJA_policy_openmp_atomic_HPP
-
-#include "RAJA/config.hpp"
-
-// rely on builtin_atomic when OpenMP can't do the job
-#include "RAJA/policy/atomic_builtin.hpp"
-
-#if defined(RAJA_ENABLE_OPENMP)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -29,37 +19,20 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_policy_openmp_atomic_HPP
+#define RAJA_policy_openmp_atomic_HPP
+
+#include "RAJA/config.hpp"
+
+// rely on builtin_atomic when OpenMP can't do the job
+#include "RAJA/policy/atomic_builtin.hpp"
+
+#if defined(RAJA_ENABLE_OPENMP)
+
 #include "RAJA/util/defines.hpp"
 
 
diff --git a/include/RAJA/policy/openmp/forall.hpp b/include/RAJA/policy/openmp/forall.hpp
index 738be3837d..419fd29a61 100644
--- a/include/RAJA/policy/openmp/forall.hpp
+++ b/include/RAJA/policy/openmp/forall.hpp
@@ -11,15 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forall_openmp_HPP
-#define RAJA_forall_openmp_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_OPENMP)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -29,37 +22,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forall_openmp_HPP
+#define RAJA_forall_openmp_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_OPENMP)
+
 #include "RAJA/util/types.hpp"
 
 #include "RAJA/internal/fault_tolerance.hpp"
@@ -70,48 +43,34 @@
 
 #include "RAJA/policy/openmp/policy.hpp"
 
+#include "RAJA/pattern/forall.hpp"
+
 #include <iostream>
-#include <thread>
+#include <type_traits>
 
 #include <omp.h>
 
 namespace RAJA
 {
 
-namespace impl
+namespace policy
+{
+namespace omp
 {
 ///
 /// OpenMP parallel for policy implementation
 ///
 
 template <typename Iterable, typename Func, typename InnerPolicy>
-RAJA_INLINE void forall(const omp_parallel_exec<InnerPolicy>&,
+RAJA_INLINE void forall_impl(const omp_parallel_exec<InnerPolicy>&,  // opens an OpenMP parallel region and runs the InnerPolicy overload inside it
                         Iterable&& iter,
                         Func&& loop_body)
 {
 #pragma omp parallel
   {
-    typename std::remove_reference<decltype(loop_body)>::type body = loop_body;
-    forall<InnerPolicy>(std::forward<Iterable>(iter), std::forward<Func>(body));
-  }
-}
-
-template <typename Iterable,
-          typename IndexType,
-          typename Func,
-          typename InnerPolicy>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const omp_parallel_exec<InnerPolicy>&,
-              Iterable&& iter,
-              IndexType icount,
-              Func&& loop_body)
-{
-#pragma omp parallel
-  {
-    typename std::remove_reference<decltype(loop_body)>::type body = loop_body;
-    forall_Icount<InnerPolicy>(std::forward<Iterable>(iter),
-                               icount,
-                               std::forward<Func>(body));
+    using RAJA::internal::thread_privatize;
+    auto body = thread_privatize(loop_body);  // runs in every thread of the region; presumably wraps a per-thread copy of loop_body (confirm in RAJA::internal)
+    forall_impl(InnerPolicy{}, std::forward<Iterable>(iter), body.get_priv());  // tag-dispatch on InnerPolicy; NOTE(review): iter is forwarded by every thread, so the inner overload must not move from it
   }
 }
 
@@ -120,32 +79,14 @@ forall_Icount(const omp_parallel_exec<InnerPolicy>&,
 ///
 
 template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const omp_for_nowait_exec&,
+RAJA_INLINE void forall_impl(const omp_for_nowait_exec&,  // worksharing only: no parallel region is opened here, so presumably called from inside one (confirm against callers)
                         Iterable&& iter,
                         Func&& loop_body)
 {
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
+  RAJA_EXTRACT_BED_IT(iter);  // macro defined outside this file; must introduce begin_it and distance_it, which the loop below uses
 #pragma omp for nowait
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    loop_body(begin[i]);
-  }
-}
-
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const omp_for_nowait_exec&,
-              Iterable&& iter,
-              IndexType icount,
-              Func&& loop_body)
-{
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-#pragma omp for nowait
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    loop_body(static_cast<IndexType>(i + icount), begin[i]);
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {  // iterations are divided among threads by the pragma above; nowait drops the barrier at loop end
+    loop_body(begin_it[i]);  // loop_body is invoked directly (no per-thread copy is made in this overload)
   }
 }
 
@@ -154,30 +95,12 @@ forall_Icount(const omp_for_nowait_exec&,
 ///
 
 template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const omp_for_exec&, Iterable&& iter, Func&& loop_body)
-{
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-#pragma omp for
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    loop_body(begin[i]);
-  }
-}
-
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const omp_for_exec&,
-              Iterable&& iter,
-              IndexType icount,
-              Func&& loop_body)
+RAJA_INLINE void forall_impl(const omp_for_exec&, Iterable&& iter, Func&& loop_body)  // worksharing only: no parallel region is opened here, so presumably called from inside one (confirm against callers)
 {
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
+  RAJA_EXTRACT_BED_IT(iter);  // macro defined outside this file; must introduce begin_it and distance_it, which the loop below uses
 #pragma omp for
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    loop_body(static_cast<IndexType>(i + icount), begin[i]);
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {  // iterations are divided among threads by the pragma above; no nowait clause, so the default end-of-loop barrier applies
+    loop_body(begin_it[i]);  // loop_body is invoked directly (no per-thread copy is made in this overload)
   }
 }
 
@@ -186,39 +109,17 @@ forall_Icount(const omp_for_exec&,
 ///
 
 template <typename Iterable, typename Func, size_t ChunkSize>
-RAJA_INLINE void forall(const omp_for_static<ChunkSize>&,
+RAJA_INLINE void forall_impl(const omp_for_static<ChunkSize>&,
                         Iterable&& iter,
                         Func&& loop_body)
 {
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp for schedule(static, ChunkSize)
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    loop_body(begin[i]);
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    loop_body(begin_it[i]);
   }
 }
 
-template <typename Iterable,
-          typename IndexType,
-          typename Func,
-          size_t ChunkSize>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const omp_for_static<ChunkSize>&,
-              Iterable&& iter,
-              IndexType icount,
-              Func&& loop_body)
-{
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-#pragma omp for schedule(static, ChunkSize)
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    loop_body(static_cast<IndexType>(i + icount), begin[i]);
-  }
-}
-
-
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -243,7 +144,7 @@ forall_Icount(const omp_for_static<ChunkSize>&,
  */
 
 /*
-* TODO: Fix this!!!
+ * TODO: Fix this!!!
  */
 
 /*
@@ -291,7 +192,9 @@ RAJA_INLINE void forall(
 }
 */
 
-}  // closing brace for impl namespace
+}  // closing brace for omp namespace
+
+}  // closing brace for policy namespace
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/policy/openmp/forallN.hpp b/include/RAJA/policy/openmp/forallN.hpp
index c10808e0ae..2c36d1d0c4 100644
--- a/include/RAJA/policy/openmp/forallN.hpp
+++ b/include/RAJA/policy/openmp/forallN.hpp
@@ -8,15 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forallN_openmp_HPP
-#define RAJA_forallN_openmp_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_OPENMP)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -26,37 +19,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forallN_openmp_HPP
+#define RAJA_forallN_openmp_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_OPENMP)
+
 #include "RAJA/internal/ForallNPolicy.hpp"
 #include "RAJA/policy/openmp/policy.hpp"
 #include "RAJA/util/types.hpp"
diff --git a/include/RAJA/policy/openmp/fwd.hpp b/include/RAJA/policy/openmp/fwd.hpp
deleted file mode 100644
index 93e8699b3c..0000000000
--- a/include/RAJA/policy/openmp/fwd.hpp
+++ /dev/null
@@ -1,125 +0,0 @@
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-//
-// Produced at the Lawrence Livermore National Laboratory
-//
-// LLNL-CODE-689114
-//
-// All rights reserved.
-//
-// This file is part of RAJA.
-//
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file containing RAJA segment template methods for
- *          execution via CUDA kernel launch.
- *
- *          These methods should work on any platform that supports
- *          CUDA devices.
- *
- ******************************************************************************
- */
-
-#ifndef RAJA_forward_openmp_HXX
-#define RAJA_forward_openmp_HXX
-
-#include <type_traits>
-
-#include "RAJA/config.hpp"
-
-#include "RAJA/policy/openmp/policy.hpp"
-
-namespace RAJA
-{
-
-namespace impl
-{
-
-
-template <typename Iterable, typename Func, typename InnerPolicy>
-RAJA_INLINE void forall(const omp_parallel_exec<InnerPolicy>&,
-                        Iterable&&,
-                        Func&&);
-
-template <typename Iterable,
-          typename IndexType,
-          typename Func,
-          typename InnerPolicy>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const omp_parallel_exec<InnerPolicy>&,
-              Iterable&&,
-              IndexType,
-              Func&&);
-///
-/// OpenMP for nowait policy implementation
-///
-
-template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const omp_for_nowait_exec&, Iterable&&, Func&&);
-
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const omp_for_nowait_exec&, Iterable&&, IndexType, Func&&);
-///
-/// OpenMP parallel for policy implementation
-///
-
-template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const omp_for_exec&, Iterable&&, Func&&);
-
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const omp_for_exec&, Iterable&&, IndexType, Func&&);
-///
-/// OpenMP parallel for static policy implementation
-///
-
-template <typename Iterable, typename Func, size_t ChunkSize>
-RAJA_INLINE void forall(const omp_for_static<ChunkSize>&, Iterable&&, Func&&);
-
-template <typename Iterable,
-          typename IndexType,
-          typename Func,
-          size_t ChunkSize>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const omp_for_static<ChunkSize>&, Iterable&&, IndexType, Func&&);
-
-}  // closing brace for impl namespace
-
-}  // closing brace for RAJA namespace
-
-#endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/openmp/policy.hpp b/include/RAJA/policy/openmp/policy.hpp
index a20f91c971..2ed4280b36 100644
--- a/include/RAJA/policy/openmp/policy.hpp
+++ b/include/RAJA/policy/openmp/policy.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef policy_openmp_HPP
-#define policy_openmp_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,43 +19,21 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef policy_openmp_HPP
+#define policy_openmp_HPP
+
 #include "RAJA/policy/PolicyBase.hpp"
 
 #include <type_traits>
 
 namespace RAJA
 {
+namespace policy
+{
 
 namespace omp
 {
@@ -92,7 +67,6 @@ struct Distribute {
 };
 
 #endif
-}
 
 //
 //////////////////////////////////////////////////////////////////////
@@ -202,6 +176,25 @@ struct omp_reduce_ordered
     : make_policy_pattern_t<Policy::openmp, Pattern::reduce, reduce::ordered> {
 };
 
+}  // closing brace for omp namespace
+}  // closing brace for policy namespace
+
+using policy::omp::omp_for_exec;
+using policy::omp::omp_for_nowait_exec;
+using policy::omp::omp_for_static;
+using policy::omp::omp_parallel_exec;
+using policy::omp::omp_parallel_for_exec;
+using policy::omp::omp_parallel_segit;
+using policy::omp::omp_parallel_for_segit;
+using policy::omp::omp_collapse_nowait_exec;
+using policy::omp::omp_reduce;
+using policy::omp::omp_reduce_ordered;
+
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+using policy::omp::omp_target_parallel_for_exec;
+using policy::omp::omp_target_reduce;
+#endif
+
 }  // closing brace for RAJA namespace
 
 
diff --git a/include/RAJA/policy/openmp/reduce.hpp b/include/RAJA/policy/openmp/reduce.hpp
index 929557e31e..b146fe091f 100644
--- a/include/RAJA/policy/openmp/reduce.hpp
+++ b/include/RAJA/policy/openmp/reduce.hpp
@@ -14,11 +14,11 @@
 #ifndef RAJA_omp_reduce_HPP
 #define RAJA_omp_reduce_HPP
 
+#include "RAJA/config.hpp"
 #if defined(RAJA_ENABLE_OPENMP)
 
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -28,39 +28,10 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-#include "RAJA/config.hpp"
-
 #include "RAJA/util/types.hpp"
 
 #include "RAJA/pattern/detail/reduce.hpp"
@@ -158,6 +129,6 @@ RAJA_DECLARE_ALL_REDUCERS(omp_reduce_ordered, detail::ReduceOMPOrdered)
 
 }  // closing brace for RAJA namespace
 
-#endif  // closing endif for RAJA_ENABLE_OPENMP guard
+#endif // closing endif for RAJA_ENABLE_OPENMP guard
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/openmp/scan.hpp b/include/RAJA/policy/openmp/scan.hpp
index 32955e7673..773265e7c8 100644
--- a/include/RAJA/policy/openmp/scan.hpp
+++ b/include/RAJA/policy/openmp/scan.hpp
@@ -8,11 +8,8 @@
 ******************************************************************************
 */
 
-#ifndef RAJA_scan_openmp_HPP
-#define RAJA_scan_openmp_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,39 +19,15 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 #include "RAJA/config.hpp"
 
+#ifndef RAJA_scan_openmp_HPP
+#define RAJA_scan_openmp_HPP
+
 #include "RAJA/policy/openmp/policy.hpp"
 #include "RAJA/policy/sequential/scan.hpp"
 
diff --git a/include/RAJA/policy/openmp/target_forall.hpp b/include/RAJA/policy/openmp/target_forall.hpp
index 7c789bd0f9..138621e4b9 100644
--- a/include/RAJA/policy/openmp/target_forall.hpp
+++ b/include/RAJA/policy/openmp/target_forall.hpp
@@ -11,15 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_target_forall_openmp_HXX
-#define RAJA_target_forall_openmp_HXX
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_TARGET_OPENMP)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -29,37 +22,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_target_forall_openmp_HXX
+#define RAJA_target_forall_openmp_HXX
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+
 #include "RAJA/util/types.hpp"
 
 #include "RAJA/policy/openmp/policy.hpp"
@@ -94,29 +67,10 @@ RAJA_INLINE void forall(const omp_target_parallel_for_exec<Teams>&,
   }
 }
 
-template <size_t Teams, typename Iterable, typename Func>
-RAJA_INLINE void forall_Icount(const omp_target_parallel_for_exec<Teams>&,
-                               Iterable&& iter,
-                               Index_type icount,
-                               Func&& loop_body)
-{
-  using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-#pragma omp target teams distribute parallel for num_teams(Teams) \
-    schedule(static, 1) map(to : body)
-  for (Index_type i = 0; i < distance; ++i) {
-    Body ib = body;
-    ib(i + icount, begin[i]);
-  }
-}
-
 }  // closing brace for impl namespace
 
 }  // closing brace for RAJA namespace
 
-#endif  // closing endif for if defined(RAJA_TARGET_ENABLE_OPENMP)
+#endif  // closing endif for if defined(RAJA_ENABLE_TARGET_OPENMP)
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/openmp/target_reduce.hpp b/include/RAJA/policy/openmp/target_reduce.hpp
index 3da245cd19..57a56dccc3 100644
--- a/include/RAJA/policy/openmp/target_reduce.hpp
+++ b/include/RAJA/policy/openmp/target_reduce.hpp
@@ -11,15 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_target_reduce_omp_HXX
-#define RAJA_target_reduce_omp_HXX
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_TARGET_OPENMP)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -29,37 +22,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_omp_target_reduce_HPP
+#define RAJA_omp_target_reduce_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+
 #include "RAJA/util/types.hpp"
 
 #include "RAJA/pattern/reduce.hpp"
diff --git a/include/RAJA/policy/sequential.hpp b/include/RAJA/policy/sequential.hpp
index 25294bc1ef..3322dd56a1 100644
--- a/include/RAJA/policy/sequential.hpp
+++ b/include/RAJA/policy/sequential.hpp
@@ -10,11 +10,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_sequential_HPP
-#define RAJA_sequential_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -24,37 +21,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_sequential_HPP
+#define RAJA_sequential_HPP
+
 #include "RAJA/policy/sequential/atomic.hpp"
 #include "RAJA/policy/sequential/forall.hpp"
 #include "RAJA/policy/sequential/policy.hpp"
diff --git a/include/RAJA/policy/sequential/atomic.hpp b/include/RAJA/policy/sequential/atomic.hpp
index f6ef91af0b..a70035859d 100644
--- a/include/RAJA/policy/sequential/atomic.hpp
+++ b/include/RAJA/policy/sequential/atomic.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_policy_sequential_atomic_HPP
-#define RAJA_policy_sequential_atomic_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_policy_sequential_atomic_HPP
+#define RAJA_policy_sequential_atomic_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/defines.hpp"
 
diff --git a/include/RAJA/policy/sequential/forall.hpp b/include/RAJA/policy/sequential/forall.hpp
index e1190a9851..c35f81e851 100644
--- a/include/RAJA/policy/sequential/forall.hpp
+++ b/include/RAJA/policy/sequential/forall.hpp
@@ -7,17 +7,14 @@
  *          template methods for sequential execution.
  *
  *          These methods should work on any platform.
- *          
+ *
  *          Note: GNU compiler does not enforce sequential iterations.
  *
  ******************************************************************************
  */
 
-#ifndef RAJA_forall_sequential_HPP
-#define RAJA_forall_sequential_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -27,54 +24,28 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forall_sequential_HPP
+#define RAJA_forall_sequential_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/types.hpp"
 
 #include "RAJA/policy/sequential/policy.hpp"
 
-#include "RAJA/index/ListSegment.hpp"
-#include "RAJA/index/RangeSegment.hpp"
-
 #include "RAJA/internal/fault_tolerance.hpp"
 
-using RAJA::concepts::enable_if;
+#include "RAJA/pattern/detail/forall.hpp"
 
 namespace RAJA
 {
-
-namespace impl
+namespace policy
+{
+namespace sequential
 {
 
 
@@ -89,33 +60,19 @@ namespace impl
 //
 
 template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const seq_exec &, Iterable &&iter, Func &&body)
+RAJA_INLINE void forall_impl(const seq_exec &, Iterable &&iter, Func &&body)
 {
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
+  RAJA_EXTRACT_BED_IT(iter);
 
   RAJA_NO_SIMD
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    body(*(begin + i));
+  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    body(*(begin_it + i));
   }
 }
 
-template <typename Iterable, typename Func, typename IndexType>
-RAJA_INLINE concepts::enable_if<type_traits::is_integral<IndexType>>
-forall_Icount(const seq_exec &, Iterable &&iter, IndexType icount, Func &&body)
-{
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-
-  RAJA_NO_SIMD
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    body(static_cast<IndexType>(i + icount), *(begin + i));
-  }
-}
+}  // closing brace for sequential namespace
 
-}  // closing brace for impl namespace
+}  // closing brace for policy namespace
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp
index 25a4594c63..659cdeeec8 100644
--- a/include/RAJA/policy/sequential/policy.hpp
+++ b/include/RAJA/policy/sequential/policy.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef policy_sequential_HPP
-#define policy_sequential_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,41 +19,21 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef policy_sequential_HPP
+#define policy_sequential_HPP
+
 #include "RAJA/policy/PolicyBase.hpp"
 
 namespace RAJA
 {
+namespace policy
+{
+namespace sequential
+{
 
 //
 //////////////////////////////////////////////////////////////////////
@@ -93,6 +70,12 @@ struct seq_reduce : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                           Launch::undefined,
                                                           Platform::host> {
 };
+} // end namespace sequential
+} // end namespace policy
+
+using policy::sequential::seq_exec;
+using policy::sequential::seq_segit;
+using policy::sequential::seq_reduce;
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/policy/sequential/reduce.hpp b/include/RAJA/policy/sequential/reduce.hpp
index ff1a269f0c..8b6d96bf4c 100644
--- a/include/RAJA/policy/sequential/reduce.hpp
+++ b/include/RAJA/policy/sequential/reduce.hpp
@@ -11,11 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_reduce_sequential_HPP
-#define RAJA_reduce_sequential_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -25,37 +22,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_sequential_reduce_HPP
+#define RAJA_sequential_reduce_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/internal/MemUtils_CPU.hpp"
diff --git a/include/RAJA/policy/sequential/scan.hpp b/include/RAJA/policy/sequential/scan.hpp
index 683badbc11..ac1a12661b 100644
--- a/include/RAJA/policy/sequential/scan.hpp
+++ b/include/RAJA/policy/sequential/scan.hpp
@@ -8,11 +8,8 @@
 ******************************************************************************
 */
 
-#ifndef RAJA_scan_sequential_HPP
-#define RAJA_scan_sequential_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_scan_sequential_HPP
+#define RAJA_scan_sequential_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/defines.hpp"
diff --git a/include/RAJA/policy/simd.hpp b/include/RAJA/policy/simd.hpp
index a8d30789a9..dda6b3e238 100644
--- a/include/RAJA/policy/simd.hpp
+++ b/include/RAJA/policy/simd.hpp
@@ -10,11 +10,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_simd_HPP
-#define RAJA_simd_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -24,37 +21,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_simd_HPP
+#define RAJA_simd_HPP
+
 
 #include "RAJA/policy/simd/forall.hpp"
 #include "RAJA/policy/simd/policy.hpp"
diff --git a/include/RAJA/policy/simd/forall.hpp b/include/RAJA/policy/simd/forall.hpp
index a8022113a6..18b983f6b7 100644
--- a/include/RAJA/policy/simd/forall.hpp
+++ b/include/RAJA/policy/simd/forall.hpp
@@ -16,11 +16,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forall_simd_HPP
-#define RAJA_forall_simd_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -30,37 +27,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forall_simd_HPP
+#define RAJA_forall_simd_HPP
+
 #include <iterator>
 #include <type_traits>
 
@@ -74,14 +47,15 @@
 
 namespace RAJA
 {
-
-namespace impl
+namespace policy
+{
+namespace simd
 {
 
 
 template <typename Iterable, typename Func>
 RAJA_INLINE void
-forall(const simd_exec &, Iterable &&iter, Func &&loop_body)
+forall_impl(const simd_exec &, Iterable &&iter, Func &&loop_body)
 {
   auto begin = std::begin(iter);
   auto end = std::end(iter);
@@ -92,24 +66,9 @@ forall(const simd_exec &, Iterable &&iter, Func &&loop_body)
   }
 }
 
-// SIMD forall(Iterable)
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE void
-forall_Icount(const simd_exec &,
-              Iterable &&iter,
-              IndexType icount,
-              Func &&loop_body)
-{
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-  RAJA_SIMD
-  for (decltype(distance) i = 0; i < distance; ++i) {
-    loop_body(static_cast<IndexType>(i + icount), *(begin + i));
-  }
-}
+}  // closing brace for simd namespace
 
-}  // closing brace for impl namespace
+}  // closing brace for policy namespace
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp
index b8514157b0..978c82ea8e 100644
--- a/include/RAJA/policy/simd/policy.hpp
+++ b/include/RAJA/policy/simd/policy.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef policy_simd_HPP
-#define policy_simd_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef policy_simd_HPP
+#define policy_simd_HPP
+
 #include "RAJA/policy/PolicyBase.hpp"
 
 //
@@ -68,13 +41,23 @@
 ///
 namespace RAJA
 {
+namespace policy
+{
+namespace simd
+{
 
 struct simd_exec : make_policy_pattern_launch_platform_t<Policy::sequential,
-                                                          Pattern::forall,
-                                                          Launch::undefined,
-                                                          Platform::host> {
+                                                         Pattern::forall,
+                                                         Launch::undefined,
+                                                         Platform::host> {
 };
 
+}  // end of namespace simd
+
+}  // end of namespace policy
+
+using policy::simd::simd_exec;
+
 }  // end of namespace RAJA
 
 #endif
diff --git a/include/RAJA/policy/tbb.hpp b/include/RAJA/policy/tbb.hpp
index da667f1009..daac6506f4 100644
--- a/include/RAJA/policy/tbb.hpp
+++ b/include/RAJA/policy/tbb.hpp
@@ -5,16 +5,11 @@
  *
  * \brief   Header file containing RAJA headers for tbb execution.
  *
- *          These methods work on all platforms.
- *
  ******************************************************************************
  */
 
-#ifndef RAJA_tbb_HPP
-#define RAJA_tbb_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -24,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONtbb
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_tbb_HPP
+#define RAJA_tbb_HPP
+
 #include "RAJA/config.hpp"
 
 #if defined(RAJA_ENABLE_TBB)
diff --git a/include/RAJA/policy/tbb/forall.hpp b/include/RAJA/policy/tbb/forall.hpp
index 3e840ee03a..21e3bcbb23 100644
--- a/include/RAJA/policy/tbb/forall.hpp
+++ b/include/RAJA/policy/tbb/forall.hpp
@@ -11,15 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forall_tbb_HPP
-#define RAJA_forall_tbb_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_TBB)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -29,39 +22,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forall_tbb_HPP
+#define RAJA_forall_tbb_HPP
+
 #include "RAJA/config.hpp"
 
+#if defined(RAJA_ENABLE_TBB)
+
 #include "RAJA/util/types.hpp"
 
 #include "RAJA/policy/tbb/policy.hpp"
@@ -72,6 +43,8 @@
 
 #include "RAJA/internal/fault_tolerance.hpp"
 
+#include "RAJA/pattern/forall.hpp"
+
 #include <tbb/tbb.h>
 
 
@@ -85,7 +58,9 @@ using tbb_static_partitioner = tbb::static_partitioner;
 using tbb_static_partitioner = tbb::auto_partitioner;
 #endif
 
-namespace impl
+namespace policy
+{
+namespace tbb
 {
 
 
@@ -105,46 +80,34 @@ namespace impl
  * stealing at the cost of initial start-up overhead for a top-level loop.
  */
 template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const tbb_for_dynamic& p,
+RAJA_INLINE void forall_impl(const tbb_for_dynamic& p,
                         Iterable&& iter,
                         Func&& loop_body)
 {
-  using brange = tbb::blocked_range<decltype(iter.begin())>;
-  tbb::parallel_for(brange(std::begin(iter), std::end(iter), p.grain_size),
+  using std::begin;
+  using std::end;
+  using brange = ::tbb::blocked_range<decltype(iter.begin())>;
+  ::tbb::parallel_for(brange(begin(iter), end(iter), p.grain_size),
                     [=](const brange& r) {
+                      using RAJA::internal::thread_privatize;
+                      auto privatizer = thread_privatize(loop_body);
+                      auto body = privatizer.get_priv();
                       for (const auto& i : r)
-                        loop_body(i);
+                        body(i);
                     });
 }
 
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const tbb_for_dynamic& p,
-              Iterable&& iter,
-              IndexType icount,
-              Func&& loop_body)
-{
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-  using brange = tbb::blocked_range<decltype(distance)>;
-  tbb::parallel_for(brange(0, distance, p.grain_size), [=](const brange& r) {
-    for (decltype(distance) i = r.begin(); i != r.end(); ++i)
-      loop_body(static_cast<IndexType>(i + icount), begin[i]);
-  });
-}
-
 ///
 /// TBB parallel for static policy implementation
 ///
 
-/** 
+/**
  * @brief TBB static for implementation
- * 
+ *
  * @param tbb_for_static tbb tag
  * @param iter any iterable
  * @param loop_body loop body
- * 
+ *
  * @return None
  *
  * This forall implements a TBB parallel_for loop over the specified iterable
@@ -155,43 +118,26 @@ forall_Icount(const tbb_for_dynamic& p,
  * correctnes requires the per-thread mapping, you *must* use TBB 2017 or newer
  */
 template <typename Iterable, typename Func, size_t ChunkSize>
-RAJA_INLINE void forall(const tbb_for_static<ChunkSize>&,
+RAJA_INLINE void forall_impl(const tbb_for_static<ChunkSize>&,
                         Iterable&& iter,
                         Func&& loop_body)
 {
-  using brange = tbb::blocked_range<decltype(iter.begin())>;
-  tbb::parallel_for(brange(std::begin(iter), std::end(iter), ChunkSize),
+  using std::begin;
+  using std::end;
+  using brange = ::tbb::blocked_range<decltype(iter.begin())>;
+  ::tbb::parallel_for(brange(begin(iter), end(iter), ChunkSize),
                     [=](const brange& r) {
+                      using RAJA::internal::thread_privatize;
+                      auto privatizer = thread_privatize(loop_body);
+                      auto body = privatizer.get_priv();
                       for (const auto& i : r)
-                        loop_body(i);
-                    },
-                    tbb_static_partitioner{});
-}
-
-template <typename Iterable,
-          typename IndexType,
-          typename Func,
-          size_t ChunkSize>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const tbb_for_static<ChunkSize>&,
-              Iterable&& iter,
-              IndexType icount,
-              Func&& loop_body)
-{
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
-  auto distance = std::distance(begin, end);
-  using brange = tbb::blocked_range<decltype(distance)>;
-  tbb::parallel_for(brange(0, distance, ChunkSize),
-                    [=](const brange& r) {
-                      for (decltype(distance) i = r.begin(); i != r.end(); ++i)
-                        loop_body(static_cast<IndexType>(i + icount), begin[i]);
+                        body(i);
                     },
                     tbb_static_partitioner{});
 }
 
-
-}  // closing brace for impl namespace
+}  // closing brace for tbb namespace
+}  // closing brace for policy namespace
 
 }  // closing brace for RAJA namespace
 
diff --git a/include/RAJA/policy/tbb/forallN.hpp b/include/RAJA/policy/tbb/forallN.hpp
index e153e6e311..f480fc160a 100644
--- a/include/RAJA/policy/tbb/forallN.hpp
+++ b/include/RAJA/policy/tbb/forallN.hpp
@@ -8,15 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_forallN_tbb_HPP
-#define RAJA_forallN_tbb_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_TBB)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -26,37 +19,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_forallN_tbb_HPP
+#define RAJA_forallN_tbb_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_TBB)
+
 #include "RAJA/internal/ForallNPolicy.hpp"
 #include "RAJA/policy/tbb/policy.hpp"
 #include "RAJA/util/types.hpp"
diff --git a/include/RAJA/policy/tbb/policy.hpp b/include/RAJA/policy/tbb/policy.hpp
index 0c40a9d561..198f3c49d6 100644
--- a/include/RAJA/policy/tbb/policy.hpp
+++ b/include/RAJA/policy/tbb/policy.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef policy_tbb_HPP
-#define policy_tbb_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,43 +19,23 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef policy_tbb_HPP
+#define policy_tbb_HPP
+
 #include "RAJA/policy/PolicyBase.hpp"
 
 #include <cstddef>
 
 namespace RAJA
 {
+namespace policy
+{
+namespace tbb
+{
 
 //
 //////////////////////////////////////////////////////////////////////
@@ -110,6 +87,15 @@ struct tbb_reduce : make_policy_pattern_launch_platform_t<Policy::tbb,
                                                           Platform::host> {
 };
 
+}  // closing brace for tbb
+}  // closing brace for policy
+
+using policy::tbb::tbb_for_exec;
+using policy::tbb::tbb_for_static;
+using policy::tbb::tbb_for_dynamic;
+using policy::tbb::tbb_segit;
+using policy::tbb::tbb_reduce;
+
 }  // closing brace for RAJA namespace
 
 #endif
diff --git a/include/RAJA/policy/tbb/reduce.hpp b/include/RAJA/policy/tbb/reduce.hpp
index a9a450c4a6..90ffc8fe05 100644
--- a/include/RAJA/policy/tbb/reduce.hpp
+++ b/include/RAJA/policy/tbb/reduce.hpp
@@ -11,15 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_tbb_reduce_HPP
-#define RAJA_tbb_reduce_HPP
-
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_TBB)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -29,37 +22,17 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_tbb_reduce_HPP
+#define RAJA_tbb_reduce_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_TBB)
+
 #include "RAJA/internal/MemUtils_CPU.hpp"
 #include "RAJA/pattern/detail/reduce.hpp"
 #include "RAJA/pattern/reduce.hpp"
diff --git a/include/RAJA/policy/tbb/scan.hpp b/include/RAJA/policy/tbb/scan.hpp
index cef1b7a27b..f0822316fe 100644
--- a/include/RAJA/policy/tbb/scan.hpp
+++ b/include/RAJA/policy/tbb/scan.hpp
@@ -8,11 +8,8 @@
 ******************************************************************************
 */
 
-#ifndef RAJA_scan_tbb_HPP
-#define RAJA_scan_tbb_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_scan_tbb_HPP
+#define RAJA_scan_tbb_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/defines.hpp"
diff --git a/include/RAJA/util/Layout.hpp b/include/RAJA/util/Layout.hpp
index 3144b97177..923b4d6bee 100644
--- a/include/RAJA/util/Layout.hpp
+++ b/include/RAJA/util/Layout.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_LAYOUT_HPP
-#define RAJA_LAYOUT_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -53,25 +50,51 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_LAYOUT_HPP
+#define RAJA_LAYOUT_HPP
+
 #include <iostream>
 #include <limits>
 #include "RAJA/config.hpp"
 #include "RAJA/index/IndexValue.hpp"
 #include "RAJA/internal/LegacyCompatibility.hpp"
+#include "RAJA/util/Operators.hpp"
 #include "RAJA/util/Permutations.hpp"
 
 namespace RAJA
 {
 
+namespace detail
+{
 template <typename Range, typename IdxLin = Index_type>
-struct LayoutBase_impl {
+struct LayoutBase_impl;
+
+/*!
+ * Helper functor: scales cur_stride by each non-zero sizes[k], k in [j, n_dims)
+ */
+
+template <size_t j, size_t n_dims, typename IdxLin = Index_type>
+struct stride_calculator {
+  constexpr IdxLin operator()(IdxLin cur_stride,
+                              IdxLin const (&sizes)[n_dims]) const
+  {
+    return stride_calculator<j + 1, n_dims, IdxLin>{}(
+        cur_stride * (sizes[j] ? sizes[j] : 1), sizes);
+  }
+};
+template <size_t n_dims, typename IdxLin>
+struct stride_calculator<n_dims, n_dims, IdxLin> {
+  constexpr IdxLin operator()(IdxLin cur_stride, IdxLin const (&)[n_dims]) const
+  {
+    return cur_stride;
+  }
 };
 
-template <size_t... RangeInts, typename IdxLin>
-struct LayoutBase_impl<VarOps::index_sequence<RangeInts...>, IdxLin> {
+template <camp::idx_t... RangeInts, typename IdxLin>
+struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin> {
 public:
   typedef IdxLin IndexLinear;
-  typedef VarOps::make_index_sequence<sizeof...(RangeInts)> IndexRange;
+  typedef camp::make_idx_seq_t<sizeof...(RangeInts)> IndexRange;
 
   static constexpr size_t n_dims = sizeof...(RangeInts);
   static constexpr size_t limit = RAJA::operators::limits<IdxLin>::max();
@@ -84,45 +107,29 @@ struct LayoutBase_impl<VarOps::index_sequence<RangeInts...>, IdxLin> {
   IdxLin inv_mods[n_dims];
 
 
-  /*!
-   * Helper function to compute the strides
-   */
-
-
   /*!
    * Default constructor with zero sizes and strides.
    */
-  RAJA_INLINE RAJA_HOST_DEVICE LayoutBase_impl()
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr LayoutBase_impl()  // divisors stay 1
+      : sizes{0}, strides{0}, inv_strides{IdxLin(RangeInts * 0 + 1)...}, inv_mods{IdxLin(RangeInts * 0 + 1)...}
   {
-    for (size_t i = 0; i < n_dims; ++i) {
-      sizes[i] = strides[i] = 0;
-      inv_strides[i] = inv_mods[i] = 1;
-    }
   }
 
   /*!
    * Construct a layout given the size of each dimension.
-   *
-   * @todo this should be constexpr in c++14 mode
    */
   template <typename... Types>
-  RAJA_INLINE RAJA_HOST_DEVICE LayoutBase_impl(Types... ns)
-      : sizes{convertIndex<IdxLin>(ns)...}
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr LayoutBase_impl(Types... ns)
+      : sizes{convertIndex<IdxLin>(ns)...},
+        strides{(detail::stride_calculator<RangeInts + 1, n_dims, IdxLin>{}(
+            sizes[RangeInts] ? 1 : 0,
+            sizes))...},
+        inv_strides{(strides[RangeInts] ? strides[RangeInts] : 1)...},
+        inv_mods{(sizes[RangeInts] ? sizes[RangeInts] : 1)...}
   {
     static_assert(n_dims == sizeof...(Types),
                   "number of dimensions must "
                   "match");
-    for (size_t i = 0; i < n_dims; i++) {
-      // If the size of dimension i is zero, then the stride is zero
-      strides[i] = sizes[i] ? 1 : 0;
-
-      for (size_t j = i + 1; j < n_dims; j++) {
-        // only take product of non-zero sizes
-        strides[i] *= sizes[j] ? sizes[j] : 1;
-      }
-    }
-
-    computeInverse();
   }
 
   /*!
@@ -142,11 +149,14 @@ struct LayoutBase_impl<VarOps::index_sequence<RangeInts...>, IdxLin> {
    *  Construct a Layout given the size and stride of each dimension
    */
   template <typename... Types>
-  RAJA_INLINE LayoutBase_impl(const std::array<IdxLin, n_dims> &sizes_in,
-                              const std::array<IdxLin, n_dims> &strides_in)
-      : sizes{sizes_in[RangeInts]...}, strides{strides_in[RangeInts]...}
+  RAJA_INLINE constexpr LayoutBase_impl(
+      const std::array<IdxLin, n_dims> &sizes_in,
+      const std::array<IdxLin, n_dims> &strides_in)
+      : sizes{sizes_in[RangeInts]...},
+        strides{strides_in[RangeInts]...},
+        inv_strides{(strides[RangeInts] ? strides[RangeInts] : 1)...},
+        inv_mods{(sizes[RangeInts] ? sizes[RangeInts] : 1)...}
   {
-    computeInverse();
   }
 
 
@@ -182,36 +192,13 @@ struct LayoutBase_impl<VarOps::index_sequence<RangeInts...>, IdxLin> {
     VarOps::ignore_args((indices = (linear_index / inv_strides[RangeInts])
                                    % inv_mods[RangeInts])...);
   }
-
-private:
-  /*!
-   * @internal
-   *
-   * Computes the inverse mapping used by toIndices given the forward mapping
-   * described by strides[] and sizes[]
-   */
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  void computeInverse()
-  {
-    // Inverse strides and mods map directly from strides and sizes,
-    // except when a size (or stride) is zero for a projective layout.
-    // In this case, having a stride and size of 1 will ensure that
-    // toIndices for that dimension is always 0
-    for (size_t i = 0; i < n_dims; i++) {
-      inv_strides[i] = strides[i] ? strides[i] : 1;
-      inv_mods[i] = sizes[i] ? sizes[i] : 1;
-    }
-  }
 };
 
-template <size_t... RangeInts, typename IdxLin>
-constexpr size_t
-    LayoutBase_impl<VarOps::index_sequence<RangeInts...>, IdxLin>::n_dims;
-template <size_t... RangeInts, typename IdxLin>
-constexpr size_t
-    LayoutBase_impl<VarOps::index_sequence<RangeInts...>, IdxLin>::limit;
-
+template <camp::idx_t... RangeInts, typename IdxLin>
+constexpr size_t LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin>::n_dims;
+template <camp::idx_t... RangeInts, typename IdxLin>
+constexpr size_t LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin>::limit;
+}
 
 /*!
  * @brief A mapping of n-dimensional index space to a linear index space.
@@ -263,7 +250,7 @@ constexpr size_t
  *
  */
 template <size_t n_dims, typename IdxLin = Index_type>
-using Layout = LayoutBase_impl<VarOps::make_index_sequence<n_dims>, IdxLin>;
+using Layout = detail::LayoutBase_impl<camp::make_idx_seq_t<n_dims>, IdxLin>;
 
 template <typename IdxLin, typename... DimTypes>
 struct TypedLayout : public Layout<sizeof...(DimTypes), Index_type> {
@@ -305,7 +292,7 @@ struct TypedLayout : public Layout<sizeof...(DimTypes), Index_type> {
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
                                               Indices &... indices) const
   {
-    toIndicesHelper(VarOps::make_index_sequence<sizeof...(DimTypes)>{},
+    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)>{},
                     std::forward<IdxLin>(linear_index),
                     std::forward<Indices &>(indices)...);
   }
@@ -318,11 +305,10 @@ struct TypedLayout : public Layout<sizeof...(DimTypes), Index_type> {
    * result to typed indices
    *
    */
-  template <typename... Indices, size_t... RangeInts>
-  RAJA_INLINE RAJA_HOST_DEVICE void toIndicesHelper(
-      VarOps::index_sequence<RangeInts...>,
-      IdxLin linear_index,
-      Indices &... indices) const
+  template <typename... Indices, camp::idx_t... RangeInts>
+  RAJA_INLINE RAJA_HOST_DEVICE void toIndicesHelper(camp::idx_seq<RangeInts...>,
+                                                    IdxLin linear_index,
+                                                    Indices &... indices) const
   {
     Index_type locals[sizeof...(DimTypes)];
     Base::toIndices(convertIndex<Index_type>(linear_index),
diff --git a/include/RAJA/util/OffsetLayout.hpp b/include/RAJA/util/OffsetLayout.hpp
index 51a98e1e04..afed3b9516 100644
--- a/include/RAJA/util/OffsetLayout.hpp
+++ b/include/RAJA/util/OffsetLayout.hpp
@@ -9,11 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_OFFSETLAYOUT_HPP
-#define RAJA_OFFSETLAYOUT_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -54,8 +51,8 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-#include <iostream>
-#include <limits>
+#ifndef RAJA_OFFSETLAYOUT_HPP
+#define RAJA_OFFSETLAYOUT_HPP
 
 #include "RAJA/config.hpp"
 #include "RAJA/index/IndexValue.hpp"
@@ -63,6 +60,10 @@
 #include "RAJA/util/Permutations.hpp"
 #include "RAJA/util/PermutedLayout.hpp"
 
+#include <array>
+#include <limits>
+#include "camp/camp.hpp"
+
 namespace RAJA
 {
 
@@ -72,10 +73,10 @@ namespace internal
 template <typename Range, typename IdxLin>
 struct OffsetLayout_impl;
 
-template <size_t... RangeInts, typename IdxLin>
-struct OffsetLayout_impl<VarOps::index_sequence<RangeInts...>, IdxLin> {
-  using IndexRange = VarOps::index_sequence<RangeInts...>;
-  using Base = LayoutBase_impl<IndexRange, IdxLin>;
+template <camp::idx_t... RangeInts, typename IdxLin>
+struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
+  using IndexRange = camp::idx_seq<RangeInts...>;
+  using Base = detail::LayoutBase_impl<IndexRange, IdxLin>;
   Base base_;
 
   IdxLin offsets[sizeof...(RangeInts)];
@@ -100,14 +101,15 @@ struct OffsetLayout_impl<VarOps::index_sequence<RangeInts...>, IdxLin> {
       const std::array<IdxLin, sizeof...(RangeInts)>& offsets_in,
       const Layout<sizeof...(RangeInts), IdxLin>& rhs)
   {
-    return internal::OffsetLayout_impl<IndexRange, IdxLin>(offsets_in, rhs);
+    OffsetLayout_impl ret{rhs};
+    VarOps::ignore_args((ret.offsets[RangeInts] = offsets_in[RangeInts])...);
+    return ret;
   }
 
 private:
-  constexpr RAJA_INLINE OffsetLayout_impl(
-      const std::array<IdxLin, sizeof...(RangeInts)>& offsets_in,
-      const Layout<sizeof...(RangeInts), IdxLin>& rhs)
-      : base_{rhs}, offsets{offsets_in[RangeInts]...}
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE
+  OffsetLayout_impl(const Layout<sizeof...(RangeInts), IdxLin>& rhs)
+      : base_{rhs}, offsets{}  // zeroed here; the factory fills them in
   {
   }
 };
@@ -116,17 +118,16 @@ struct OffsetLayout_impl<VarOps::index_sequence<RangeInts...>, IdxLin> {
 
 template <size_t n_dims = 1, typename IdxLin = Index_type>
 struct OffsetLayout
-    : public internal::OffsetLayout_impl<VarOps::make_index_sequence<n_dims>,
-                                         IdxLin> {
+    : public internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin> {
   using parent =
-      internal::OffsetLayout_impl<VarOps::make_index_sequence<n_dims>, IdxLin>;
+      internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin>;
 
-  using internal::OffsetLayout_impl<VarOps::make_index_sequence<n_dims>,
+  using internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>,
                                     IdxLin>::OffsetLayout_impl;
 
   constexpr RAJA_INLINE RAJA_HOST_DEVICE OffsetLayout(
-      const internal::OffsetLayout_impl<VarOps::make_index_sequence<n_dims>,
-                                        IdxLin>& rhs)
+      const internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin>&
+          rhs)
       : parent{rhs}
   {
   }
@@ -143,15 +144,14 @@ auto make_offset_layout(const std::array<IdxLin, n_dims>& lower,
 template <size_t Rank, typename IdxLin = Index_type>
 auto make_permuted_offset_layout(const std::array<IdxLin, Rank>& lower,
                                  const std::array<IdxLin, Rank>& upper,
-                                 const std::array<size_t, Rank>& permutation)
+                                 const std::array<IdxLin, Rank>& permutation)
     -> decltype(make_offset_layout<Rank, IdxLin>(lower, upper))
 {
   std::array<IdxLin, Rank> sizes;
   for (size_t i = 0; i < Rank; ++i) {
     sizes[i] = upper[i] - lower[i] + 1;
   }
-  return internal::OffsetLayout_impl<VarOps::make_index_sequence<Rank>,
-                                     IdxLin>::
+  return internal::OffsetLayout_impl<camp::make_idx_seq_t<Rank>, IdxLin>::
       from_layout_and_offsets(lower, make_permuted_layout(sizes, permutation));
 }
 
diff --git a/include/RAJA/util/Operators.hpp b/include/RAJA/util/Operators.hpp
index 2c4f701046..b172d1eb65 100644
--- a/include/RAJA/util/Operators.hpp
+++ b/include/RAJA/util/Operators.hpp
@@ -11,11 +11,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_operators_HPP
-#define RAJA_operators_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -56,6 +53,9 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_operators_HPP
+#define RAJA_operators_HPP
+
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/defines.hpp"
@@ -65,6 +65,11 @@
 #include <cfloat>
 #include <cstdint>
 #include <type_traits>
+#include <stdint.h>
+
+#ifdef RAJA_CHECK_LIMITS
+#include <limits>
+#endif
 
 namespace RAJA
 {
@@ -307,7 +312,6 @@ struct limits : public std::conditional<
 };
 
 #ifdef RAJA_CHECK_LIMITS
-#include <limits>
 template <typename T>
 constexpr bool check()
 {
@@ -339,7 +343,7 @@ struct plus : public detail::binary_function<Arg1, Arg2, Ret>,
   {
     return Ret{lhs} + rhs;
   }
-  static constexpr Ret identity() { return Ret{0}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{0}; }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
@@ -357,7 +361,7 @@ struct multiplies : public detail::binary_function<Arg1, Arg2, Ret>,
   {
     return Ret{lhs} * rhs;
   }
-  static constexpr Ret identity() { return Ret{1}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{1}; }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
@@ -385,7 +389,7 @@ struct logical_and : public detail::comparison_function<Arg1, Arg2>,
   {
     return lhs && rhs;
   }
-  static constexpr bool identity() { return true; }
+  RAJA_HOST_DEVICE static constexpr bool identity() { return true; }
 };
 
 template <typename Arg1, typename Arg2 = Arg1>
@@ -395,7 +399,7 @@ struct logical_or : public detail::comparison_function<Arg1, Arg2>,
   {
     return lhs || rhs;
   }
-  static constexpr bool identity() { return false; }
+  RAJA_HOST_DEVICE static constexpr bool identity() { return false; }
 };
 
 template <typename T>
@@ -438,7 +442,10 @@ struct minimum : public detail::binary_function<Arg1, Arg2, Ret>,
   {
     return (lhs < rhs) ? lhs : rhs;
   }
-  static constexpr Ret identity() { return limits<Ret>::max(); }
+  RAJA_HOST_DEVICE static constexpr Ret identity()
+  {
+    return limits<Ret>::max();
+  }
 };
 
 template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
@@ -448,7 +455,10 @@ struct maximum : public detail::binary_function<Arg1, Arg2, Ret>,
   {
     return (lhs < rhs) ? rhs : lhs;
   }
-  static constexpr Ret identity() { return limits<Ret>::min(); }
+  RAJA_HOST_DEVICE static constexpr Ret identity()
+  {
+    return limits<Ret>::min();
+  }
 };
 
 // Logical Comparison
@@ -553,13 +563,13 @@ template <typename Function,
           typename Arg1 = Return,
           typename Arg2 = Arg1>
 struct BinaryFunction
-    : DefineConcept(convertible_to<Return>(val<Function>()(val<Arg1>(),
-                                                           val<Arg2>()))) {
+    : DefineConcept(convertible_to<Return>(
+          camp::val<Function>()(camp::val<Arg1>(), camp::val<Arg2>()))) {
 };
 
 template <typename Function, typename Return, typename Arg = Return>
 struct UnaryFunction
-    : DefineConcept(convertible_to<Return>(val<Function>()(val<Arg>()))) {
+    : DefineConcept(convertible_to<Return>(camp::val<Function>()(camp::val<Arg>()))) {
 };
 
 namespace detail
diff --git a/include/RAJA/util/Permutations.hpp b/include/RAJA/util/Permutations.hpp
index 86935de4f3..772c314d33 100644
--- a/include/RAJA/util/Permutations.hpp
+++ b/include/RAJA/util/Permutations.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_FORALLN_PERMUTATIONS_HPP
-#define RAJA_FORALLN_PERMUTATIONS_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -53,165 +50,182 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_FORALLN_PERMUTATIONS_HPP
+#define RAJA_FORALLN_PERMUTATIONS_HPP
+
+
+#include "RAJA/config.hpp"
+#include "camp/camp.hpp"
 
-#include "RAJA/internal/LegacyCompatibility.hpp"
+#include <array>
 
 namespace RAJA
 {
 
-using PERM_I = VarOps::index_sequence<0>;
-using PERM_IJ = VarOps::index_sequence<0, 1>;
-using PERM_JI = VarOps::index_sequence<1, 0>;
-using PERM_IJK = VarOps::index_sequence<0, 1, 2>;
-using PERM_IKJ = VarOps::index_sequence<0, 2, 1>;
-using PERM_JIK = VarOps::index_sequence<1, 0, 2>;
-using PERM_JKI = VarOps::index_sequence<1, 2, 0>;
-using PERM_KIJ = VarOps::index_sequence<2, 0, 1>;
-using PERM_KJI = VarOps::index_sequence<2, 1, 0>;
-using PERM_IJKL = VarOps::index_sequence<0, 1, 2, 3>;
-using PERM_IJLK = VarOps::index_sequence<0, 1, 3, 2>;
-using PERM_IKJL = VarOps::index_sequence<0, 2, 1, 3>;
-using PERM_IKLJ = VarOps::index_sequence<0, 2, 3, 1>;
-using PERM_ILJK = VarOps::index_sequence<0, 3, 1, 2>;
-using PERM_ILKJ = VarOps::index_sequence<0, 3, 2, 1>;
-using PERM_JIKL = VarOps::index_sequence<1, 0, 2, 3>;
-using PERM_JILK = VarOps::index_sequence<1, 0, 3, 2>;
-using PERM_JKIL = VarOps::index_sequence<1, 2, 0, 3>;
-using PERM_JKLI = VarOps::index_sequence<1, 2, 3, 0>;
-using PERM_JLIK = VarOps::index_sequence<1, 3, 0, 2>;
-using PERM_JLKI = VarOps::index_sequence<1, 3, 2, 0>;
-using PERM_KIJL = VarOps::index_sequence<2, 0, 1, 3>;
-using PERM_KILJ = VarOps::index_sequence<2, 0, 3, 1>;
-using PERM_KJIL = VarOps::index_sequence<2, 1, 0, 3>;
-using PERM_KJLI = VarOps::index_sequence<2, 1, 3, 0>;
-using PERM_KLIJ = VarOps::index_sequence<2, 3, 0, 1>;
-using PERM_KLJI = VarOps::index_sequence<2, 3, 1, 0>;
-using PERM_LIJK = VarOps::index_sequence<3, 0, 1, 2>;
-using PERM_LIKJ = VarOps::index_sequence<3, 0, 2, 1>;
-using PERM_LJIK = VarOps::index_sequence<3, 1, 0, 2>;
-using PERM_LJKI = VarOps::index_sequence<3, 1, 2, 0>;
-using PERM_LKIJ = VarOps::index_sequence<3, 2, 0, 1>;
-using PERM_LKJI = VarOps::index_sequence<3, 2, 1, 0>;
-using PERM_IJKLM = VarOps::index_sequence<0, 1, 2, 3, 4>;
-using PERM_IJKML = VarOps::index_sequence<0, 1, 2, 4, 3>;
-using PERM_IJLKM = VarOps::index_sequence<0, 1, 3, 2, 4>;
-using PERM_IJLMK = VarOps::index_sequence<0, 1, 3, 4, 2>;
-using PERM_IJMKL = VarOps::index_sequence<0, 1, 4, 2, 3>;
-using PERM_IJMLK = VarOps::index_sequence<0, 1, 4, 3, 2>;
-using PERM_IKJLM = VarOps::index_sequence<0, 2, 1, 3, 4>;
-using PERM_IKJML = VarOps::index_sequence<0, 2, 1, 4, 3>;
-using PERM_IKLJM = VarOps::index_sequence<0, 2, 3, 1, 4>;
-using PERM_IKLMJ = VarOps::index_sequence<0, 2, 3, 4, 1>;
-using PERM_IKMJL = VarOps::index_sequence<0, 2, 4, 1, 3>;
-using PERM_IKMLJ = VarOps::index_sequence<0, 2, 4, 3, 1>;
-using PERM_ILJKM = VarOps::index_sequence<0, 3, 1, 2, 4>;
-using PERM_ILJMK = VarOps::index_sequence<0, 3, 1, 4, 2>;
-using PERM_ILKJM = VarOps::index_sequence<0, 3, 2, 1, 4>;
-using PERM_ILKMJ = VarOps::index_sequence<0, 3, 2, 4, 1>;
-using PERM_ILMJK = VarOps::index_sequence<0, 3, 4, 1, 2>;
-using PERM_ILMKJ = VarOps::index_sequence<0, 3, 4, 2, 1>;
-using PERM_IMJKL = VarOps::index_sequence<0, 4, 1, 2, 3>;
-using PERM_IMJLK = VarOps::index_sequence<0, 4, 1, 3, 2>;
-using PERM_IMKJL = VarOps::index_sequence<0, 4, 2, 1, 3>;
-using PERM_IMKLJ = VarOps::index_sequence<0, 4, 2, 3, 1>;
-using PERM_IMLJK = VarOps::index_sequence<0, 4, 3, 1, 2>;
-using PERM_IMLKJ = VarOps::index_sequence<0, 4, 3, 2, 1>;
-using PERM_JIKLM = VarOps::index_sequence<1, 0, 2, 3, 4>;
-using PERM_JIKML = VarOps::index_sequence<1, 0, 2, 4, 3>;
-using PERM_JILKM = VarOps::index_sequence<1, 0, 3, 2, 4>;
-using PERM_JILMK = VarOps::index_sequence<1, 0, 3, 4, 2>;
-using PERM_JIMKL = VarOps::index_sequence<1, 0, 4, 2, 3>;
-using PERM_JIMLK = VarOps::index_sequence<1, 0, 4, 3, 2>;
-using PERM_JKILM = VarOps::index_sequence<1, 2, 0, 3, 4>;
-using PERM_JKIML = VarOps::index_sequence<1, 2, 0, 4, 3>;
-using PERM_JKLIM = VarOps::index_sequence<1, 2, 3, 0, 4>;
-using PERM_JKLMI = VarOps::index_sequence<1, 2, 3, 4, 0>;
-using PERM_JKMIL = VarOps::index_sequence<1, 2, 4, 0, 3>;
-using PERM_JKMLI = VarOps::index_sequence<1, 2, 4, 3, 0>;
-using PERM_JLIKM = VarOps::index_sequence<1, 3, 0, 2, 4>;
-using PERM_JLIMK = VarOps::index_sequence<1, 3, 0, 4, 2>;
-using PERM_JLKIM = VarOps::index_sequence<1, 3, 2, 0, 4>;
-using PERM_JLKMI = VarOps::index_sequence<1, 3, 2, 4, 0>;
-using PERM_JLMIK = VarOps::index_sequence<1, 3, 4, 0, 2>;
-using PERM_JLMKI = VarOps::index_sequence<1, 3, 4, 2, 0>;
-using PERM_JMIKL = VarOps::index_sequence<1, 4, 0, 2, 3>;
-using PERM_JMILK = VarOps::index_sequence<1, 4, 0, 3, 2>;
-using PERM_JMKIL = VarOps::index_sequence<1, 4, 2, 0, 3>;
-using PERM_JMKLI = VarOps::index_sequence<1, 4, 2, 3, 0>;
-using PERM_JMLIK = VarOps::index_sequence<1, 4, 3, 0, 2>;
-using PERM_JMLKI = VarOps::index_sequence<1, 4, 3, 2, 0>;
-using PERM_KIJLM = VarOps::index_sequence<2, 0, 1, 3, 4>;
-using PERM_KIJML = VarOps::index_sequence<2, 0, 1, 4, 3>;
-using PERM_KILJM = VarOps::index_sequence<2, 0, 3, 1, 4>;
-using PERM_KILMJ = VarOps::index_sequence<2, 0, 3, 4, 1>;
-using PERM_KIMJL = VarOps::index_sequence<2, 0, 4, 1, 3>;
-using PERM_KIMLJ = VarOps::index_sequence<2, 0, 4, 3, 1>;
-using PERM_KJILM = VarOps::index_sequence<2, 1, 0, 3, 4>;
-using PERM_KJIML = VarOps::index_sequence<2, 1, 0, 4, 3>;
-using PERM_KJLIM = VarOps::index_sequence<2, 1, 3, 0, 4>;
-using PERM_KJLMI = VarOps::index_sequence<2, 1, 3, 4, 0>;
-using PERM_KJMIL = VarOps::index_sequence<2, 1, 4, 0, 3>;
-using PERM_KJMLI = VarOps::index_sequence<2, 1, 4, 3, 0>;
-using PERM_KLIJM = VarOps::index_sequence<2, 3, 0, 1, 4>;
-using PERM_KLIMJ = VarOps::index_sequence<2, 3, 0, 4, 1>;
-using PERM_KLJIM = VarOps::index_sequence<2, 3, 1, 0, 4>;
-using PERM_KLJMI = VarOps::index_sequence<2, 3, 1, 4, 0>;
-using PERM_KLMIJ = VarOps::index_sequence<2, 3, 4, 0, 1>;
-using PERM_KLMJI = VarOps::index_sequence<2, 3, 4, 1, 0>;
-using PERM_KMIJL = VarOps::index_sequence<2, 4, 0, 1, 3>;
-using PERM_KMILJ = VarOps::index_sequence<2, 4, 0, 3, 1>;
-using PERM_KMJIL = VarOps::index_sequence<2, 4, 1, 0, 3>;
-using PERM_KMJLI = VarOps::index_sequence<2, 4, 1, 3, 0>;
-using PERM_KMLIJ = VarOps::index_sequence<2, 4, 3, 0, 1>;
-using PERM_KMLJI = VarOps::index_sequence<2, 4, 3, 1, 0>;
-using PERM_LIJKM = VarOps::index_sequence<3, 0, 1, 2, 4>;
-using PERM_LIJMK = VarOps::index_sequence<3, 0, 1, 4, 2>;
-using PERM_LIKJM = VarOps::index_sequence<3, 0, 2, 1, 4>;
-using PERM_LIKMJ = VarOps::index_sequence<3, 0, 2, 4, 1>;
-using PERM_LIMJK = VarOps::index_sequence<3, 0, 4, 1, 2>;
-using PERM_LIMKJ = VarOps::index_sequence<3, 0, 4, 2, 1>;
-using PERM_LJIKM = VarOps::index_sequence<3, 1, 0, 2, 4>;
-using PERM_LJIMK = VarOps::index_sequence<3, 1, 0, 4, 2>;
-using PERM_LJKIM = VarOps::index_sequence<3, 1, 2, 0, 4>;
-using PERM_LJKMI = VarOps::index_sequence<3, 1, 2, 4, 0>;
-using PERM_LJMIK = VarOps::index_sequence<3, 1, 4, 0, 2>;
-using PERM_LJMKI = VarOps::index_sequence<3, 1, 4, 2, 0>;
-using PERM_LKIJM = VarOps::index_sequence<3, 2, 0, 1, 4>;
-using PERM_LKIMJ = VarOps::index_sequence<3, 2, 0, 4, 1>;
-using PERM_LKJIM = VarOps::index_sequence<3, 2, 1, 0, 4>;
-using PERM_LKJMI = VarOps::index_sequence<3, 2, 1, 4, 0>;
-using PERM_LKMIJ = VarOps::index_sequence<3, 2, 4, 0, 1>;
-using PERM_LKMJI = VarOps::index_sequence<3, 2, 4, 1, 0>;
-using PERM_LMIJK = VarOps::index_sequence<3, 4, 0, 1, 2>;
-using PERM_LMIKJ = VarOps::index_sequence<3, 4, 0, 2, 1>;
-using PERM_LMJIK = VarOps::index_sequence<3, 4, 1, 0, 2>;
-using PERM_LMJKI = VarOps::index_sequence<3, 4, 1, 2, 0>;
-using PERM_LMKIJ = VarOps::index_sequence<3, 4, 2, 0, 1>;
-using PERM_LMKJI = VarOps::index_sequence<3, 4, 2, 1, 0>;
-using PERM_MIJKL = VarOps::index_sequence<4, 0, 1, 2, 3>;
-using PERM_MIJLK = VarOps::index_sequence<4, 0, 1, 3, 2>;
-using PERM_MIKJL = VarOps::index_sequence<4, 0, 2, 1, 3>;
-using PERM_MIKLJ = VarOps::index_sequence<4, 0, 2, 3, 1>;
-using PERM_MILJK = VarOps::index_sequence<4, 0, 3, 1, 2>;
-using PERM_MILKJ = VarOps::index_sequence<4, 0, 3, 2, 1>;
-using PERM_MJIKL = VarOps::index_sequence<4, 1, 0, 2, 3>;
-using PERM_MJILK = VarOps::index_sequence<4, 1, 0, 3, 2>;
-using PERM_MJKIL = VarOps::index_sequence<4, 1, 2, 0, 3>;
-using PERM_MJKLI = VarOps::index_sequence<4, 1, 2, 3, 0>;
-using PERM_MJLIK = VarOps::index_sequence<4, 1, 3, 0, 2>;
-using PERM_MJLKI = VarOps::index_sequence<4, 1, 3, 2, 0>;
-using PERM_MKIJL = VarOps::index_sequence<4, 2, 0, 1, 3>;
-using PERM_MKILJ = VarOps::index_sequence<4, 2, 0, 3, 1>;
-using PERM_MKJIL = VarOps::index_sequence<4, 2, 1, 0, 3>;
-using PERM_MKJLI = VarOps::index_sequence<4, 2, 1, 3, 0>;
-using PERM_MKLIJ = VarOps::index_sequence<4, 2, 3, 0, 1>;
-using PERM_MKLJI = VarOps::index_sequence<4, 2, 3, 1, 0>;
-using PERM_MLIJK = VarOps::index_sequence<4, 3, 0, 1, 2>;
-using PERM_MLIKJ = VarOps::index_sequence<4, 3, 0, 2, 1>;
-using PERM_MLJIK = VarOps::index_sequence<4, 3, 1, 0, 2>;
-using PERM_MLJKI = VarOps::index_sequence<4, 3, 1, 2, 0>;
-using PERM_MLKIJ = VarOps::index_sequence<4, 3, 2, 0, 1>;
-using PERM_MLKJI = VarOps::index_sequence<4, 3, 2, 1, 0>;
+template <typename Indices>
+struct as_array;
+
+template <camp::idx_t... Indices>
+struct as_array<camp::idx_seq<Indices...>> {
+  static constexpr std::array<Index_type, sizeof...(Indices)> get() {
+    return {Indices...};
+  }
+};
+
+using PERM_I = camp::idx_seq<0>;
+using PERM_IJ = camp::idx_seq<0, 1>;
+using PERM_JI = camp::idx_seq<1, 0>;
+using PERM_IJK = camp::idx_seq<0, 1, 2>;
+using PERM_IKJ = camp::idx_seq<0, 2, 1>;
+using PERM_JIK = camp::idx_seq<1, 0, 2>;
+using PERM_JKI = camp::idx_seq<1, 2, 0>;
+using PERM_KIJ = camp::idx_seq<2, 0, 1>;
+using PERM_KJI = camp::idx_seq<2, 1, 0>;
+using PERM_IJKL = camp::idx_seq<0, 1, 2, 3>;
+using PERM_IJLK = camp::idx_seq<0, 1, 3, 2>;
+using PERM_IKJL = camp::idx_seq<0, 2, 1, 3>;
+using PERM_IKLJ = camp::idx_seq<0, 2, 3, 1>;
+using PERM_ILJK = camp::idx_seq<0, 3, 1, 2>;
+using PERM_ILKJ = camp::idx_seq<0, 3, 2, 1>;
+using PERM_JIKL = camp::idx_seq<1, 0, 2, 3>;
+using PERM_JILK = camp::idx_seq<1, 0, 3, 2>;
+using PERM_JKIL = camp::idx_seq<1, 2, 0, 3>;
+using PERM_JKLI = camp::idx_seq<1, 2, 3, 0>;
+using PERM_JLIK = camp::idx_seq<1, 3, 0, 2>;
+using PERM_JLKI = camp::idx_seq<1, 3, 2, 0>;
+using PERM_KIJL = camp::idx_seq<2, 0, 1, 3>;
+using PERM_KILJ = camp::idx_seq<2, 0, 3, 1>;
+using PERM_KJIL = camp::idx_seq<2, 1, 0, 3>;
+using PERM_KJLI = camp::idx_seq<2, 1, 3, 0>;
+using PERM_KLIJ = camp::idx_seq<2, 3, 0, 1>;
+using PERM_KLJI = camp::idx_seq<2, 3, 1, 0>;
+using PERM_LIJK = camp::idx_seq<3, 0, 1, 2>;
+using PERM_LIKJ = camp::idx_seq<3, 0, 2, 1>;
+using PERM_LJIK = camp::idx_seq<3, 1, 0, 2>;
+using PERM_LJKI = camp::idx_seq<3, 1, 2, 0>;
+using PERM_LKIJ = camp::idx_seq<3, 2, 0, 1>;
+using PERM_LKJI = camp::idx_seq<3, 2, 1, 0>;
+using PERM_IJKLM = camp::idx_seq<0, 1, 2, 3, 4>;
+using PERM_IJKML = camp::idx_seq<0, 1, 2, 4, 3>;
+using PERM_IJLKM = camp::idx_seq<0, 1, 3, 2, 4>;
+using PERM_IJLMK = camp::idx_seq<0, 1, 3, 4, 2>;
+using PERM_IJMKL = camp::idx_seq<0, 1, 4, 2, 3>;
+using PERM_IJMLK = camp::idx_seq<0, 1, 4, 3, 2>;
+using PERM_IKJLM = camp::idx_seq<0, 2, 1, 3, 4>;
+using PERM_IKJML = camp::idx_seq<0, 2, 1, 4, 3>;
+using PERM_IKLJM = camp::idx_seq<0, 2, 3, 1, 4>;
+using PERM_IKLMJ = camp::idx_seq<0, 2, 3, 4, 1>;
+using PERM_IKMJL = camp::idx_seq<0, 2, 4, 1, 3>;
+using PERM_IKMLJ = camp::idx_seq<0, 2, 4, 3, 1>;
+using PERM_ILJKM = camp::idx_seq<0, 3, 1, 2, 4>;
+using PERM_ILJMK = camp::idx_seq<0, 3, 1, 4, 2>;
+using PERM_ILKJM = camp::idx_seq<0, 3, 2, 1, 4>;
+using PERM_ILKMJ = camp::idx_seq<0, 3, 2, 4, 1>;
+using PERM_ILMJK = camp::idx_seq<0, 3, 4, 1, 2>;
+using PERM_ILMKJ = camp::idx_seq<0, 3, 4, 2, 1>;
+using PERM_IMJKL = camp::idx_seq<0, 4, 1, 2, 3>;
+using PERM_IMJLK = camp::idx_seq<0, 4, 1, 3, 2>;
+using PERM_IMKJL = camp::idx_seq<0, 4, 2, 1, 3>;
+using PERM_IMKLJ = camp::idx_seq<0, 4, 2, 3, 1>;
+using PERM_IMLJK = camp::idx_seq<0, 4, 3, 1, 2>;
+using PERM_IMLKJ = camp::idx_seq<0, 4, 3, 2, 1>;
+using PERM_JIKLM = camp::idx_seq<1, 0, 2, 3, 4>;
+using PERM_JIKML = camp::idx_seq<1, 0, 2, 4, 3>;
+using PERM_JILKM = camp::idx_seq<1, 0, 3, 2, 4>;
+using PERM_JILMK = camp::idx_seq<1, 0, 3, 4, 2>;
+using PERM_JIMKL = camp::idx_seq<1, 0, 4, 2, 3>;
+using PERM_JIMLK = camp::idx_seq<1, 0, 4, 3, 2>;
+using PERM_JKILM = camp::idx_seq<1, 2, 0, 3, 4>;
+using PERM_JKIML = camp::idx_seq<1, 2, 0, 4, 3>;
+using PERM_JKLIM = camp::idx_seq<1, 2, 3, 0, 4>;
+using PERM_JKLMI = camp::idx_seq<1, 2, 3, 4, 0>;
+using PERM_JKMIL = camp::idx_seq<1, 2, 4, 0, 3>;
+using PERM_JKMLI = camp::idx_seq<1, 2, 4, 3, 0>;
+using PERM_JLIKM = camp::idx_seq<1, 3, 0, 2, 4>;
+using PERM_JLIMK = camp::idx_seq<1, 3, 0, 4, 2>;
+using PERM_JLKIM = camp::idx_seq<1, 3, 2, 0, 4>;
+using PERM_JLKMI = camp::idx_seq<1, 3, 2, 4, 0>;
+using PERM_JLMIK = camp::idx_seq<1, 3, 4, 0, 2>;
+using PERM_JLMKI = camp::idx_seq<1, 3, 4, 2, 0>;
+using PERM_JMIKL = camp::idx_seq<1, 4, 0, 2, 3>;
+using PERM_JMILK = camp::idx_seq<1, 4, 0, 3, 2>;
+using PERM_JMKIL = camp::idx_seq<1, 4, 2, 0, 3>;
+using PERM_JMKLI = camp::idx_seq<1, 4, 2, 3, 0>;
+using PERM_JMLIK = camp::idx_seq<1, 4, 3, 0, 2>;
+using PERM_JMLKI = camp::idx_seq<1, 4, 3, 2, 0>;
+using PERM_KIJLM = camp::idx_seq<2, 0, 1, 3, 4>;
+using PERM_KIJML = camp::idx_seq<2, 0, 1, 4, 3>;
+using PERM_KILJM = camp::idx_seq<2, 0, 3, 1, 4>;
+using PERM_KILMJ = camp::idx_seq<2, 0, 3, 4, 1>;
+using PERM_KIMJL = camp::idx_seq<2, 0, 4, 1, 3>;
+using PERM_KIMLJ = camp::idx_seq<2, 0, 4, 3, 1>;
+using PERM_KJILM = camp::idx_seq<2, 1, 0, 3, 4>;
+using PERM_KJIML = camp::idx_seq<2, 1, 0, 4, 3>;
+using PERM_KJLIM = camp::idx_seq<2, 1, 3, 0, 4>;
+using PERM_KJLMI = camp::idx_seq<2, 1, 3, 4, 0>;
+using PERM_KJMIL = camp::idx_seq<2, 1, 4, 0, 3>;
+using PERM_KJMLI = camp::idx_seq<2, 1, 4, 3, 0>;
+using PERM_KLIJM = camp::idx_seq<2, 3, 0, 1, 4>;
+using PERM_KLIMJ = camp::idx_seq<2, 3, 0, 4, 1>;
+using PERM_KLJIM = camp::idx_seq<2, 3, 1, 0, 4>;
+using PERM_KLJMI = camp::idx_seq<2, 3, 1, 4, 0>;
+using PERM_KLMIJ = camp::idx_seq<2, 3, 4, 0, 1>;
+using PERM_KLMJI = camp::idx_seq<2, 3, 4, 1, 0>;
+using PERM_KMIJL = camp::idx_seq<2, 4, 0, 1, 3>;
+using PERM_KMILJ = camp::idx_seq<2, 4, 0, 3, 1>;
+using PERM_KMJIL = camp::idx_seq<2, 4, 1, 0, 3>;
+using PERM_KMJLI = camp::idx_seq<2, 4, 1, 3, 0>;
+using PERM_KMLIJ = camp::idx_seq<2, 4, 3, 0, 1>;
+using PERM_KMLJI = camp::idx_seq<2, 4, 3, 1, 0>;
+using PERM_LIJKM = camp::idx_seq<3, 0, 1, 2, 4>;
+using PERM_LIJMK = camp::idx_seq<3, 0, 1, 4, 2>;
+using PERM_LIKJM = camp::idx_seq<3, 0, 2, 1, 4>;
+using PERM_LIKMJ = camp::idx_seq<3, 0, 2, 4, 1>;
+using PERM_LIMJK = camp::idx_seq<3, 0, 4, 1, 2>;
+using PERM_LIMKJ = camp::idx_seq<3, 0, 4, 2, 1>;
+using PERM_LJIKM = camp::idx_seq<3, 1, 0, 2, 4>;
+using PERM_LJIMK = camp::idx_seq<3, 1, 0, 4, 2>;
+using PERM_LJKIM = camp::idx_seq<3, 1, 2, 0, 4>;
+using PERM_LJKMI = camp::idx_seq<3, 1, 2, 4, 0>;
+using PERM_LJMIK = camp::idx_seq<3, 1, 4, 0, 2>;
+using PERM_LJMKI = camp::idx_seq<3, 1, 4, 2, 0>;
+using PERM_LKIJM = camp::idx_seq<3, 2, 0, 1, 4>;
+using PERM_LKIMJ = camp::idx_seq<3, 2, 0, 4, 1>;
+using PERM_LKJIM = camp::idx_seq<3, 2, 1, 0, 4>;
+using PERM_LKJMI = camp::idx_seq<3, 2, 1, 4, 0>;
+using PERM_LKMIJ = camp::idx_seq<3, 2, 4, 0, 1>;
+using PERM_LKMJI = camp::idx_seq<3, 2, 4, 1, 0>;
+using PERM_LMIJK = camp::idx_seq<3, 4, 0, 1, 2>;
+using PERM_LMIKJ = camp::idx_seq<3, 4, 0, 2, 1>;
+using PERM_LMJIK = camp::idx_seq<3, 4, 1, 0, 2>;
+using PERM_LMJKI = camp::idx_seq<3, 4, 1, 2, 0>;
+using PERM_LMKIJ = camp::idx_seq<3, 4, 2, 0, 1>;
+using PERM_LMKJI = camp::idx_seq<3, 4, 2, 1, 0>;
+using PERM_MIJKL = camp::idx_seq<4, 0, 1, 2, 3>;
+using PERM_MIJLK = camp::idx_seq<4, 0, 1, 3, 2>;
+using PERM_MIKJL = camp::idx_seq<4, 0, 2, 1, 3>;
+using PERM_MIKLJ = camp::idx_seq<4, 0, 2, 3, 1>;
+using PERM_MILJK = camp::idx_seq<4, 0, 3, 1, 2>;
+using PERM_MILKJ = camp::idx_seq<4, 0, 3, 2, 1>;
+using PERM_MJIKL = camp::idx_seq<4, 1, 0, 2, 3>;
+using PERM_MJILK = camp::idx_seq<4, 1, 0, 3, 2>;
+using PERM_MJKIL = camp::idx_seq<4, 1, 2, 0, 3>;
+using PERM_MJKLI = camp::idx_seq<4, 1, 2, 3, 0>;
+using PERM_MJLIK = camp::idx_seq<4, 1, 3, 0, 2>;
+using PERM_MJLKI = camp::idx_seq<4, 1, 3, 2, 0>;
+using PERM_MKIJL = camp::idx_seq<4, 2, 0, 1, 3>;
+using PERM_MKILJ = camp::idx_seq<4, 2, 0, 3, 1>;
+using PERM_MKJIL = camp::idx_seq<4, 2, 1, 0, 3>;
+using PERM_MKJLI = camp::idx_seq<4, 2, 1, 3, 0>;
+using PERM_MKLIJ = camp::idx_seq<4, 2, 3, 0, 1>;
+using PERM_MKLJI = camp::idx_seq<4, 2, 3, 1, 0>;
+using PERM_MLIJK = camp::idx_seq<4, 3, 0, 1, 2>;
+using PERM_MLIKJ = camp::idx_seq<4, 3, 0, 2, 1>;
+using PERM_MLJIK = camp::idx_seq<4, 3, 1, 0, 2>;
+using PERM_MLJKI = camp::idx_seq<4, 3, 1, 2, 0>;
+using PERM_MLKIJ = camp::idx_seq<4, 3, 2, 0, 1>;
+using PERM_MLKJI = camp::idx_seq<4, 3, 2, 1, 0>;
+
 }
 
 #endif /* RAJA_FORALLN_PERMUTATIONS_HPP */
diff --git a/include/RAJA/util/PermutedLayout.hpp b/include/RAJA/util/PermutedLayout.hpp
index d9acf72c0d..67467f098b 100644
--- a/include/RAJA/util/PermutedLayout.hpp
+++ b/include/RAJA/util/PermutedLayout.hpp
@@ -9,11 +9,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_PERMUTEDLAYOUT_HPP
-#define RAJA_PERMUTEDLAYOUT_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -54,6 +51,9 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_PERMUTEDLAYOUT_HPP
+#define RAJA_PERMUTEDLAYOUT_HPP
+
 #include <iostream>
 
 #include "RAJA/config.hpp"
@@ -96,7 +96,7 @@ namespace RAJA
  */
 template <size_t Rank, typename IdxLin = Index_type>
 auto make_permuted_layout(std::array<IdxLin, Rank> sizes,
-                          std::array<size_t, Rank> permutation)
+                          std::array<camp::idx_t, Rank> permutation)
     -> Layout<Rank, IdxLin>
 {
   std::array<IdxLin, Rank> strides;
@@ -118,10 +118,10 @@ auto make_permuted_layout(std::array<IdxLin, Rank> sizes,
 }
 
 
-template <size_t... Ints>
-using Perm = VarOps::index_sequence<Ints...>;
-template <size_t N>
-using MakePerm = VarOps::make_index_sequence<N>;
+template <camp::idx_t... Ints>
+using Perm = camp::idx_seq<Ints...>;
+template <camp::idx_t N>
+using MakePerm = typename camp::make_idx_seq<N>::type;
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/sequential/fwd.hpp b/include/RAJA/util/SoAArray.hpp
similarity index 61%
rename from include/RAJA/policy/sequential/fwd.hpp
rename to include/RAJA/util/SoAArray.hpp
index 3a02a8d176..71481ff15d 100644
--- a/include/RAJA/policy/sequential/fwd.hpp
+++ b/include/RAJA/util/SoAArray.hpp
@@ -1,3 +1,16 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file defining the SoAArray struct-of-arrays container.
+ *
+ ******************************************************************************
+ */
+
+#ifndef RAJA_SOA_ARRAY_HPP
+#define RAJA_SOA_ARRAY_HPP
+
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 // Copyright (c) 2016, Lawrence Livermore National Security, LLC.
 //
@@ -40,57 +53,68 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file containing RAJA segment template methods for
- *          execution via CUDA kernel launch.
- *
- *          These methods should work on any platform that supports
- *          CUDA devices.
- *
- ******************************************************************************
- */
-
-#ifndef RAJA_forward_sequential_HXX
-#define RAJA_forward_sequential_HXX
-
-#include <type_traits>
-
 #include "RAJA/config.hpp"
 
-#include "RAJA/policy/sequential/policy.hpp"
+// for RAJA::reduce::detail::ValueLoc
+#include "RAJA/pattern/detail/reduce.hpp"
 
 namespace RAJA
 {
 
-namespace impl
+namespace detail
 {
 
-template <typename Func>
-RAJA_INLINE void forall(const seq_exec &,
-                        const PolicyBase &,
-                        const RangeSegment &iter,
-                        Func &&loop_body);
+/*!
+ * @brief Array class specialized for Struct of Array data layout.
+ *
+ * This is useful for creating a vectorizable data layout and getting
+ * coalesced memory accesses or avoiding shared memory bank conflicts in CUDA.
+ */
+template < typename T, size_t size >
+class SoAArray {
+  using value_type = T;
+public:
+
+  RAJA_HOST_DEVICE value_type get(size_t i) const
+  {
+    return mem[i];
+  }
+  RAJA_HOST_DEVICE void set(size_t i, value_type val)
+  {
+    mem[i] = val;
+  }
+
+private:
+  value_type mem[size];
+};
+
+/*!
+ * @brief Specialization for RAJA::reduce::detail::ValueLoc.
+ */
+template < typename T, bool doing_min, size_t size >
+class SoAArray< ::RAJA::reduce::detail::ValueLoc<T, doing_min>, size > {
+  using value_type = ::RAJA::reduce::detail::ValueLoc<T, doing_min>;
+  using first_type  = T;
+  using second_type = Index_type;
+public:
 
-template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const seq_exec &,
-                        const PolicyBase &,
-                        Iterable &&iter,
-                        Func &&loop_body);
+  RAJA_HOST_DEVICE value_type get(size_t i) const
+  {
+    return value_type(mem[i], mem_idx[i]);
+  }
+  RAJA_HOST_DEVICE void set(size_t i, value_type val)
+  {
+    mem[i] = val;
+    mem_idx[i] = val.getLoc();
+  }
 
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const seq_exec &,
-              const PolicyBase &,
-              Iterable &&iter,
-              IndexType icount,
-              Func &&loop_body);
+private:
+  first_type mem[size];
+  second_type mem_idx[size];
+};
 
-}  // closing brace for impl namespace
+}  // closing brace for detail namespace
 
 }  // closing brace for RAJA namespace
 
-#endif  // closing endif for header file include guard
+#endif /* RAJA_SOA_ARRAY_HPP */
diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp
new file mode 100644
index 0000000000..aa5671f32b
--- /dev/null
+++ b/include/RAJA/util/SoAPtr.hpp
@@ -0,0 +1,162 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file defining the SoAPtr struct-of-arrays pointer class.
+ *
+ ******************************************************************************
+ */
+
+#ifndef RAJA_SOA_PTR_HPP
+#define RAJA_SOA_PTR_HPP
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/LICENSE.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "RAJA/config.hpp"
+
+// for RAJA::reduce::detail::ValueLoc
+#include "RAJA/pattern/detail/reduce.hpp"
+
+namespace RAJA
+{
+
+namespace detail
+{
+
+/*!
+ * @brief Pointer class specialized for Struct of Array data layout allocated
+ *        via RAJA basic_mempools.
+ *
+ * This is useful for creating a vectorizable data layout and getting
+ * coalesced memory accesses or avoiding shared memory bank conflicts in CUDA.
+ */
+template < typename T, typename mempool = RAJA::basic_mempool::MemPool<RAJA::basic_mempool::generic_allocator> >
+class SoAPtr {
+  using value_type = T;
+public:
+  SoAPtr() = default;
+  explicit SoAPtr(size_t size)
+    : mem(mempool::getInstance().template malloc<value_type>(size))
+  {
+  }
+
+  SoAPtr& allocate(size_t size)
+  {
+    mem = mempool::getInstance().template malloc<value_type>(size);
+    return *this;
+  }
+
+  SoAPtr& deallocate()
+  {
+    mempool::getInstance().free(mem);  mem = nullptr;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; }
+
+  RAJA_HOST_DEVICE value_type get(size_t i) const
+  {
+    return mem[i];
+  }
+  RAJA_HOST_DEVICE void set(size_t i, value_type val)
+  {
+    mem[i] = val;
+  }
+
+private:
+  value_type* mem = nullptr;
+};
+
+/*!
+ * @brief Specialization for RAJA::reduce::detail::ValueLoc.
+ */
+template < typename T, bool doing_min, typename mempool >
+class SoAPtr< RAJA::reduce::detail::ValueLoc<T, doing_min>, mempool > {
+  using value_type = RAJA::reduce::detail::ValueLoc<T, doing_min>;
+  using first_type  = T;
+  using second_type = Index_type;
+public:
+  SoAPtr() = default;
+  explicit SoAPtr(size_t size)
+    : mem(mempool::getInstance().template malloc<first_type>(size)),
+      mem_idx(mempool::getInstance().template malloc<second_type>(size))
+  {
+  }
+
+  SoAPtr& allocate(size_t size)
+  {
+    mem = mempool::getInstance().template malloc<first_type>(size);
+    mem_idx = mempool::getInstance().template malloc<second_type>(size);
+    return *this;
+  }
+
+  SoAPtr& deallocate()
+  {
+    mempool::getInstance().free(mem);  mem = nullptr;
+    mempool::getInstance().free(mem_idx);  mem_idx = nullptr;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; }
+
+  RAJA_HOST_DEVICE value_type get(size_t i) const
+  {
+    return value_type(mem[i], mem_idx[i]);
+  }
+  RAJA_HOST_DEVICE void set(size_t i, value_type val)
+  {
+    mem[i] = val;
+    mem_idx[i] = val.getLoc();
+  }
+
+private:
+  first_type* mem = nullptr;
+  second_type* mem_idx = nullptr;
+};
+
+}  // closing brace for detail namespace
+
+}  // closing brace for RAJA namespace
+
+#endif /* RAJA_SOA_PTR_HPP */
diff --git a/include/RAJA/util/Timer.hpp b/include/RAJA/util/Timer.hpp
index 371fcecda8..3d0c13d669 100644
--- a/include/RAJA/util/Timer.hpp
+++ b/include/RAJA/util/Timer.hpp
@@ -10,11 +10,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_Timer_HPP
-#define RAJA_Timer_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -55,6 +52,9 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_Timer_HPP
+#define RAJA_Timer_HPP
+
 
 #include "RAJA/config.hpp"
 
diff --git a/include/RAJA/util/TypeConvert.hpp b/include/RAJA/util/TypeConvert.hpp
index 3d30c0f6ba..fed2fc7459 100644
--- a/include/RAJA/util/TypeConvert.hpp
+++ b/include/RAJA/util/TypeConvert.hpp
@@ -12,11 +12,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_util_TypeConvert_HPP
-#define RAJA_util_TypeConvert_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -57,6 +54,9 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_util_TypeConvert_HPP
+#define RAJA_util_TypeConvert_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/defines.hpp"
 
diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp
index 58e5b3f3e9..2969ed07e9 100644
--- a/include/RAJA/util/View.hpp
+++ b/include/RAJA/util/View.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_VIEW_HPP
-#define RAJA_VIEW_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -53,6 +50,9 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_VIEW_HPP
+#define RAJA_VIEW_HPP
+
 #include "RAJA/config.hpp"
 #include "RAJA/util/Layout.hpp"
 #include "RAJA/pattern/atomic.hpp"
diff --git a/include/RAJA/policy/simd/fwd.hpp b/include/RAJA/util/align.hpp
similarity index 72%
rename from include/RAJA/policy/simd/fwd.hpp
rename to include/RAJA/util/align.hpp
index 4418e23601..85f953c158 100644
--- a/include/RAJA/policy/simd/fwd.hpp
+++ b/include/RAJA/util/align.hpp
@@ -1,3 +1,18 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file containing an implementation of std::align.
+ *
+ ******************************************************************************
+ */
+
+#ifndef RAJA_ALIGN_HPP
+#define RAJA_ALIGN_HPP
+ 
+#include "RAJA/config.hpp"
+
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 // Copyright (c) 2016, Lawrence Livermore National Security, LLC.
 //
@@ -40,45 +55,32 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file containing RAJA segment template methods for
- *          execution via CUDA kernel launch.
- *
- *          These methods should work on any platform that supports
- *          CUDA devices.
- *
- ******************************************************************************
- */
-
-#ifndef RAJA_forward_simd_HXX
-#define RAJA_forward_simd_HXX
-
-#include <type_traits>
-
-#include "RAJA/config.hpp"
-
-#include "RAJA/policy/simd/policy.hpp"
-
 namespace RAJA
 {
 
-namespace impl
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Taken from libc++ 
+// See libc++ license in docs/Licenses/libc++ License
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+RAJA_INLINE
+void* align(size_t alignment, size_t size, void*& ptr, size_t& space)
+{
+    // Same contract as std::align: on success ptr is moved to the aligned
+    // address and space is reduced by the padding; otherwise both are untouched.
+    if (space < size) { return nullptr; }
+
+    char* const start = static_cast<char*>(ptr);
+    // Bump by (alignment - 1) then mask off the low bits; this rounds up to a
+    // multiple of alignment provided alignment is a power of two.
+    char* const aligned = reinterpret_cast<char*>(
+        reinterpret_cast<size_t>(start + (alignment - 1)) & -alignment);
+    const size_t padding = static_cast<size_t>(aligned - start);
+    if (padding > space - size) { return nullptr; }
+    ptr = aligned;
+    space -= padding;
+    return aligned;
+}
 
-template <typename Iterable, typename Func>
-RAJA_INLINE void
-forall(const simd_exec &, Iterable &&, Func &&);
-
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE void
-forall_Icount(const simd_exec &, Iterable &&, IndexType, Func &&);
-
-}  // closing brace for impl namespace
-
-}  // closing brace for RAJA namespace
+} // end namespace RAJA
 
-#endif  // closing endif for header file include guard
+#endif
diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp
new file mode 100644
index 0000000000..ebf9d77882
--- /dev/null
+++ b/include/RAJA/util/basic_mempool.hpp
@@ -0,0 +1,459 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file containing an implementation of a memory pool.
+ *
+ ******************************************************************************
+ */
+
+#ifndef RAJA_BASIC_MEMPOOL_HPP
+#define RAJA_BASIC_MEMPOOL_HPP
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/LICENSE.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "RAJA/util/align.hpp"
+
+#include "RAJA/util/mutex.hpp"
+
+#include <cassert>
+#include <list>
+#include <map>
+#include <cstddef>
+#include <cstdlib>
+#include <cstdio>
+
+namespace RAJA {
+
+namespace basic_mempool {
+
+namespace detail {
+
+
+/*! \class MemoryArena
+ ******************************************************************************
+ *
+ * \brief  MemoryArena is a map-based helper class for class MemPool that
+ * provides book-keeping to divvy up a large chunk of pre-allocated memory to
+ * avoid the overhead of malloc/free or cudaMalloc/cudaFree, etc.
+ *
+ * get/give are the primary calls used by class MemPool to get aligned memory 
+ * from the pool or give it back
+ * 
+ *
+ ******************************************************************************
+ */
+class MemoryArena {
+public:
+
+  // Both maps are keyed by a chunk's begin address and store its end address.
+  using free_type = std::map<void*, void*>;
+  using free_value_type = typename free_type::value_type;
+  using used_type = std::map<void*, void*>;
+  using used_value_type = typename used_type::value_type;
+
+  // Starts tracking [ptr, ptr+size) as a single free chunk. The arena never
+  // allocates or releases that memory itself. Aborts when ptr is null.
+  MemoryArena(void* ptr, size_t size)
+    : m_allocation{ ptr, static_cast<char*>(ptr)+size },
+      m_free_space({ free_value_type{ptr, static_cast<char*>(ptr)+size} }),
+      m_used_space()
+  {
+    if (m_allocation.begin == nullptr) {
+      fprintf(stderr, "Attempt to create MemoryArena with no memory");
+      std::abort();
+    }
+  }
+
+  MemoryArena(MemoryArena const&) = delete;
+  MemoryArena& operator=(MemoryArena const&) = delete;
+
+  MemoryArena(MemoryArena &&) = default;
+  MemoryArena& operator=(MemoryArena &&) = default;
+
+  // Total number of bytes tracked by this arena, used and free together.
+  size_t capacity() const
+  {
+    return static_cast<char*>(m_allocation.end) - static_cast<char*>(m_allocation.begin);
+  }
+
+  // True when no chunk is currently handed out.
+  bool unused() const { return m_used_space.empty(); }
+
+  // Start of the tracked range, i.e. the ptr given to the constructor.
+  void* get_allocation() const { return m_allocation.begin; }
+
+  // Carves nbytes, aligned to alignment, out of the first free chunk that
+  // can hold them. Returns nullptr when no free chunk fits.
+  void* get(size_t nbytes, size_t alignment)
+  {
+    // A zero-byte chunk has begin == end, which the maps cannot represent:
+    // remove_free_chunk could discard the whole free region and repeated
+    // requests would collide on one key. Reserve one byte instead so every
+    // returned pointer is distinct and can be given back.
+    if (nbytes == 0) {
+      nbytes = 1;
+    }
+
+    if (capacity() < nbytes) {
+      return nullptr;
+    }
+
+    for (free_type::iterator iter = m_free_space.begin();
+         iter != m_free_space.end(); ++iter) {
+      void* adj_ptr = iter->first;
+      size_t cap = static_cast<char*>(iter->second) - static_cast<char*>(adj_ptr);
+
+      // on success align() has moved adj_ptr up to the aligned address
+      if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) {
+        void* adj_end = static_cast<char*>(adj_ptr) + nbytes;
+        remove_free_chunk(iter, adj_ptr, adj_end);
+        add_used_chunk(adj_ptr, adj_end);
+        return adj_ptr;
+      }
+    }
+    return nullptr;
+  }
+
+  // Returns the chunk that starts at ptr to the free space. Yields false
+  // when ptr lies outside this arena and true once the chunk is released.
+  // Aborts if ptr is inside the arena but does not start a chunk in use.
+  bool give(void* ptr)
+  {
+    if ( ptr < m_allocation.begin || m_allocation.end <= ptr ) {
+      return false;
+    }
+
+    used_type::iterator found = m_used_space.find(ptr);
+    if ( found == m_used_space.end() ) {
+      fprintf(stderr, "Invalid free %p", ptr);
+      std::abort();
+    }
+
+    add_free_chunk(found->first, found->second);
+    m_used_space.erase(found);
+    return true;
+  }
+
+private:
+  // Half-open address range [begin, end).
+  struct memory_chunk {
+    void* begin;
+    void* end;
+  };
+
+  // Puts [begin, end) back into m_free_space, merging it with a free chunk
+  // that ends at begin and/or one that starts at end.
+  void add_free_chunk(void* begin, void* end)
+  {
+    free_type::iterator invl = m_free_space.end();
+    // first free chunk whose start is not before begin
+    free_type::iterator next = m_free_space.lower_bound(begin);
+
+    // check if prev exists
+    if (next != m_free_space.begin()) {
+      // check if prev can cover [begin, end)
+      free_type::iterator prev = next; --prev;
+      if (prev->second == begin) {
+        // extend prev to cover [begin, end)
+        prev->second = end;
+
+        // check if prev can cover next too
+        if (next != invl) {
+          assert(next->first != begin);
+          if (next->first == end) {
+            // extend prev to cover next too
+            prev->second = next->second;
+
+            // remove redundant next
+            m_free_space.erase(next);
+          }
+        }
+        return;
+      }
+    }
+
+    if (next != invl) {
+      assert(next->first != begin);
+      if (next->first == end) {
+        // extend next to cover [begin, end); a map key cannot be changed in
+        // place, so insert the widened chunk and erase the old entry
+        m_free_space.insert(next, free_value_type{begin, next->second});
+        m_free_space.erase(next);
+        return;
+      }
+    }
+
+    // no free space adjacent to this chunk, add separate free chunk [begin, end)
+    m_free_space.insert(next, free_value_type{begin, end});
+  }
+
+  // Takes [begin, end) out of the free chunk at iter (expected to contain
+  // it), keeping any remainder before and/or after it as free chunks.
+  void remove_free_chunk(free_type::iterator iter, void* begin, void* end) {
+    void* ptr = iter->first;
+    void* ptr_end = iter->second;
+
+    // fixup m_free_space, shrinking and adding chunks as needed
+    if (ptr != begin) {
+      // shrink end of current free region to [ptr, begin)
+      iter->second = begin;
+
+      if (end != ptr_end) {
+        // insert free region [end, ptr_end) after current free region
+        free_type::iterator next = iter; ++next;
+        m_free_space.insert(next, free_value_type{end, ptr_end});
+      }
+
+    } else if (end != ptr_end) {
+      // shrink beginning of current free region to [end, ptr_end)
+      free_type::iterator next = iter; ++next;
+      m_free_space.insert(next, free_value_type{end, ptr_end});
+      m_free_space.erase(iter);
+
+    } else {
+      // can not reuse current region, erase
+      m_free_space.erase(iter);
+    }
+  }
+
+  // Records [begin, end) as handed out.
+  void add_used_chunk(void* begin, void* end)
+  {
+    m_used_space.insert(used_value_type{begin, end});
+  }
+
+  memory_chunk m_allocation;  // whole range tracked by this arena
+  free_type m_free_space;     // chunks available to get()
+  used_type m_used_space;     // chunks handed out and not yet given back
+};
+
+} /* end namespace detail */
+
+
+
+/*! \class MemPool
+ ******************************************************************************
+ *
+ * \brief  MemPool pre-allocates a large chunk of memory and provides generic 
+ * malloc/free for the user to allocate aligned data within the pool
+ *
+ * MemPool uses MemoryArena to do the heavy lifting of maintaining access to 
+ * the used/free space.         
+ * 
+ * MemPool provides an example generic_allocator which can guide more specialized
+ * allocators. The following are some examples 
+ * 
+ * using device_mempool_type = basic_mempool::MemPool<cuda::DeviceAllocator>;
+ * using device_zeroed_mempool_type = basic_mempool::MemPool<cuda::DeviceZeroedAllocator>;
+ * using pinned_mempool_type = basic_mempool::MemPool<cuda::PinnedAllocator>;
+ *
+ * The user provides the specialized allocator, for example :
+ * struct DeviceAllocator {
+ *
+ *  // returns a valid pointer on success, nullptr on failure
+ *  void* malloc(size_t nbytes)
+ *  {
+ *    void* ptr;
+ *    cudaErrchk(cudaMalloc(&ptr, nbytes));
+ *    return ptr;
+ *  }
+ *
+ *  // returns true on success, false on failure
+ *  bool free(void* ptr)
+ *  {
+ *    cudaErrchk(cudaFree(ptr));
+ *    return true;
+ *  }
+ * };
+ *
+ * 
+ ******************************************************************************
+ */
+template < typename allocator_t >
+class MemPool {
+public:
+  using allocator_type = allocator_t;
+
+  // Process-wide pool for this allocator type, constructed on first use.
+  static inline MemPool<allocator_t>& getInstance()
+  {
+    static MemPool<allocator_t> pool{};
+    return pool;
+  }
+
+  // Size in bytes given to new arenas until arena_size(size_t) overrides it.
+  static const size_t default_default_arena_size = 32ull*1024ull*1024ull;
+
+  MemPool()
+    : m_arenas(),
+      m_default_arena_size(default_default_arena_size),
+      m_alloc()
+  {}
+
+  ~MemPool()
+  {
+    // With static objects like MemPool, cudaErrorCudartUnloading is a possible error with cudaFree
+    // So no more cuda calls here; arenas are only released by free_chunks()
+  }
+
+  // Hands every arena back to the allocator, even if chunks are still in use.
+  void free_chunks()
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+    while (!m_arenas.empty()) {
+      void *allocation_ptr = m_arenas.front().get_allocation();
+      m_alloc.free(allocation_ptr);
+      m_arenas.pop_front();
+    }
+  }
+
+  // Size in bytes that will be requested for the next new arena.
+  size_t arena_size()
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+    return m_default_arena_size;
+  }
+
+  // Changes the size used for future arenas; returns the previous setting.
+  size_t arena_size(size_t new_size)
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+    size_t prev_size = m_default_arena_size;
+    m_default_arena_size = new_size;
+    return prev_size;
+  }
+
+  // Returns storage for nTs objects of type T aligned to alignment, or
+  // nullptr when the byte count overflows size_t or the allocator fails.
+  template <typename T>
+  T* malloc(size_t nTs, size_t alignment = alignof(T))
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+    // nTs*sizeof(T) + alignment must not wrap, or the chunk would be too small
+    const size_t max_size = static_cast<size_t>(-1);
+    if (nTs > (max_size - alignment) / sizeof(T)) { return nullptr; }
+    const size_t size = nTs*sizeof(T);
+    // first try the arenas that already exist
+    void* ptr = nullptr;
+    for (detail::MemoryArena& arena : m_arenas) {
+      ptr = arena.get(size, alignment);
+      if (ptr != nullptr) { break; }
+    }
+
+    if (ptr == nullptr) {
+      // size+alignment leaves room to align the request inside a fresh arena;
+      // plain comparison instead of std::max, as <algorithm> is not included
+      const size_t needed = size + alignment;
+      const size_t alloc_size = (needed > m_default_arena_size) ? needed : m_default_arena_size;
+      void* arena_ptr = m_alloc.malloc(alloc_size);
+      if (arena_ptr != nullptr) {
+        m_arenas.emplace_front(arena_ptr, alloc_size);
+        ptr = m_arenas.front().get(size, alignment);
+      }
+    }
+    return static_cast<T*>(ptr);
+  }
+
+  // Gives cptr back to the arena that owns it. A non-null pointer that no
+  // arena claims is reported on stderr; nullptr is ignored.
+  void free(const void* cptr)
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+    void* ptr = const_cast<void*>(cptr);
+    for (detail::MemoryArena& arena : m_arenas) {
+      if (arena.give(ptr)) { return; }
+    }
+    if (ptr != nullptr) {
+      fprintf(stderr, "Unknown pointer %p", ptr);
+    }
+  }
+
+private:
+  using arena_container_type = std::list<detail::MemoryArena>;
+
+#if defined(RAJA_ENABLE_OPENMP)
+  omp::mutex m_mutex;
+#endif
+
+  arena_container_type m_arenas;
+  size_t m_default_arena_size;
+  allocator_t m_alloc;
+};
+
+//! Sample host allocator for basic_mempool, backed by std::malloc/std::free.
+struct generic_allocator {
+
+  //! Obtain nbytes of raw storage; the result is whatever std::malloc
+  //! hands back, i.e. nullptr when the request cannot be satisfied.
+  void* malloc(size_t nbytes) { return std::malloc(nbytes); }
+
+  //! Release storage previously obtained from malloc().
+  //! std::free has no way to report failure, so this
+  //! unconditionally signals success.
+  bool free(void* ptr)
+  {
+    std::free(ptr);
+    return true;
+  }
+
+};
+
+} /* end namespace basic_mempool */
+
+} /* end namespace RAJA */
+
+
+
+#endif /* RAJA_BASIC_MEMPOOL_HPP */
diff --git a/include/RAJA/util/chai_support.hpp b/include/RAJA/util/chai_support.hpp
index 5d0100ec55..ecea5aca90 100644
--- a/include/RAJA/util/chai_support.hpp
+++ b/include/RAJA/util/chai_support.hpp
@@ -1,3 +1,56 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file defining internal type related constructs for
+ *          interacting with CHAI.
+ *
+ ******************************************************************************
+ */
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/LICENSE.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
 #ifndef RAJA_DETAIL_RAJA_CHAI_HPP
 #define RAJA_DETAIL_RAJA_CHAI_HPP
 
@@ -34,7 +87,7 @@ struct get_space_impl<Platform::host> {
 };
 
 #if defined(RAJA_ENABLE_CUDA)
-template <>
+template<>
 struct get_space_impl<Platform::cuda> {
   static constexpr chai::ExecutionSpace value = chai::GPU;
 };
diff --git a/include/RAJA/util/concepts.hpp b/include/RAJA/util/concepts.hpp
index 996b035832..a6bb41e3e8 100644
--- a/include/RAJA/util/concepts.hpp
+++ b/include/RAJA/util/concepts.hpp
@@ -10,11 +10,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_concepts_HPP
-#define RAJA_concepts_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -55,6 +52,10 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_concepts_HPP
+#define RAJA_concepts_HPP
+
+#include "camp/concepts.hpp"
 #include <iterator>
 #include <type_traits>
 
@@ -63,482 +64,15 @@ namespace RAJA
 
 namespace concepts
 {
-
-namespace metalib
-{
-
-template <class T, T v>
-struct integral_constant {
-  static constexpr T value = v;
-  using value_type = T;
-  using type = integral_constant;
-  constexpr operator value_type() const noexcept { return value; }
-  constexpr value_type operator()() const noexcept { return value; }
-};
-
-template <bool B>
-using bool_ = integral_constant<bool, B>;
-template <int I>
-using int_ = integral_constant<int, I>;
-
-using true_type = bool_<true>;
-using false_type = bool_<false>;
-
-template <typename...>
-struct list;
-
-namespace impl
-{
-
-#ifdef __clang__
-
-// Clang is faster with this implementation
-template <typename, typename = bool>
-struct _if_ {
-};
-
-template <typename If>
-struct _if_<list<If>, decltype(bool(If::type::value))>
-    : std::enable_if<If::type::value> {
-};
-
-template <typename If, typename Then>
-struct _if_<list<If, Then>, decltype(bool(If::type::value))>
-    : std::enable_if<If::type::value, Then> {
-};
-
-template <typename If, typename Then, typename Else>
-struct _if_<list<If, Then, Else>, decltype(bool(If::type::value))>
-    : std::conditional<If::type::value, Then, Else> {
-};
-
-#else
-
-// GCC seems to prefer this implementation
-template <typename, typename = true_type>
-struct _if_ {
-};
-
-template <typename If>
-struct _if_<list<If>, bool_<If::type::value>> {
-  using type = void;
-};
-
-template <typename If, typename Then>
-struct _if_<list<If, Then>, bool_<If::type::value>> {
-  using type = Then;
-};
-
-template <typename If, typename Then, typename Else>
-struct _if_<list<If, Then, Else>, bool_<If::type::value>> {
-  using type = Then;
-};
-
-template <typename If, typename Then, typename Else>
-struct _if_<list<If, Then, Else>, bool_<!If::type::value>> {
-  using type = Else;
-};
-
-#endif
-
-}  // namespace detail
-
-template <typename... Ts>
-using if_ = typename impl::_if_<list<Ts...>>::type;
-
-template <bool If, typename... Args>
-using if_c = typename impl::_if_<list<bool_<If>, Args...>>::type;
-
-
-template <typename T, typename U>
-struct is_same : false_type {
-};
-
-template <typename T>
-struct is_same<T, T> : true_type {
-};
-
-/// bool list -- use for {all,none,any}_of metafunctions
-template <bool...>
-struct blist;
-
-/// negation metafunction of a value type
-template <typename T>
-struct negate_t : bool_<!T::value> {
-};
-
-/// all_of metafunction of a value type list -- all must be "true"
-template <bool... Bs>
-struct all_of : metalib::is_same<blist<true, Bs...>, blist<Bs..., true>> {
-};
-
-/// none_of metafunction of a value type list -- all must be "false"
-template <bool... Bs>
-struct none_of : metalib::is_same<blist<false, Bs...>, blist<Bs..., false>> {
-};
-
-/// any_of metafunction of a value type list -- at least one must be "true""
-template <bool... Bs>
-struct any_of : negate_t<none_of<Bs...>> {
-};
-
-/// all_of metafunction of a bool list -- all must be "true"
-template <typename... Bs>
-struct all_of_t : all_of<Bs::value...> {
-};
-
-/// none_of metafunction of a bool list -- all must be "false"
-template <typename... Bs>
-struct none_of_t : none_of<Bs::value...> {
-};
-
-/// any_of metafunction of a bool list -- at least one must be "true""
-template <typename... Bs>
-struct any_of_t : any_of<Bs::value...> {
-};
-
-}  // end namespace metalib
-
-}  // end namespace concepts
-
-}  // end namespace RAJA
-
-template <typename... T>
-RAJA::concepts::metalib::true_type ___valid_expr___(T &&...) noexcept;
-#define DefineConcept(...) decltype(___valid_expr___(__VA_ARGS__))
-
-#define DefineTypeTraitFromConcept(TTName, ConceptName)             \
-  template <typename... Args>                                       \
-  struct TTName : RAJA::concepts::requires_<ConceptName, Args...> { \
-  }
-
-namespace RAJA
-{
-
-namespace concepts
-{
-
-namespace detail
-{
-
-template <class...>
-struct TL {
-};
-
-template <class...>
-struct voider {
-  using type = void;
-};
-
-template <class Default,
-          class /* always void*/,
-          template <class...> class Concept,
-          class TArgs>
-struct detector {
-  using value_t = metalib::false_type;
-  using type = Default;
-};
-
-template <class Default, template <class...> class Concept, class... Args>
-struct detector<Default,
-                typename voider<Concept<Args...>>::type,
-                Concept,
-                TL<Args...>> {
-  using value_t = metalib::true_type;
-  using type = Concept<Args...>;
-};
-
-template <template <class...> class Concept, class TArgs>
-using is_detected = detector<void, void, Concept, TArgs>;
-
-template <template <class...> class Concept, class TArgs>
-using detected = typename is_detected<Concept, TArgs>::value_t;
-
-
-template <typename Ret, typename T>
-Ret returns(T const &) noexcept;
-
-}  // end namespace detail
-
-template <typename T>
-using negate = metalib::negate_t<T>;
-
-using concepts::metalib::bool_;
-
-/// metafunction to get instance of value type for concepts
-template <typename T>
-auto val() noexcept -> decltype(std::declval<T>());
-
-/// metafunction to get instance of const type for concepts
-template <typename T>
-auto cval() noexcept -> decltype(std::declval<T const>());
-
-/// metafunction for use within decltype expression to validate return type is
-/// convertible to given type
-template <typename T, typename U>
-constexpr auto convertible_to(U &&u) noexcept
-    -> decltype(detail::returns<metalib::true_type>(static_cast<T>((U &&) u)));
-
-/// metafunction for use within decltype expression to validate type of
-/// expression
-template <typename T, typename U>
-constexpr auto has_type(U &&) noexcept
-    -> metalib::if_<metalib::is_same<T, U>, metalib::true_type>;
-
-template <typename BoolLike>
-constexpr auto is(BoolLike) noexcept
-    -> metalib::if_<BoolLike, metalib::true_type>;
-
-template <typename BoolLike>
-constexpr auto is_not(BoolLike) noexcept
-    -> metalib::if_c<!BoolLike::value, metalib::true_type>;
-
-/// metaprogramming concept for SFINAE checking of aggregating concepts
-template <typename... Args>
-struct all_of : metalib::all_of_t<Args...> {
-};
-
-/// metaprogramming concept for SFINAE checking of aggregating concepts
-template <typename... Args>
-struct none_of : metalib::none_of_t<Args...> {
-};
-
-/// metaprogramming concept for SFINAE checking of aggregating concepts
-template <typename... Args>
-struct any_of : metalib::any_of_t<Args...> {
-};
-
-/// SFINAE multiple type traits
-template <typename... Args>
-using enable_if = typename std::enable_if<all_of<Args...>::value, void>::type;
-
-/// SFINAE concept checking
-template <template <class...> class Op, class... Args>
-struct requires_ : detail::detected<Op, detail::TL<Args...>> {
-};
-
-namespace types
-{
-
-template <typename T>
-using decay_t =
-    typename std::remove_reference<typename std::remove_cv<T>::type>::type;
-
-template <typename T>
-using plain_t = typename std::remove_reference<T>::type;
-
-template <typename T>
-using diff_t = decltype(val<plain_t<T>>() - val<plain_t<T>>());
-
-template <typename T>
-using iterator_t = decltype(std::begin(val<plain_t<T>>()));
-
-}  // end namespace types
-
-template <typename T>
-struct LessThanComparable
-    : DefineConcept(convertible_to<bool>(val<T>() < val<T>())) {
-};
-
-template <typename T>
-struct GreaterThanComparable
-    : DefineConcept(convertible_to<bool>(val<T>() > val<T>())) {
-};
-
-template <typename T>
-struct LessEqualComparable
-    : DefineConcept(convertible_to<bool>(val<T>() <= val<T>())) {
-};
-
-template <typename T>
-struct GreaterEqualComparable
-    : DefineConcept(convertible_to<bool>(val<T>() >= val<T>())) {
-};
-
-template <typename T>
-struct EqualityComparable
-    : DefineConcept(convertible_to<bool>(val<T>() == val<T>())) {
-};
-
-template <typename T, typename U>
-struct ComparableTo
-    : DefineConcept(convertible_to<bool>(val<U>() < val<T>()),
-                    convertible_to<bool>(val<T>() < val<U>()),
-                    convertible_to<bool>(val<U>() <= val<T>()),
-                    convertible_to<bool>(val<T>() <= val<U>()),
-                    convertible_to<bool>(val<U>() > val<T>()),
-                    convertible_to<bool>(val<T>() > val<U>()),
-                    convertible_to<bool>(val<U>() >= val<T>()),
-                    convertible_to<bool>(val<T>() >= val<U>()),
-                    convertible_to<bool>(val<U>() == val<T>()),
-                    convertible_to<bool>(val<T>() == val<U>()),
-                    convertible_to<bool>(val<U>() != val<T>()),
-                    convertible_to<bool>(val<T>() != val<U>())) {
-};
-
-template <typename T>
-struct Comparable : ComparableTo<T, T> {
-};
-
-template <typename T>
-struct Arithmetic : DefineConcept(is(std::is_arithmetic<T>())) {
-};
-
-template <typename T>
-struct FloatingPoint : DefineConcept(is(std::is_floating_point<T>())) {
-};
-
-template <typename T>
-struct Integral : DefineConcept(is(std::is_integral<T>())) {
-};
-
-template <typename T>
-struct Signed : DefineConcept(Integral<T>(), is(std::is_signed<T>())) {
-};
-
-template <typename T>
-struct Unsigned : DefineConcept(Integral<T>(), is(std::is_unsigned<T>())) {
-};
-
-template <typename T>
-struct Iterator
-    : DefineConcept(is_not(Integral<T>()),  // hacky NVCC 8 workaround
-                    *(val<T>()),
-                    has_type<T &>(++val<T &>())) {
-};
-
-template <typename T>
-struct ForwardIterator
-    : DefineConcept(Iterator<T>(), val<T &>()++, *val<T &>()++) {
-};
-
-template <typename T>
-struct BidirectionalIterator
-    : DefineConcept(ForwardIterator<T>(),
-                    has_type<T &>(--val<T &>()),
-                    convertible_to<T const &>(val<T &>()--),
-                    *val<T &>()--) {
-};
-
-template <typename T>
-struct RandomAccessIterator
-    : DefineConcept(BidirectionalIterator<T>(),
-                    Comparable<T>(),
-                    has_type<T &>(val<T &>() += val<types::diff_t<T>>()),
-                    has_type<T>(val<T>() + val<types::diff_t<T>>()),
-                    has_type<T>(val<types::diff_t<T>>() + val<T>()),
-                    has_type<T &>(val<T &>() -= val<types::diff_t<T>>()),
-                    has_type<T>(val<T>() - val<types::diff_t<T>>()),
-                    val<T>()[val<types::diff_t<T>>()]) {
-};
-
-template <typename T>
-struct HasBeginEnd : DefineConcept(std::begin(val<T>()), std::end(val<T>())) {
-};
-
-template <typename T>
-struct Range
-    : DefineConcept(HasBeginEnd<T>(), Iterator<types::iterator_t<T>>()) {
-};
-
-template <typename T>
-struct ForwardRange
-    : DefineConcept(HasBeginEnd<T>(), ForwardIterator<types::iterator_t<T>>()) {
-};
-
-template <typename T>
-struct BidirectionalRange
-    : DefineConcept(HasBeginEnd<T>(),
-                    BidirectionalIterator<types::iterator_t<T>>()) {
-};
-
-template <typename T>
-struct RandomAccessRange
-    : DefineConcept(HasBeginEnd<T>(),
-                    RandomAccessIterator<types::iterator_t<T>>()) {
-};
-
-}  // end namespace concepts
+using namespace camp::concepts;
+}
 
 namespace type_traits
 {
-DefineTypeTraitFromConcept(is_iterator, RAJA::concepts::Iterator);
-DefineTypeTraitFromConcept(is_forward_iterator,
-                           RAJA::concepts::ForwardIterator);
-DefineTypeTraitFromConcept(is_bidirectional_iterator,
-                           RAJA::concepts::BidirectionalIterator);
-DefineTypeTraitFromConcept(is_random_access_iterator,
-                           RAJA::concepts::RandomAccessIterator);
-
-DefineTypeTraitFromConcept(is_range, RAJA::concepts::Range);
-DefineTypeTraitFromConcept(is_forward_range, RAJA::concepts::ForwardRange);
-DefineTypeTraitFromConcept(is_bidirectional_range,
-                           RAJA::concepts::BidirectionalRange);
-DefineTypeTraitFromConcept(is_random_access_range,
-                           RAJA::concepts::RandomAccessRange);
-
-DefineTypeTraitFromConcept(is_comparable, RAJA::concepts::Comparable);
-DefineTypeTraitFromConcept(is_comparable_to, RAJA::concepts::ComparableTo);
-
-DefineTypeTraitFromConcept(is_arithmetic, RAJA::concepts::Arithmetic);
-DefineTypeTraitFromConcept(is_floating_point, RAJA::concepts::FloatingPoint);
-DefineTypeTraitFromConcept(is_integral, RAJA::concepts::Integral);
-DefineTypeTraitFromConcept(is_signed, RAJA::concepts::Signed);
-DefineTypeTraitFromConcept(is_unsigned, RAJA::concepts::Unsigned);
-
-template <typename T>
-using IterableValue = decltype(*std::begin(RAJA::concepts::val<T>()));
-
-template <typename T>
-using IteratorValue = decltype(*RAJA::concepts::val<T>());
-
-namespace detail
-{
-
-template <typename, template <typename...> class, typename...>
-struct IsSpecialized : RAJA::concepts::metalib::false_type {
-};
-
-template <template <typename...> class Template, typename... T>
-struct IsSpecialized<typename concepts::detail::voider<decltype(
-                         concepts::val<Template<T...>>())>::type,
-                     Template,
-                     T...> : RAJA::concepts::metalib::true_type {
-};
-
-template <template <class...> class, template <class...> class, bool, class...>
-struct SpecializationOf : RAJA::concepts::metalib::false_type {
-};
-
-template <template <class...> class Expected,
-          template <class...> class Actual,
-          class... Args>
-struct SpecializationOf<Expected, Actual, true, Args...>
-    : RAJA::concepts::metalib::is_same<Expected<Args...>, Actual<Args...>> {
-};
-
-}  // end namespace detail
-
-
-template <template <class...> class Outer, class... Args>
-using IsSpecialized = detail::IsSpecialized<void, Outer, Args...>;
-
-template <template <class...> class, typename T>
-struct SpecializationOf : RAJA::concepts::metalib::false_type {
-};
-
-template <template <class...> class Expected,
-          template <class...> class Actual,
-          class... Args>
-struct SpecializationOf<Expected, Actual<Args...>>
-    : detail::SpecializationOf<Expected,
-                               Actual,
-                               IsSpecialized<Expected, Args...>::value,
-                               Args...> {
-};
-
-}  // end namespace type_traits
+using namespace camp::type_traits;
+}
 
 }  // end namespace RAJA
 
 #endif
+
diff --git a/include/RAJA/util/defines.hpp b/include/RAJA/util/defines.hpp
index 2c84574311..2d9ed041b9 100644
--- a/include/RAJA/util/defines.hpp
+++ b/include/RAJA/util/defines.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_INTERNAL_DEFINES_HPP
-#define RAJA_INTERNAL_DEFINES_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -53,6 +50,9 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_INTERNAL_DEFINES_HPP
+#define RAJA_INTERNAL_DEFINES_HPP
+
 #include "RAJA/config.hpp"
 
 #include <cstdlib>
@@ -68,11 +68,15 @@
 #define RAJA_HOST_DEVICE __host__ __device__
 #define RAJA_DEVICE __device__
 
+#if defined(RAJA_ENABLE_CLANG_CUDA)
+#define RAJA_SUPPRESS_HD_WARN
+#else
 #if defined(_WIN32)  // windows is non-compliant, yay
 #define RAJA_SUPPRESS_HD_WARN __pragma(nv_exec_check_disable)
 #else
 #define RAJA_SUPPRESS_HD_WARN _Pragma("nv_exec_check_disable")
 #endif
+#endif
 
 #else
 
@@ -122,7 +126,8 @@
  * \endcode
  *******************************************************************************
  */
-#define RAJA_UNUSED_VAR(_x) static_cast<void>(_x)
+template < typename... Args >  // no-op sink: accepts any arguments, uses none
+RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(Args&&...) noexcept {}
 
 /*!
  * \def RAJA_STRINGIFY_HELPER(x)
@@ -138,14 +143,10 @@
  */
 #define RAJA_STRINGIFY_MACRO(x) RAJA_STRINGIFY_HELPER(x)
 
-/*!
- * \def RAJA_DIVIDE_CEILING_INT(dividend, divisor)
- *
- * Macro to find ceiling (dividend / divisor) for integer types
- */
 #define RAJA_DIVIDE_CEILING_INT(dividend, divisor) \
   (((dividend) + (divisor)-1) / (divisor))
 
+
 inline void RAJA_ABORT_OR_THROW(const char *str)
 {
   if (std::getenv("RAJA_NO_EXCEPT") != nullptr) {
diff --git a/include/RAJA/policy/tbb/fwd.hpp b/include/RAJA/util/mutex.hpp
similarity index 56%
rename from include/RAJA/policy/tbb/fwd.hpp
rename to include/RAJA/util/mutex.hpp
index fcafea86b1..3804ae6894 100644
--- a/include/RAJA/policy/tbb/fwd.hpp
+++ b/include/RAJA/util/mutex.hpp
@@ -1,3 +1,18 @@
+/*!
+******************************************************************************
+*
+* \file
+*
+* \brief   Header file providing functionality similar to std mutex header.
+*
+******************************************************************************
+*/
+
+#ifndef RAJA_util_mutex_HPP
+#define RAJA_util_mutex_HPP
+
+#include "RAJA/config.hpp"
+
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 // Copyright (c) 2016, Lawrence Livermore National Security, LLC.
 //
@@ -40,64 +55,89 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file containing RAJA segment template methods for
- *          execution via CUDA kernel launch.
- *
- *          These methods should work on any platform that supports
- *          CUDA devices.
- *
- ******************************************************************************
- */
-
-#ifndef RAJA_forward_tbb_HXX
-#define RAJA_forward_tbb_HXX
-
-#include <type_traits>
-
-#include "RAJA/config.hpp"
-
-#include "RAJA/policy/tbb/policy.hpp"
+#if defined(RAJA_ENABLE_OPENMP)
+#include <omp.h>
+#endif
 
 namespace RAJA
 {
 
-namespace impl
+#if defined(RAJA_ENABLE_OPENMP)
+namespace omp
 {
 
-template <typename Iterable, typename Func>
-RAJA_INLINE void forall(const tbb_for_dynamic& p,
-                        Iterable&& iter,
-                        Func&& loop_body);
-
-template <typename Iterable, typename IndexType, typename Func>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const tbb_for_dynamic& p,
-              Iterable&& iter,
-              IndexType icount,
-              Func&& loop_body);
-
-template <typename Iterable, typename Func, size_t ChunkSize>
-RAJA_INLINE void forall(const tbb_for_static<ChunkSize>&,
-                        Iterable&& iter,
-                        Func&& loop_body);
-
-template <typename Iterable,
-          typename IndexType,
-          typename Func,
-          size_t ChunkSize>
-RAJA_INLINE typename std::enable_if<std::is_integral<IndexType>::value>::type
-forall_Icount(const tbb_for_static<ChunkSize>&,
-              Iterable&& iter,
-              IndexType icount,
-              Func&& loop_body);
-
-}  // closing brace for impl namespace
-
-}  // closing brace for RAJA namespace
+//! Wrapper giving an omp_lock_t the member interface of std::mutex
+class mutex {
+public:
+  using native_handle_type = omp_lock_t;
+
+  mutex()
+  {
+    omp_init_lock(&m_lock);
+  }
+
+  ~mutex()
+  {
+    omp_destroy_lock(&m_lock);
+  }
+
+  mutex( const mutex& ) = delete;
+  mutex( mutex&& ) = delete;
+  mutex& operator=( const mutex& ) = delete;
+  mutex& operator=( mutex&& ) = delete;
+
+  void lock()
+  {
+    omp_set_lock(&m_lock);
+  }
+
+  void unlock()
+  {
+    omp_unset_lock(&m_lock);
+  }
+
+  bool try_lock()
+  {
+    return 0 != omp_test_lock(&m_lock);  // nonzero result: the lock was taken
+  }
+
+  native_handle_type& native_handle()
+  {
+    return m_lock;
+  }
+
+private:
+  native_handle_type m_lock;
+};
+
+} // namespace omp
+#endif  // closing endif for if defined(RAJA_ENABLE_OPENMP)
+
+//! Scoped lock modeled on std::lock_guard: locks in ctor, unlocks in dtor
+template < typename mutex_type >
+class lock_guard {
+  // Held by reference, so the mutex has to outlive the guard.
+  mutex_type& m_mutex;
+
+public:
+  explicit lock_guard( mutex_type& m )
+    : m_mutex(m)
+  {
+    m_mutex.lock();
+  }
+
+  ~lock_guard()
+  {
+    m_mutex.unlock();
+  }
+
+  // Neither copyable nor movable.
+  lock_guard( const lock_guard& ) = delete;
+  lock_guard( lock_guard&& ) = delete;
+  lock_guard& operator=( const lock_guard& ) = delete;
+  lock_guard& operator=( lock_guard&& ) = delete;
+};
+
+}  // namespace RAJA
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp
index bb418ab594..14233e5bdb 100644
--- a/include/RAJA/util/types.hpp
+++ b/include/RAJA/util/types.hpp
@@ -10,11 +10,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_Types_HPP
-#define RAJA_Types_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -55,7 +52,11 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_Types_HPP
+#define RAJA_Types_HPP
+
 #include "RAJA/config.hpp"
+#include "camp/helpers.hpp"
 
 #include <cstddef>
 
diff --git a/include/camp/.clang-format b/include/camp/.clang-format
new file mode 100644
index 0000000000..9ed1dc1194
--- /dev/null
+++ b/include/camp/.clang-format
@@ -0,0 +1,28 @@
+BasedOnStyle : google
+IndentWidth : 2
+BreakBeforeBraces : Linux
+KeepEmptyLinesAtTheStartOfBlocks : true
+MaxEmptyLinesToKeep : 2
+AccessModifierOffset : -2
+UseTab: Never
+AllowShortIfStatementsOnASingleLine : true
+ConstructorInitializerAllOnOneLineOrOnePerLine : true
+AllowShortFunctionsOnASingleLine : true
+AllowShortLoopsOnASingleLine : false
+BinPackParameters : false
+AllowAllParametersOfDeclarationOnNextLine : false
+AlignTrailingComments : true
+ColumnLimit : 80
+PenaltyBreakBeforeFirstCallParameter : 100
+PenaltyReturnTypeOnItsOwnLine : 65000
+PenaltyBreakString : 10
+NamespaceIndentation : Inner
+
+# These improve formatting results but require clang 3.6/7 or higher
+BreakBeforeBinaryOperators : NonAssignment
+AlignAfterOpenBracket: true
+BinPackArguments : false
+AlignOperands : true
+AlwaysBreakTemplateDeclarations : true
+Cpp11BracedListStyle : true
+
diff --git a/include/camp/camp.hpp b/include/camp/camp.hpp
new file mode 100644
index 0000000000..929eb4e8b0
--- /dev/null
+++ b/include/camp/camp.hpp
@@ -0,0 +1,240 @@
+#ifndef __CAMP_HPP
+#define __CAMP_HPP
+
+#include <array>
+#include <type_traits>
+
+#include "camp/defines.hpp"
+#include "camp/helpers.hpp"
+#include "camp/lambda.hpp"
+#include "camp/list/at.hpp"
+#include "camp/list/find_if.hpp"
+#include "camp/number.hpp"
+#include "camp/tuple.hpp"
+#include "camp/value.hpp"
+
+namespace camp
+{
+// Fwd
+template <typename... Ts>
+struct list;
+template <typename T>
+struct as_array;
+template <typename T>
+struct size;
+template <typename Seq>
+struct flatten;
+
+// Sequences
+//// list
+
+template <typename Seq, typename T>
+struct append;  // declaration only; defined for list<> below
+template <typename... Elements, typename T>
+struct append<list<Elements...>, T> {
+  using type = list<Elements..., T>;  // T is placed after the existing elements
+};
+
+template <typename Seq, typename T>
+struct prepend;  // declaration only; defined for list<> below
+template <typename... Elements, typename T>
+struct prepend<list<Elements...>, T> {
+  using type = list<T, Elements...>;  // T goes in front of the existing elements
+};
+
+template <typename Seq, typename T>
+struct extend;  // declaration only; defined for a pair of list<> below
+template <typename... Elements, typename... NewElements>
+struct extend<list<Elements...>, list<NewElements...>> {
+  using type = list<Elements..., NewElements...>;  // concatenation, order kept
+};
+
+namespace detail
+{  // recursive worker behind camp::flatten
+  template <typename CurSeq, size_t N, typename... Rest>
+  struct flatten_impl;  // CurSeq: result so far, N: elements still in Rest
+  template <typename CurSeq>
+  struct flatten_impl<CurSeq, 0> {  // nothing left to consume
+    using type = CurSeq;
+  };
+  template <typename... CurSeqElements,
+            size_t N,
+            typename First,
+            typename... Rest>  // generic element: appended to the result as is
+  struct flatten_impl<list<CurSeqElements...>, N, First, Rest...> {
+    using type = typename flatten_impl<list<CurSeqElements..., First>,
+                                       N - 1,
+                                       Rest...>::type;
+  };
+  template <typename... CurSeqElements,
+            size_t N,
+            typename... FirstInnerElements,
+            typename... Rest>  // next element is itself a list
+  struct flatten_impl<list<CurSeqElements...>,
+                      N,
+                      list<FirstInnerElements...>,
+                      Rest...> {
+    using first_inner_flat =  // flatten the nested list on its own first
+        typename flatten_impl<list<>,
+                              sizeof...(FirstInnerElements),
+                              FirstInnerElements...>::type;
+    using cur_and_first =  // then splice its elements onto the result so far
+        typename extend<list<CurSeqElements...>, first_inner_flat>::type;
+    using type = typename flatten_impl<cur_and_first, N - 1, Rest...>::type;
+  };
+}  // end namespace detail
+
+template <typename... Elements>
+struct flatten<list<Elements...>>  // ::type has every nested list<> inlined
+    : detail::flatten_impl<list<>, sizeof...(Elements), Elements...> {
+};
+
+#if defined(CAMP_TEST)
+namespace test
+{
+  CHECK_TSAME((flatten<list<>>), (list<>));
+  CHECK_TSAME((flatten<list<int>>), (list<int>));
+  CHECK_TSAME((flatten<list<list<int>>>), (list<int>));
+  CHECK_TSAME((flatten<list<list<list<int>>>>), (list<int>));
+  CHECK_TSAME((flatten<list<float, list<int, double>, list<list<int>>>>),
+              (list<float, int, double, int>));
+}
+#endif
+
+template <template <typename...> class Op, typename T>
+struct transform;  // declaration only; defined for list<> below
+template <template <typename...> class Op, typename... Elements>
+struct transform<Op, list<Elements...>> {
+  using type = list<typename Op<Elements>::type...>;  // Op applied per element
+};
+
+#if defined(CAMP_TEST)
+namespace test
+{
+  CHECK_TSAME((transform<std::add_cv, list<int>>), (list<const volatile int>));
+  CHECK_TSAME((transform<std::remove_reference, list<int&, int&>>),
+              (list<int, int>));
+}
+#endif
+
+namespace detail
+{  // recursive worker behind camp::accumulate
+  template <template <typename...> class Op, typename Current, typename... Rest>
+  struct accumulate_impl;
+  template <template <typename...> class Op,
+            typename Current,
+            typename First,
+            typename... Rest>
+  struct accumulate_impl<Op, Current, First, Rest...> {
+    using current = typename Op<Current, First>::type;  // fold in one element
+    using type = typename accumulate_impl<Op, current, Rest...>::type;
+  };
+  template <template <typename...> class Op, typename Current>
+  struct accumulate_impl<Op, Current> {  // no elements left: result is Current
+    using type = Current;
+  };
+}  // end namespace detail
+
+template <template <typename...> class Op, typename Initial, typename Seq>
+struct accumulate;  // declaration only; defined for list<> below
+template <template <typename...> class Op,
+          typename Initial,
+          typename... Elements>
+struct accumulate<Op, Initial, list<Elements...>> {  // left fold, first to last
+  using type = typename detail::accumulate_impl<Op, Initial, Elements...>::type;
+};
+
+CAMP_MAKE_L(accumulate);
+
+#if defined(CAMP_TEST)
+namespace test
+{
+  CHECK_TSAME((accumulate<append, list<>, list<int, float, double>>),
+              (list<int, float, double>));
+}
+#endif
+
+template <template <typename...> class Op, typename Seq>
+struct filter;  // declaration only; defined for list<> below
+
+template <template <typename...> class Op, typename... Elements>
+struct filter<Op, list<Elements...>> {
+  template <typename Seq, typename T>
+  using append_if =  // presumably Seq plus T when Op<T>::type is true, else Seq
+      if_<typename Op<T>::type, typename append<Seq, T>::type, Seq>;
+  using type = typename accumulate<append_if, list<>, list<Elements...>>::type;
+};
+
+CAMP_MAKE_L(filter);
+
+#if defined(CAMP_TEST)
+namespace test
+{
+  CHECK_TSAME((filter<std::is_pointer, list<int, float*, double, short*>>),
+              (list<float*, short*>));
+}
+#endif
+
+namespace detail
+{
+  template <typename T>
+  struct _as_list;  // declaration only; the two cases below define it
+  template <template <typename...> class T, typename... Args>
+  struct _as_list<T<Args...>> {  // type-only template: reuse its arguments
+    using type = list<Args...>;
+  };
+  template <typename T, T... Args>
+  struct _as_list<int_seq<T, Args...>> {  // integer sequence: wrap each value
+    using type = list<integral_constant<T, Args>...>;
+  };
+} /* detail */
+
+template <typename T>
+struct as_list_s : detail::_as_list<T>::type {  // derives from the list itself
+};
+
+template <typename T>
+using as_list = typename as_list_s<T>::type;  // NOTE(review): needs list<>::type
+
+//// size: element count, exposed as ::value and as the type num<count>
+template <typename... Args>
+struct size<list<Args...>> {
+  constexpr static idx_t value{sizeof...(Args)};
+  using type = num<sizeof...(Args)>;
+};
+
+#if defined(CAMP_TEST)
+namespace test
+{
+  CHECK_IEQ((size<list<int>>), (1));
+  CHECK_IEQ((size<list<int, int>>), (2));
+  CHECK_IEQ((size<list<int, int, int>>), (3));
+}
+#endif
+
+template <typename T, T... Args>
+struct size<int_seq<T, Args...>> {  // number of integers in the sequence
+  constexpr static idx_t value{sizeof...(Args)};
+  using type = num<sizeof...(Args)>;
+};
+
+#if defined(CAMP_TEST)
+namespace test
+{
+  CHECK_IEQ((size<idx_seq<0>>), (1));
+  CHECK_IEQ((size<idx_seq<0, 0>>), (2));
+  CHECK_IEQ((size<idx_seq<0, 0, 0>>), (3));
+}
+#endif
+
+}  // end namespace camp
+
+#if defined(CAMP_TEST)
+int main()  // takes no arguments: nothing here reads the command line
+{
+  camp::tuple<int, float> b;  // default-constructs a tuple; nothing else runs
+  return 0;
+}
+#endif
+
+#endif /* __CAMP_HPP */
diff --git a/include/camp/concepts.hpp b/include/camp/concepts.hpp
new file mode 100644
index 0000000000..b6596baad4
--- /dev/null
+++ b/include/camp/concepts.hpp
@@ -0,0 +1,391 @@
+#ifndef CAMP_CONCEPTS_HPP
+#define CAMP_CONCEPTS_HPP
+
+#include <iterator>
+#include <type_traits>
+
+#include "camp/helpers.hpp"
+#include "camp/number.hpp"
+
+namespace camp
+{
+
+namespace concepts
+{
+
+  namespace metalib
+  {
+
+    template <typename T, typename U>
+    struct is_same_s : false_type {  // general case: two different types
+    };
+
+    template <typename T>
+    struct is_same_s<T, T> : true_type {  // both arguments are the same type
+    };
+
+#if defined(CAMP_COMPILER_MSVC)
+    template <typename...Ts>
+    using is_same = typename is_same_s<Ts...>::type;
+#else
+    template <typename T, typename U>
+    using is_same = typename is_same_s<T, U>::type;
+#endif
+
+
+    /// negation of a value type: derives from num<!T::value>
+    template <typename T>
+    struct negate_t : num<!T::value> {
+    };
+
+    /// all_of metafunction of a bool list -- all must be "true"
+    template <bool... Bs>
+    struct all_of : metalib::is_same<list<t, num<Bs>...>, list<num<Bs>..., t>> {
+    };
+
+    /// none_of metafunction of a bool list -- all must be "false"
+    template <bool... Bs>
+    struct none_of
+        : metalib::is_same<idx_seq<false, Bs...>, idx_seq<Bs..., false>> {
+    };
+
+    /// any_of metafunction of a bool list -- at least one must be "true"
+    template <bool... Bs>
+    struct any_of : negate_t<none_of<Bs...>> {
+    };
+
+    /// all_of metafunction of a value type list -- every ::value must be "true"
+    template <typename... Bs>
+    struct all_of_t : all_of<Bs::value...> {
+    };
+
+    /// none_of metafunction of a value type list -- every ::value must be "false"
+    template <typename... Bs>
+    struct none_of_t : none_of<Bs::value...> {
+    };
+
+    /// any_of metafunction of a value type list -- some ::value must be "true"
+    template <typename... Bs>
+    struct any_of_t : any_of<Bs::value...> {
+    };
+
+  }  // end namespace metalib
+
+}  // end namespace concepts
+}  // end namespace camp
+
+template <typename... T>  // declaration only; used unevaluated via DefineConcept
+camp::true_type ___valid_expr___(T &&...) noexcept;
+#define DefineConcept(...) decltype(___valid_expr___(__VA_ARGS__))
+
+#define DefineTypeTraitFromConcept(TTName, ConceptName)             \
+  template <typename... Args>                                       \
+  struct TTName : camp::concepts::requires_<ConceptName, Args...> { \
+  }
+namespace camp
+{
+namespace concepts
+{
+
+  namespace detail
+  {
+
+    template <class...>
+    struct TL {  // empty tag type that carries a pack of types
+    };
+
+    template <class...>
+    struct voider {  // ::type is void whatever the arguments are
+      using type = void;
+    };
+
+    template <class Default,
+              class /* always void*/,
+              template <class...> class Concept,
+              class TArgs>
+    struct detector {  // primary template: the "not detected" answer
+      using value_t = false_type;
+      using type = Default;
+    };
+
+    template <class Default, template <class...> class Concept, class... Args>
+    struct detector<Default,
+                    typename voider<Concept<Args...>>::type,
+                    Concept,
+                    TL<Args...>> {  // usable when Concept<Args...> can be named
+      using value_t = true_type;
+      using type = Concept<Args...>;
+    };
+
+    template <template <class...> class Concept, class TArgs>
+    using is_detected = detector<void, void, Concept, TArgs>;
+
+    template <template <class...> class Concept, class TArgs>
+    using detected = typename is_detected<Concept, TArgs>::value_t;
+
+
+    template <typename Ret, typename T>
+    Ret returns(T const &) noexcept;  // declaration only; no definition here
+
+  }  // end namespace detail
+
+  template <typename T>
+  using negate = metalib::negate_t<T>;
+
+  /// metafunction for use within decltype expression to validate return type is
+  /// convertible to given type
+  template <typename T, typename U>
+  constexpr auto convertible_to(U &&u) noexcept
+      -> decltype(detail::returns<camp::true_type>(static_cast<T>((U &&) u)));
+
+  /// metafunction for use within decltype expression to validate type of
+  /// expression
+  template <typename T, typename U>
+  constexpr auto has_type(U &&) noexcept -> metalib::is_same<T, U>;
+
+  template <typename BoolLike>
+  constexpr auto is(BoolLike) noexcept
+      -> camp::if_<BoolLike, camp::true_type, camp::false_type>;
+
+  template <typename BoolLike>
+  constexpr auto is_not(BoolLike) noexcept
+      -> camp::if_c<!BoolLike::value, camp::true_type, camp::false_type>;
+
+  /// aggregate of traits/concepts: forwards every Args::value to metalib::all_of
+  template <typename... Args>
+  struct all_of : metalib::all_of_t<Args...> {
+  };
+
+  /// aggregate of traits/concepts: forwards every Args::value to metalib::none_of
+  template <typename... Args>
+  struct none_of : metalib::none_of_t<Args...> {
+  };
+
+  /// aggregate of traits/concepts: forwards every Args::value to metalib::any_of
+  template <typename... Args>
+  struct any_of : metalib::any_of_t<Args...> {
+  };
+
+  /// SFINAE multiple type traits
+  template <typename... Args>
+  using enable_if = typename std::enable_if<all_of<Args...>::value, void>::type;
+
+  /// SFINAE concept checking
+  template <template <class...> class Op, class... Args>
+  struct requires_ : detail::detected<Op, detail::TL<Args...>> {
+  };
+
+  template <typename T>
+  struct Swappable
+      : DefineConcept(swap(val<T>(), val<T>())) {
+  };
+
+  template <typename T>
+  struct LessThanComparable
+      : DefineConcept(convertible_to<bool>(val<T>() < val<T>())) {
+  };
+
+  template <typename T>
+  struct GreaterThanComparable
+      : DefineConcept(convertible_to<bool>(val<T>() > val<T>())) {
+  };
+
+  template <typename T>
+  struct LessEqualComparable
+      : DefineConcept(convertible_to<bool>(val<T>() <= val<T>())) {
+  };
+
+  template <typename T>
+  struct GreaterEqualComparable
+      : DefineConcept(convertible_to<bool>(val<T>() >= val<T>())) {
+  };
+
+  template <typename T>
+  struct EqualityComparable
+      : DefineConcept(convertible_to<bool>(val<T>() == val<T>())) {
+  };
+
+  template <typename T, typename U>
+  struct ComparableTo
+      : DefineConcept(convertible_to<bool>(val<U>() < val<T>()),
+                      convertible_to<bool>(val<T>() < val<U>()),
+                      convertible_to<bool>(val<U>() <= val<T>()),
+                      convertible_to<bool>(val<T>() <= val<U>()),
+                      convertible_to<bool>(val<U>() > val<T>()),
+                      convertible_to<bool>(val<T>() > val<U>()),
+                      convertible_to<bool>(val<U>() >= val<T>()),
+                      convertible_to<bool>(val<T>() >= val<U>()),
+                      convertible_to<bool>(val<U>() == val<T>()),
+                      convertible_to<bool>(val<T>() == val<U>()),
+                      convertible_to<bool>(val<U>() != val<T>()),
+                      convertible_to<bool>(val<T>() != val<U>())) {
+  };
+
+  template <typename T>
+  struct Comparable : ComparableTo<T, T> {
+  };
+
+  template <typename T>
+  struct Arithmetic : DefineConcept(is(std::is_arithmetic<T>())) {
+  };
+
+  template <typename T>
+  struct FloatingPoint : DefineConcept(is(std::is_floating_point<T>())) {
+  };
+
+  template <typename T>
+  struct Integral : DefineConcept(is(std::is_integral<T>())) {
+  };
+
+  template <typename T>
+  struct Signed : DefineConcept(Integral<T>(), is(std::is_signed<T>())) {
+  };
+
+  template <typename T>
+  struct Unsigned : DefineConcept(Integral<T>(), is(std::is_unsigned<T>())) {
+  };
+
+  template <typename T>
+  struct Iterator
+      : DefineConcept(is_not(Integral<T>()),  // hacky NVCC 8 workaround
+                      *(val<T>()),
+                      has_type<T &>(++val<T &>())) {
+  };
+
+  template <typename T>
+  struct ForwardIterator
+      : DefineConcept(Iterator<T>(), val<T &>()++, *val<T &>()++) {
+  };
+
+  template <typename T>
+  struct BidirectionalIterator
+      : DefineConcept(ForwardIterator<T>(),
+                      has_type<T &>(--val<T &>()),
+                      convertible_to<T const &>(val<T &>()--),
+                      *val<T &>()--) {
+  };
+
+  template <typename T>
+  struct RandomAccessIterator
+      : DefineConcept(BidirectionalIterator<T>(),
+                      Comparable<T>(),
+                      has_type<T &>(val<T &>() += val<diff_from<T>>()),
+                      has_type<T>(val<T>() + val<diff_from<T>>()),
+                      has_type<T>(val<diff_from<T>>() + val<T>()),
+                      has_type<T &>(val<T &>() -= val<diff_from<T>>()),
+                      has_type<T>(val<T>() - val<diff_from<T>>()),
+                      val<T>()[val<diff_from<T>>()]) {
+  };
+
+  template <typename T>
+  struct HasBeginEnd : DefineConcept(std::begin(val<T>()), std::end(val<T>())) {
+  };
+
+  template <typename T>
+  struct Range
+      : DefineConcept(HasBeginEnd<T>(), Iterator<iterator_from<T>>()) {
+  };
+
+  template <typename T>
+  struct ForwardRange : DefineConcept(HasBeginEnd<T>(),
+                                      ForwardIterator<iterator_from<T>>()) {
+  };
+
+  template <typename T>
+  struct BidirectionalRange
+      : DefineConcept(HasBeginEnd<T>(),
+                      BidirectionalIterator<iterator_from<T>>()) {
+  };
+
+  template <typename T>
+  struct RandomAccessRange
+      : DefineConcept(HasBeginEnd<T>(),
+                      RandomAccessIterator<iterator_from<T>>()) {
+  };
+
+}  // end namespace concepts
+
+namespace type_traits
+{
+  DefineTypeTraitFromConcept(is_iterator, camp::concepts::Iterator);
+  DefineTypeTraitFromConcept(is_forward_iterator,
+                             camp::concepts::ForwardIterator);
+  DefineTypeTraitFromConcept(is_bidirectional_iterator,
+                             camp::concepts::BidirectionalIterator);
+  DefineTypeTraitFromConcept(is_random_access_iterator,
+                             camp::concepts::RandomAccessIterator);
+
+  DefineTypeTraitFromConcept(is_range, camp::concepts::Range);
+  DefineTypeTraitFromConcept(is_forward_range, camp::concepts::ForwardRange);
+  DefineTypeTraitFromConcept(is_bidirectional_range,
+                             camp::concepts::BidirectionalRange);
+  DefineTypeTraitFromConcept(is_random_access_range,
+                             camp::concepts::RandomAccessRange);
+
+  DefineTypeTraitFromConcept(is_comparable, camp::concepts::Comparable);
+  DefineTypeTraitFromConcept(is_comparable_to, camp::concepts::ComparableTo);
+
+  DefineTypeTraitFromConcept(is_arithmetic, camp::concepts::Arithmetic);
+  DefineTypeTraitFromConcept(is_floating_point, camp::concepts::FloatingPoint);
+  DefineTypeTraitFromConcept(is_integral, camp::concepts::Integral);
+  DefineTypeTraitFromConcept(is_signed, camp::concepts::Signed);
+  DefineTypeTraitFromConcept(is_unsigned, camp::concepts::Unsigned);
+
+  template <typename T>
+  using IterableValue = decltype(*std::begin(camp::val<T>()));
+
+  template <typename T>
+  using IteratorValue = decltype(*camp::val<T>());
+
+  namespace detail
+  {
+
+    template <typename, template <typename...> class, typename...>
+    struct IsSpecialized : camp::false_type {
+    };
+
+    template <template <typename...> class Template, typename... T>
+    struct IsSpecialized<typename concepts::detail::voider<decltype(
+                             camp::val<Template<T...>>())>::type,
+                         Template,
+                         T...> : camp::true_type {
+    };
+
+    template <template <class...> class,
+              template <class...> class,
+              bool,
+              class...>
+    struct SpecializationOf : camp::false_type {
+    };
+
+    template <template <class...> class Expected,
+              template <class...> class Actual,
+              class... Args>
+    struct SpecializationOf<Expected, Actual, true, Args...>
+        : camp::concepts::metalib::is_same<Expected<Args...>, Actual<Args...>> {
+    };
+
+  }  // end namespace detail
+
+
+  template <template <class...> class Outer, class... Args>
+  using IsSpecialized = detail::IsSpecialized<void, Outer, Args...>;
+
+  template <template <class...> class, typename T>
+  struct SpecializationOf : camp::false_type {
+  };
+
+  template <template <class...> class Expected,
+            template <class...> class Actual,
+            class... Args>
+  struct SpecializationOf<Expected, Actual<Args...>>
+      : detail::SpecializationOf<Expected,
+                                 Actual,
+                                 IsSpecialized<Expected, Args...>::value,
+                                 Args...> {
+  };
+
+}  // end namespace type_traits
+}
+
+#endif /* CAMP_CONCEPTS_HPP */
diff --git a/include/camp/defines.hpp b/include/camp/defines.hpp
new file mode 100644
index 0000000000..b0bb43b513
--- /dev/null
+++ b/include/camp/defines.hpp
@@ -0,0 +1,84 @@
+#ifndef CAMP_DEFINES_HPP
+#define CAMP_DEFINES_HPP
+
+#include <cstddef>
+#include <cstdint>
+
+namespace camp
+{
+
+#if defined(__clang__)  // first match wins: at most one CAMP_COMPILER_* is set
+#define CAMP_COMPILER_CLANG
+#elif defined(__INTEL_COMPILER)
+#define CAMP_COMPILER_INTEL
+#elif defined(__xlc__)
+#define CAMP_COMPILER_XLC
+#elif defined(__PGI)
+#define CAMP_COMPILER_PGI
+#elif defined(_WIN32)
+#define CAMP_COMPILER_MSVC
+#elif defined(__GNUC__)  // tested last: several compilers above also define it
+#define CAMP_COMPILER_GNU
+#else
+#pragma warn("Unknown compiler!")
+#endif
+
+#if defined(__cpp_constexpr) && __cpp_constexpr >= 201304
+#define CAMP_HAS_CONSTEXPR14
+#define CAMP_CONSTEXPR14 constexpr
+#else
+#define CAMP_CONSTEXPR14
+#endif
+
+#if defined(__CUDACC__)
+#define CAMP_DEVICE __device__
+#define CAMP_HOST_DEVICE __host__ __device__
+#else
+#define CAMP_DEVICE
+#define CAMP_HOST_DEVICE
+#endif
+
+#if defined(__has_builtin)
+#if __has_builtin(__make_integer_seq)
+#define CAMP_USE_MAKE_INTEGER_SEQ 1
+#endif
+#endif
+
+// Types
+using idx_t = std::ptrdiff_t;
+
+// Helper macros
+// TODO: -> CAMP_MAKE_LAMBDA_CONSUMER
+#define CAMP_MAKE_L(X)                                             \
+  template <typename Lambda, typename... Rest>                     \
+  struct X##_l {                                                   \
+    using type = typename X<Lambda::template expr, Rest...>::type; \
+  };
+
+
+#if defined(CAMP_TEST)
+template <typename T1, typename T2>
+struct AssertSame {
+  static_assert(std::is_same<T1, T2>::value,
+                "is_same assertion failed <see below for more information>");
+  static bool constexpr value = std::is_same<T1, T2>::value;
+};
+#define UNQUOTE(...) __VA_ARGS__
+#define CHECK_SAME(X, Y) \
+  static_assert(AssertSame<UNQUOTE X, UNQUOTE Y>::value, #X " same as " #Y)
+#define CHECK_TSAME(X, Y)                                               \
+  static_assert(AssertSame<typename UNQUOTE X::type, UNQUOTE Y>::value, \
+                #X " same as " #Y)
+template <typename Assertion, idx_t i>
+struct AssertValue {
+  static_assert(Assertion::value == i,
+                "value assertion failed <see below for more information>");
+  static bool const value = Assertion::value == i;
+};
+#define CHECK_IEQ(X, Y) \
+  static_assert(AssertValue<UNQUOTE X, UNQUOTE Y>::value, #X "::value == " #Y)
+#endif
+
+}
+
+#endif /* CAMP_DEFINES_HPP */
diff --git a/include/RAJA/policy/fwd.hpp b/include/camp/detail/sfinae.hpp
similarity index 61%
rename from include/RAJA/policy/fwd.hpp
rename to include/camp/detail/sfinae.hpp
index f8aa04777b..b2d7c42275 100644
--- a/include/RAJA/policy/fwd.hpp
+++ b/include/camp/detail/sfinae.hpp
@@ -1,3 +1,6 @@
+#ifndef CAMP_DETAIL_SFINAE_HPP
+#define CAMP_DETAIL_SFINAE_HPP
+
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 // Copyright (c) 2016, Lawrence Livermore National Security, LLC.
 //
@@ -40,59 +43,44 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Forward declarations for impl::forall overloads
- *
- ******************************************************************************
- */
-
-#ifndef RAJA_policy_fwd_HPP
-#define RAJA_policy_fwd_HPP
+#include "camp/helpers.hpp"
+#include "camp/value.hpp"
+#include "camp/number/number.hpp"
 
-#include "RAJA/config.hpp"
+#include <type_traits>
 
-#if defined(RAJA_ENABLE_CUDA)
-#include "RAJA/policy/cuda/fwd.hpp"
-#endif
-#if defined(RAJA_ENABLE_OPENMP)
-#include "RAJA/policy/openmp/fwd.hpp"
-#endif
-#include "RAJA/policy/sequential/fwd.hpp"
-#include "RAJA/policy/simd/fwd.hpp"
-
-namespace RAJA
+namespace camp
 {
-template <typename Selector, typename... Policies>
-class MultiPolicy;
 
-namespace impl
+  /// \cond
+namespace detail
 {
 
-template <typename Iterable,
-          typename Body,
-          typename Selector,
-          typename... Policies>
-RAJA_INLINE void forall(MultiPolicy<Selector, Policies...> p,
-                        Iterable &&,
-                        Body &&);
-}  // end namespace impl
+  // caller pattern from metal library
+  template <template <typename...> class expr, typename... vals>
+  struct caller;
 
-namespace wrap
-{
+  template <
+      template <typename...> class expr,
+      typename... vals,
+      typename std::enable_if<is_value<expr<vals...>>::value>::type* = nullptr>
+  value<expr<vals...>> sfinae(caller<expr, vals...>*);
+
+  value<> sfinae(...);  // catch-all, used when the overload above is not viable
+
+  template <template <typename...> class expr, typename... vals>
+  struct caller : decltype(sfinae(declptr<caller<expr, vals...>>())) {
+  };
+
+  template <template <typename...> class Expr, typename... Vals>
+  struct call_s : caller<Expr, Vals...> {
+  };
 
-template <typename Iterable,
-          typename Body,
-          typename Selector,
-          typename... Policies>
-RAJA_INLINE void forall(MultiPolicy<Selector, Policies...>,
-                        Iterable &&,
-                        Body &&);
-}
+  template <template <typename...> class Expr, typename... Vals>
+  using call = Expr<Vals...>;  // direct instantiation, no SFINAE guard
+}  // end namespace detail
+/// \endcond
 
-}  // end namespace RAJA
+}  // end namespace camp
 
-#endif  // closing endif for header file include guard
+#endif /* CAMP_DETAIL_SFINAE_HPP */
diff --git a/include/camp/helpers.hpp b/include/camp/helpers.hpp
new file mode 100644
index 0000000000..e82761c29a
--- /dev/null
+++ b/include/camp/helpers.hpp
@@ -0,0 +1,227 @@
+#ifndef CAMP_HELPERS_HPP
+#define CAMP_HELPERS_HPP
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/README.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstddef>
+#include <iterator>
+#include <utility>
+
+#include "camp/defines.hpp"
+
+namespace camp
+{
+
+/// declaration-only helper (no definition here): names a T* inside decltype/SFINAE expressions
+template <typename T>
+T* declptr();
+
+/// declaration-only helper: an expression of type decltype(std::declval<T>())
+template <typename T>
+auto val() noexcept -> decltype(std::declval<T>());
+
+/// declaration-only helper: like val, but for T const
+template <typename T>
+auto cval() noexcept -> decltype(std::declval<T const>());
+
+/// Swallow the results of a parameter-pack expansion.
+///
+/// The caller evaluates every argument expression; the (empty) body ignores
+/// them all.  Arguments are bound by const reference, so passing a value does
+/// not copy it and non-copyable values are accepted.  The function is marked
+/// CAMP_HOST_DEVICE so that code carrying the same marker may call it.
+///
+/// Note: the language leaves the order in which a caller's argument
+/// expressions are evaluated unspecified; do not rely on it for side effects.
+template <typename... Ts>
+CAMP_HOST_DEVICE void sink(Ts const&...)
+{
+}
+
+
+// bring common utility routines into scope to allow ADL
+using std::begin;
+using std::swap;
+
+namespace type
+{
+  namespace ref
+  {
+    template <class T>
+    struct rem_s {
+      using type = T;
+    };
+    template <class T>
+    struct rem_s<T&> {
+      using type = T;
+    };
+    template <class T>
+    struct rem_s<T&&> {
+      using type = T;
+    };
+
+    /// remove reference from T
+    template <class T>
+    using rem = typename rem_s<T>::type;
+
+    /// add lvalue reference to T
+    template <class T>
+    using add = T&;
+  }  // end namespace ref
+
+  namespace rvref
+  {
+    /// add rvalue reference to T
+    template <class T>
+    using add = T&&;
+  }  // end namespace rvref
+
+  namespace c
+  {
+    template <class T>
+    struct rem_s {
+      using type = T;
+    };
+    template <class T>
+    struct rem_s<const T> {
+      using type = T;
+    };
+
+    /// remove const qualifier from T
+    template <class T>
+    using rem = typename rem_s<T>::type;
+
+    /// add const qualifier to T
+    template <class T>
+    using add = const T;
+  }  // end namespace c
+
+  namespace v
+  {
+    template <class T>
+    struct rem_s {
+      using type = T;
+    };
+    template <class T>
+    struct rem_s<volatile T> {
+      using type = T;
+    };
+
+    /// remove volatile qualifier from T
+    template <class T>
+    using rem = typename rem_s<T>::type;
+
+    /// add volatile qualifier to T
+    template <class T>
+    using add = volatile T;
+  }  // end namespace v
+
+  namespace cv
+  {
+    template <class T>
+    struct rem_s {
+      using type = T;
+    };
+    template <class T>
+    struct rem_s<const T> {
+      using type = T;
+    };
+    template <class T>
+    struct rem_s<volatile T> {
+      using type = T;
+    };
+    template <class T>
+    struct rem_s<const volatile T> {
+      using type = T;
+    };
+
+    /// remove const and volatile qualifiers from T
+    template <class T>
+    using rem = typename rem_s<T>::type;
+
+    /// add const and volatile qualifiers to T
+    template <class T>
+    using add = const volatile T;
+  }  // end namespace cv
+}  // end namespace type
+
+template <typename T>  // removes the reference from T, then const/volatile from the result
+using decay = type::cv::rem<type::ref::rem<T>>;
+
+template <typename T>  // T with any reference removed (cv-qualifiers are kept)
+using plain = type::ref::rem<T>;
+
+template <typename T>  // type of (a - b) for two values of plain<T>
+using diff_from = decltype(val<plain<T>>() - val<plain<T>>());
+template <typename T, typename U>  // type of (a - b) for a plain<T> and a plain<U>
+using diff_between = decltype(val<plain<T>>() - val<plain<U>>());
+
+template <typename T>  // type returned by begin() applied to a plain<T>
+using iterator_from = decltype(begin(val<plain<T>>()));
+
+template <class T>  // lvalue argument: cast to T&& (collapses to an lvalue reference when T is one)
+CAMP_HOST_DEVICE constexpr T&& forward(type::ref::rem<T>& t) noexcept
+{
+  return static_cast<T&&>(t);
+}
+template <class T>  // rvalue argument: the same cast applied to an rvalue
+CAMP_HOST_DEVICE constexpr T&& forward(type::ref::rem<T>&& t) noexcept
+{
+  return static_cast<T&&>(t);
+}
+
+/// Exchange the values of t1 and t2.
+///
+/// When __CUDA_ARCH__ is defined the exchange is spelled out as three moves
+/// through a temporary.  Otherwise an unqualified swap is called with
+/// std::swap brought into scope, so that a swap overload found through
+/// argument-dependent lookup is preferred over std::swap.
+///
+/// There is deliberately a single overload.  A second one with the same
+/// parameter list, constrained on decltype(sink(swap(val<T>(), val<T>()))),
+/// cannot be selected for std::swap (it returns void and needs lvalues), and
+/// whenever its constraint did hold it would be ambiguous with this overload.
+///
+/// \param t1  first object; holds the former value of t2 on return
+/// \param t2  second object; holds the former value of t1 on return
+template <typename T>
+CAMP_HOST_DEVICE void safe_swap(T& t1, T& t2)
+{
+#if defined(__CUDA_ARCH__)
+  T temp{std::move(t1)};
+  t1 = std::move(t2);
+  t2 = std::move(temp);
+#else
+  using std::swap;
+  swap(t1, t2);
+#endif
+}
+
+}
+
+#endif /* CAMP_HELPERS_HPP */
diff --git a/include/camp/lambda.hpp b/include/camp/lambda.hpp
new file mode 100644
index 0000000000..dcdbf72bae
--- /dev/null
+++ b/include/camp/lambda.hpp
@@ -0,0 +1,138 @@
+#ifndef CAMP_LAMBDA_HPP
+#define CAMP_LAMBDA_HPP
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/LICENSE.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <type_traits>
+
+#include "camp/defines.hpp"
+#include "camp/list/list.hpp"
+#include "camp/list/at.hpp"
+
+
+namespace camp
+{
+
+template <template <typename...> class Expr>  // Expr: template whose instantiations expose ::type
+struct lambda {  // wraps a template as a type so it can be passed where a type is expected
+  template <typename... Ts>
+  using expr = typename Expr<Ts...>::type;  // result of applying Expr to Ts...
+};
+
+template <typename Lambda, typename Seq>  // primary template: only defined for Seq = list<...>
+struct apply_l;
+template <typename Lambda, typename... Args>  // unpacks the list and hands its elements to Lambda::expr
+struct apply_l<Lambda, list<Args...>> {
+  using type = typename Lambda::template expr<Args...>::type;  // note: ::type is taken of the expr result
+};
+
+template <typename Lambda, typename... Args>  // like apply_l, with the arguments given directly
+struct invoke_l {
+  using type = typename Lambda::template expr<Args...>::type;
+};
+
+template <idx_t n>  // placeholder; n is 1-based, so arg<1> selects the first argument
+struct arg {
+  template <typename... Ts>
+  using expr = typename at<list<Ts...>, num<n - 1>>::type;  // element n-1 (0-based) of Ts...
+};
+
+using _1 = arg<1>;  // shorthand placeholders for the first nine arguments
+using _2 = arg<2>;
+using _3 = arg<3>;
+using _4 = arg<4>;
+using _5 = arg<5>;
+using _6 = arg<6>;
+using _7 = arg<7>;
+using _8 = arg<8>;
+using _9 = arg<9>;
+
+namespace detail
+{
+  template <typename T, typename... Args>  // non-placeholder binding: passed through unchanged
+  struct get_bound_arg {
+    using type = T;
+  };
+  template <idx_t i, typename... Args>  // placeholder arg<i>: replaced by the i-th (1-based) of Args...
+  struct get_bound_arg<arg<i>, Args...> {
+    using type = typename arg<i>::template expr<Args...>;
+  };
+}
+
+template <template <typename...> class Expr, typename... ArgBindings>  // bindings: types and/or arg<i>
+struct bind {
+  using bindings = list<ArgBindings...>;
+  template <typename... Ts>  // Ts...: call-time arguments substituted for the placeholders
+  using expr = typename Expr<
+      typename detail::get_bound_arg<ArgBindings, Ts...>::type...>::type;
+  using type = bind;
+};
+
+#if defined(CAMP_TEST)  // compile-time self-check, only when CAMP_TEST is defined
+namespace test
+{
+  CHECK_TSAME((invoke_l<bind<list, _1, int, _2>, float, double>),
+              (list<float, int, double>));
+}
+#endif
+
+template <template <typename...> class Expr, typename... BoundArgs>  // BoundArgs... are fixed up front
+struct bind_front {
+  template <typename... Ts>  // Ts...: call-time arguments, appended after BoundArgs...
+  using expr = typename Expr<BoundArgs..., Ts...>::type;
+  using type = bind_front;
+};
+
+CAMP_MAKE_L(bind_front);  // macro defined outside this file; presumably adds a bind_front_l form -- confirm
+
+#if defined(CAMP_TEST)  // compile-time self-check, only when CAMP_TEST is defined
+namespace test
+{
+  CHECK_TSAME((invoke_l<bind_front<list, int>, float, double>),
+              (list<int, float, double>));
+}
+#endif
+
+} // end namespace camp
+
+#endif /* CAMP_LAMBDA_HPP */
diff --git a/include/camp/list.hpp b/include/camp/list.hpp
new file mode 100644
index 0000000000..6931d9690d
--- /dev/null
+++ b/include/camp/list.hpp
@@ -0,0 +1,7 @@
+#ifndef CAMP_LIST_HPP
+#define CAMP_LIST_HPP
+
+// Umbrella header for the camp type-list facilities: pulls in the list type
+// (list/list.hpp) and indexed element access (list/at.hpp).
+//
+// The include guard follows the CAMP_<PATH>_HPP spelling used by the other
+// camp headers; identifiers containing a double underscore are reserved for
+// the implementation and must not be defined by library code.
+
+#include "list/at.hpp"
+#include "list/list.hpp"
+
+#endif /* CAMP_LIST_HPP */
diff --git a/include/camp/list/at.hpp b/include/camp/list/at.hpp
new file mode 100644
index 0000000000..067a39104f
--- /dev/null
+++ b/include/camp/list/at.hpp
@@ -0,0 +1,84 @@
+#ifndef CAMP_LIST_AT_HPP
+#define CAMP_LIST_AT_HPP
+
+// Indexed access into type lists: at, at_v, at_t, first and second.
+// (Include guard spelled without a leading double underscore, which is
+// reserved for the implementation.)
+
+#include "camp/defines.hpp"
+#include "camp/helpers.hpp"
+#include "camp/list/list.hpp"
+#include "camp/number.hpp"
+#include "camp/value.hpp"
+
+namespace camp
+{
+
+namespace detail
+{
+  // Lookup from metal::at machinery
+  //
+  // Each (index, type) pair becomes an empty base class "entry<index, type>"
+  // of "entries".  Converting an entries* to entry<Idx, val>* lets overload
+  // resolution deduce val for a given Idx without recursing over the list.
+  template <idx_t, typename>
+  struct entry {
+  };
+
+  template <typename, typename>
+  struct entries;
+
+  template <idx_t... keys, typename... vals>
+  struct entries<idx_seq<keys...>, list<vals...>> : entry<keys, vals>... {
+  };
+
+  /// Selected when the pointed-to entries has a base entry<key, val>
+  template <idx_t key, typename val>
+  value<val> _lookup_impl(entry<key, val>*);
+
+  /// Fallback when no entry has the requested key: yields the empty value<>.
+  /// Its template parameter has to be a non-type idx_t, like the key of the
+  /// overload above: the call site writes _lookup_impl<Idx>(...), and a
+  /// fallback declared with a type parameter could never match that call.
+  template <idx_t>
+  value<> _lookup_impl(...);
+
+  /// Result of looking up index Idx among the types of vals (a list)
+  template <typename vals, typename indices, idx_t Idx>
+  struct _lookup
+      : decltype(_lookup_impl<Idx>(declptr<entries<indices, vals>>())) {
+  };
+
+  template <typename T, idx_t Idx>
+  struct _at;
+  /// Index 0: taken directly from the pack, no lookup needed
+  template <template <class...> class T, typename X, typename... Rest>
+  struct _at<T<X, Rest...>, 0> {
+    using type = X;
+  };
+  /// Index 1: taken directly from the pack, no lookup needed
+  template <template <class...> class T,
+            typename X,
+            typename Y,
+            typename... Rest>
+  struct _at<T<X, Y, Rest...>, 1> {
+    using type = Y;
+  };
+  /// Any other index: resolved through the entry/entries lookup
+  template <template <class...> class T, idx_t Idx, typename... Rest>
+  struct _at<T<Rest...>, Idx> {
+    using type = typename _lookup<T<Rest...>,
+                                  make_idx_seq_t<sizeof...(Rest)>,
+                                  Idx>::type;
+  };
+}
+
+/// Element of the sequence Seq at the (0-based) position given by Num, which
+/// must be a num<...>; the result is the nested ::type.
+template <typename Seq, typename Num>
+struct at;
+template <typename T, idx_t Val>
+struct at<T, num<Val>> {
+  using type = typename detail::_at<T, Val>::type;
+};
+
+
+/// First element (index 0) of T
+template <typename T>
+using first = typename at<T, num<0>>::type;
+
+/// Second element (index 1) of T
+template <typename T>
+using second = typename at<T, num<1>>::type;
+
+/// Element of T at the plain integer index Idx
+template <typename T, idx_t Idx>
+using at_v = typename at<T, num<Idx>>::type;
+
+/// Element of T at the index carried by the type U (a num<...>)
+template <typename T, typename U>
+using at_t = typename at<T, U>::type;
+}
+
+
+#endif /* CAMP_LIST_AT_HPP */
diff --git a/include/camp/list/find_if.hpp b/include/camp/list/find_if.hpp
new file mode 100644
index 0000000000..a368dc8224
--- /dev/null
+++ b/include/camp/list/find_if.hpp
@@ -0,0 +1,112 @@
+#ifndef CAMP_LIST_FIND_IF_HPP
+#define CAMP_LIST_FIND_IF_HPP
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/LICENSE.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstddef>
+#include <type_traits>
+
+#include "camp/lambda.hpp"
+#include "camp/list/list.hpp"
+#include "camp/number.hpp"
+#include "camp/value.hpp"
+
+namespace camp
+{
+
+/// \cond
+namespace detail
+{
+  template <template <typename...> class Cond, typename... Elements>  // scans Elements... front to back
+  struct _find_if;
+  template <template <typename...> class Cond, typename First, typename... Rest>
+  struct _find_if<Cond, First, Rest...> {
+    using type = if_<typename Cond<First>::type,  // First when Cond<First>::type is true ...
+                     First,
+                     typename _find_if<Cond, Rest...>::type>;  // ... otherwise the result for Rest...
+  };
+  template <template <typename...> class Cond>  // nothing left to test: the result is nil
+  struct _find_if<Cond> {
+    using type = nil;
+  };
+}
+/// \endcond
+
+template <template <typename...> class Cond, typename Seq>
+struct find_if;
+
+/// First element of the list for which Cond<Element>::type is true, or nil when there is none
+template <template <typename...> class Cond, typename... Elements>
+struct find_if<Cond, list<Elements...>> {
+  using type = typename detail::_find_if<Cond, Elements...>::type;
+};
+
+CAMP_MAKE_L(find_if);  // macro defined outside this file; presumably declares find_if_l (used in the tests) -- confirm
+
+#if defined(CAMP_TEST)
+// Compile-time self-checks, only compiled when CAMP_TEST is defined.
+// camp/lambda.hpp (needed for bind_front) is already included at the top of
+// this header; it is not included again here because this point is inside
+// namespace camp, where a header must never be included.
+namespace test
+{
+  /// Predicate: true when ForPol::index is the same type as Index
+  template <typename Index, typename ForPol>
+  struct index_matches {
+    using type = typename std::is_same<Index, typename ForPol::index>::type;
+  };
+  /// Stand-in policy type carrying an index type (T is not used by the body)
+  template <typename Index, typename T>
+  struct For {
+    using index = Index;
+    constexpr static std::size_t value = Index::value;
+  };
+  CHECK_TSAME((find_if<std::is_pointer, list<float, double, int*>>), (int*));
+  CHECK_TSAME((find_if<std::is_pointer, list<float, double>>), (nil));
+  CHECK_TSAME((find_if_l<bind_front<std::is_same, For<num<1>, int>>,
+                         list<For<num<0>, int>, For<num<1>, int>>>),
+              (For<num<1>, int>));
+  CHECK_TSAME((find_if_l<bind_front<index_matches, num<1>>,
+                         list<For<num<0>, int>, For<num<1>, int>>>),
+              (For<num<1>, int>));
+}
+#endif
+
+}  // end namespace camp
+
+#endif /* CAMP_LIST_FIND_IF_HPP */
diff --git a/include/camp/list/list.hpp b/include/camp/list/list.hpp
new file mode 100644
index 0000000000..670db59b5a
--- /dev/null
+++ b/include/camp/list/list.hpp
@@ -0,0 +1,15 @@
+#ifndef CAMP_LIST_LIST_HPP
+#define CAMP_LIST_LIST_HPP
+
+namespace camp
+{
+/// Compile-time list of types: stores no data and only carries the pack Ts...
+
+template <typename... Ts>
+struct list {
+  using type = list;  // a list names itself as its nested type
+};
+
+}
+
+#endif /* CAMP_LIST_LIST_HPP */
diff --git a/include/camp/number.hpp b/include/camp/number.hpp
new file mode 100644
index 0000000000..55113db0dd
--- /dev/null
+++ b/include/camp/number.hpp
@@ -0,0 +1,124 @@
+#ifndef CAMP_NUMBER_HPP
+#define CAMP_NUMBER_HPP
+
+#include "camp/list/list.hpp"
+#include "camp/number/if.hpp"
+#include "camp/number/number.hpp"
+
+#include "camp/defines.hpp"
+
+#include <array>
+#include <type_traits>
+
+namespace camp
+{
+
+/// Compile-time sequence of the values vs... of type T; names itself as its nested type
+template <typename T, T... vs>
+struct int_seq {
+  using type = int_seq;
+};
+/// Index list, use for indexing into parameter packs and lists
+template <idx_t... vs>
+using idx_seq = int_seq<idx_t, vs...>;
+
+namespace detail
+{
+  /// Builds int_seq<T, 0, 1, ..., N-1>, where N is carried by the second
+  /// argument as an integral_constant<T, N>.
+  template <typename T, typename N>
+  struct gen_seq;
+  // The builtin is used only when requested and when not compiling with
+  // nvcc.  The check is written with defined() so that it is well-formed
+  // whether or not __NVCC__ is defined and whatever it expands to.
+#if defined(CAMP_USE_MAKE_INTEGER_SEQ) && !defined(__NVCC__)
+  template <typename T, T N>
+  struct gen_seq<T, integral_constant<T, N>> {
+    using type = __make_integer_seq<int_seq, T, N>;
+  };
+#else
+  /// Joins two sequences, shifting every value of the second one by the
+  /// length of the first.
+  template <typename T, typename S1, typename S2>
+  struct concat;
+
+  template <typename T, T... I1, T... I2>
+  struct concat<T, int_seq<T, I1...>, int_seq<T, I2...>> {
+    using type = typename int_seq<T, I1..., (sizeof...(I1) + I2)...>::type;
+  };
+
+  // General case: split N into N/2 and N - N/2, generate both halves and
+  // join them, so the recursion halves N at every step.
+  template <typename T, typename N_t>
+  struct gen_seq
+      : concat<T,
+               typename gen_seq<T, integral_constant<T, N_t::value / 2>>::type,
+               typename gen_seq<
+                   T,
+                   integral_constant<T, N_t::value - N_t::value / 2>>::type>::
+            type {
+  };
+
+  // Recursion ends at the empty sequence and the one-element sequence.
+  template <typename T>
+  struct gen_seq<T, integral_constant<T, 0>> : int_seq<T> {
+  };
+  template <typename T>
+  struct gen_seq<T, integral_constant<T, 1>> : int_seq<T, 0> {
+  };
+#endif
+}
+
+/// Metafunction whose ::type is idx_seq<0, 1, ..., Upper-1> (see the checks below)
+template <idx_t Upper>
+struct make_idx_seq {
+  using type =
+      typename detail::gen_seq<idx_t, integral_constant<idx_t, Upper>>::type;
+};
+
+
+/// Shorthand for make_idx_seq<Upper>::type
+template <idx_t Upper>
+using make_idx_seq_t = typename make_idx_seq<Upper>::type;
+
+#if defined(CAMP_TEST)  // compile-time self-checks, only when CAMP_TEST is defined
+namespace test
+{
+  CHECK_TSAME((make_idx_seq_t<3>), (idx_seq<0, 1, 2>));
+  CHECK_TSAME((make_idx_seq_t<2>), (idx_seq<0, 1>));
+  CHECK_TSAME((make_idx_seq_t<1>), (idx_seq<0>));
+  CHECK_TSAME((make_idx_seq_t<0>), (idx_seq<>));
+}
+#endif
+
+
+/// Index sequence with one index per type in the pack Ts...
+template <class... Ts>
+using idx_seq_for_t = typename make_idx_seq<sizeof...(Ts)>::type;
+
+/// Index sequence matching the number of arguments of T; only the specialisations below are defined
+template <typename T>
+struct idx_seq_from;
+
+/// T is an instantiation of a template taking types: one index per type argument
+template <template <typename...> class T, typename... Args>
+struct idx_seq_from<T<Args...>> : make_idx_seq<sizeof...(Args)> {
+};
+
+/// T is an int_seq: one index per value in the sequence
+template <typename T, T... Args>
+struct idx_seq_from<int_seq<T, Args...>> : make_idx_seq<sizeof...(Args)> {
+};
+
+/// Shorthand for idx_seq_from<T>::type
+template <typename T>
+using idx_seq_from_t = typename idx_seq_from<T>::type;
+
+/// Sequence of the values 0 .. Upper-1 with element type T.  The struct
+/// derives from the generated int_seq, so its nested ::type is that int_seq.
+template <typename T, T Upper>
+struct make_int_seq : detail::gen_seq<T, integral_constant<T, Upper>>::type {
+};
+
+/// Shorthand for make_int_seq<T, Upper>::type.  Upper is declared with type T,
+/// exactly as in make_int_seq, so the bound is not routed through idx_t first.
+template <typename T, T Upper>
+using make_int_seq_t = typename make_int_seq<T, Upper>::type;
+
+/// Logical negation: ::type is false_type when if_s treats T as true, true_type otherwise
+template <typename T>
+struct not_ {
+  using type = typename if_s<T, false_type, true_type>::type;
+};
+
+}  // end namespace camp
+
+#endif /* CAMP_NUMBER_HPP */
diff --git a/include/camp/number/if.hpp b/include/camp/number/if.hpp
new file mode 100644
index 0000000000..6b87076798
--- /dev/null
+++ b/include/camp/number/if.hpp
@@ -0,0 +1,85 @@
+#ifndef CAMP_NUMBER_IF_HPP
+#define CAMP_NUMBER_IF_HPP
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/LICENSE.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "camp/list/list.hpp"
+#include "camp/number/number.hpp"
+#include "camp/value.hpp"
+
+#include <type_traits>
+
+namespace camp
+{
+
+/// Type-level conditional on a bool: ::type is Then when Cond is true ...
+template <bool Cond, typename Then, typename Else>
+struct if_cs {
+  using type = Then;
+};
+
+template <typename Then, typename Else>  // ... and Else when Cond is false
+struct if_cs<false, Then, Else> {
+  using type = Else;
+};
+
+/// Shorthand for if_cs<Cond, Then, Else>::type
+template <bool Cond, typename Then, typename Else>
+using if_c = typename if_cs<Cond, Then, Else>::type;
+
+/// Type-level conditional on a type: selects on Cond::value
+template <typename Cond, typename Then, typename Else>
+struct if_s : if_cs<Cond::value, Then, Else> {
+};
+
+template <typename Then, typename Else>  // nil as the condition counts as false
+struct if_s<nil, Then, Else> : if_cs<false, Then, Else> {
+};
+
+/// Shorthand for if_s<Ts...>::type; Ts... are Cond, Then, Else in that order
+template <typename... Ts>
+using if_ = typename if_s<Ts...>::type;
+
+}  // end namespace camp
+
+#endif /* CAMP_NUMBER_IF_HPP */
diff --git a/include/camp/number/number.hpp b/include/camp/number/number.hpp
new file mode 100644
index 0000000000..7ff552ab56
--- /dev/null
+++ b/include/camp/number/number.hpp
@@ -0,0 +1,72 @@
+#ifndef CAMP_NUMBER_NUMBER_HPP
+#define CAMP_NUMBER_NUMBER_HPP
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For additional details, please also read RAJA/LICENSE.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the disclaimer below.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the disclaimer (as noted below) in the
+//   documentation and/or other materials provided with the distribution.
+//
+// * Neither the name of the LLNS/LLNL nor the names of its contributors may
+//   be used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "camp/defines.hpp"
+
+namespace camp
+{
+
+// Compile-time constant v of type T. TODO: consider making use/match std::integral_constant
+template <class T, T v>
+struct integral_constant {
+  static constexpr T value = v;
+  using value_type = T;
+  using type = integral_constant;
+  constexpr operator value_type() const noexcept { return value; }  // implicit conversion yields v
+  constexpr value_type operator()() const noexcept { return value; }  // calling the object yields v
+};
+
+/// Integral constant of type idx_t with value N
+template <idx_t N>
+using num = integral_constant<idx_t, N>;
+
+using true_type = num<true>;  // true and false are stored as idx_t values here
+using false_type = num<false>;
+
+using t = num<true>;  // same type as true_type
+
+}  // end namespace camp
+
+#endif /* CAMP_NUMBER_NUMBER_HPP */
diff --git a/include/camp/tuple.hpp b/include/camp/tuple.hpp
new file mode 100644
index 0000000000..c8eba6cfe1
--- /dev/null
+++ b/include/camp/tuple.hpp
@@ -0,0 +1,267 @@
+#ifndef camp_tuple_HPP__
+#define camp_tuple_HPP__
+
+/*!
+ * \file
+ *
+ * \brief   Exceptionally basic tuple for host-device support
+ */
+
+#include "camp/camp.hpp"
+
+#include <cstddef>
+#include <functional>
+#include <iostream>
+#include <type_traits>
+#include <utility>
+
+namespace camp
+{
+
+template <typename... Rest>
+struct tuple;
+
+namespace internal
+{
+  template <class T>  // default: the type is left as it is
+  struct unwrap_refwrapper {
+    using type = T;
+  };
+
+  template <class T>  // std::reference_wrapper<T> unwraps to T&
+  struct unwrap_refwrapper<std::reference_wrapper<T>> {
+    using type = T&;
+  };
+
+  template <class T>  // std::decay<T>, then reference_wrapper unwrapping; element type used by make_tuple
+  using special_decay_t =
+      typename unwrap_refwrapper<typename std::decay<T>::type>::type;
+}
+
+template <typename... Args>  // forward declaration; the definition follows the tuple class below
+CAMP_HOST_DEVICE constexpr auto make_tuple(Args&&... args)
+    -> tuple<internal::special_decay_t<Args>...>;
+
+namespace internal
+{
+  /// Storage for a single tuple element.
+  ///
+  /// \tparam index  position of the element inside the tuple; it keeps the
+  ///                storage base classes distinct when element types repeat
+  /// \tparam Type   type of the stored element (may be a reference type)
+  template <camp::idx_t index, typename Type>
+  struct tuple_storage {
+    tuple_storage() = default;
+
+    /// Initialise the element from \a v.  The parameter is forwarded as Type:
+    /// a by-value element is moved from the parameter instead of being copied
+    /// a second time, and an rvalue-reference element (as produced by
+    /// forward_as_tuple) can bind, which it cannot do to the named parameter.
+    CAMP_HOST_DEVICE constexpr tuple_storage(Type v)
+        : val(std::forward<Type>(v))
+    {
+    }
+
+    /// Read-only access to the stored element
+    CAMP_HOST_DEVICE constexpr const Type& get_inner() const noexcept
+    {
+      return val;
+    }
+
+    /// Mutable access to the stored element
+    CAMP_CONSTEXPR14
+    CAMP_HOST_DEVICE
+    Type& get_inner() noexcept { return val; }
+
+  public:
+    Type val;
+  };
+
+  /// Aggregates one tuple_storage base per (index, type) pair.
+  template <typename Indices, typename Typelist>
+  struct tuple_helper;
+
+  /// Empty tuple: no storage at all
+  template <>
+  struct tuple_helper<camp::idx_seq<>, camp::list<>> {
+  };
+
+  template <typename... Types, camp::idx_t... Indices>
+  struct tuple_helper<camp::idx_seq<Indices...>, camp::list<Types...>>
+      : public internal::tuple_storage<Indices, Types>... {
+
+    tuple_helper() = default;
+
+    /// Initialise every element from the matching argument
+    CAMP_HOST_DEVICE constexpr tuple_helper(Types... args)
+        : internal::tuple_storage<Indices, Types>(std::forward<Types>(args))...
+    {
+    }
+
+    /// Element-wise assignment from a helper holding the same number of
+    /// elements of possibly different types.
+    ///
+    /// The right-hand side is named as tuple_helper<idx_seq, list<RTypes...>>,
+    /// matching the two-parameter form of tuple_helper, so that RTypes can be
+    /// deduced one per element.  The assignments are expanded inside a braced
+    /// initialiser list, which evaluates them strictly left to right and does
+    /// not copy the assigned elements.
+    ///
+    /// \param rhs  helper to read the new element values from
+    /// \return     *this
+    template <typename... RTypes>
+    CAMP_HOST_DEVICE tuple_helper& operator=(
+        const tuple_helper<camp::idx_seq<Indices...>, camp::list<RTypes...>>&
+            rhs)
+    {
+      using expand = int[];
+      // the leading 0 keeps the array non-empty for an empty element pack
+      (void)expand{
+          0,
+          ((void)(static_cast<tuple_storage<Indices, Types>&>(*this)
+                      .get_inner() =
+                      static_cast<const tuple_storage<Indices, RTypes>&>(rhs)
+                          .get_inner()),
+           0)...};
+      return *this;
+    }
+  };
+}
+
+template <typename T, camp::idx_t I>  // type of element I of the tuple type T (read from T::TList)
+using tpl_get_ret = camp::at_v<typename T::TList, I>;
+template <typename T, camp::idx_t I>  // storage base class of T that holds element I
+using tpl_get_store = internal::tuple_storage<I, tpl_get_ret<T, I>>;
+
+/// Basic tuple for host and device code.
+///
+/// The elements live in internal::tuple_storage base classes, one per index,
+/// supplied through internal::tuple_helper.
+///
+/// \tparam Elements  element types, in order
+template <typename... Elements>
+struct tuple : public internal::tuple_helper<
+                   typename camp::make_idx_seq<sizeof...(Elements)>::type,
+                   camp::list<Elements...>> {
+  using TList = camp::list<Elements...>;
+  using type = tuple;
+
+private:
+  using Self = tuple;
+  using Base = internal::tuple_helper<camp::make_idx_seq_t<sizeof...(Elements)>,
+                                      camp::list<Elements...>>;
+
+public:
+  // Constructors
+  CAMP_HOST_DEVICE constexpr tuple() : Base{} {}
+
+  /// Copy: the base sub-object is copied through a reference to it, without
+  /// creating an intermediate Base object.
+  CAMP_HOST_DEVICE constexpr tuple(tuple const& o)
+      : Base(static_cast<Base const&>(o))
+  {
+  }
+
+  /// Move: the base sub-object of \a o itself is handed over as an rvalue, so
+  /// the elements are moved from \a o rather than from a copy of it.
+  CAMP_HOST_DEVICE constexpr tuple(tuple&& o) : Base(static_cast<Base&&>(o)) {}
+
+  /// Copy assignment from a tuple of the same type. \return *this
+  CAMP_HOST_DEVICE tuple& operator=(tuple const& rhs)
+  {
+    Base::operator=(static_cast<Base const&>(rhs));
+    return *this;
+  }
+
+  /// Move assignment from a tuple of the same type. \return *this
+  CAMP_HOST_DEVICE tuple& operator=(tuple&& rhs)
+  {
+    Base::operator=(static_cast<Base&&>(rhs));
+    return *this;
+  }
+
+  /// Construct the elements from the given arguments, one per element.
+  /// NOTE(review): this is an unconstrained forwarding constructor, so it is
+  /// also a candidate whenever a single argument is passed (for example a
+  /// non-const tuple lvalue) -- confirm that callers do not depend on that.
+  template <typename... OtherTypes>
+  CAMP_HOST_DEVICE constexpr explicit tuple(OtherTypes&&... rest)
+      : Base{std::forward<OtherTypes>(rest)...}
+  {
+  }
+
+  /// Element-wise assignment from a tuple with other element types.
+  /// \return *this
+  template <typename... RTypes>
+  CAMP_HOST_DEVICE CAMP_CONSTEXPR14 Self& operator=(const tuple<RTypes...>& rhs)
+  {
+    Base::operator=(rhs);
+    return *this;
+  }
+
+  /// Reference to the element at position \a index
+  template <camp::idx_t index>
+  CAMP_HOST_DEVICE auto get() noexcept -> tpl_get_ret<Self, index>&
+  {
+    static_assert(sizeof...(Elements) > index, "index out of range");
+    return tpl_get_store<Self, index>::get_inner();
+  }
+
+  /// Read-only reference to the element at position \a index
+  template <camp::idx_t index>
+  CAMP_HOST_DEVICE auto get() const noexcept -> const tpl_get_ret<Self, index>&
+  {
+    static_assert(sizeof...(Elements) > index, "index out of range");
+    return tpl_get_store<Self, index>::get_inner();
+  }
+};
+
+template <camp::idx_t i, typename T>  // primary template: only defined for camp::tuple
+struct tuple_element;
+template <camp::idx_t i, typename... Types>  // ::type is the type of element i of the tuple
+struct tuple_element<i, tuple<Types...>> {
+  using type = camp::at_v<typename tuple<Types...>::TList, i>;
+};
+template <camp::idx_t i, typename T>  // shorthand for tuple_element<i, T>::type
+using tuple_element_t = typename tuple_element<i, T>::type;
+
+template <int index, typename... Args>  // read-only access to element `index` of a const tuple
+CAMP_HOST_DEVICE constexpr auto get(const tuple<Args...>& t) noexcept
+    -> tpl_get_ret<tuple<Args...>, index> const&
+{
+  static_assert(sizeof...(Args) > index, "index out of range");
+  return t.tpl_get_store<tuple<Args...>, index>::get_inner();  // call on the storage base for `index`
+}
+
+template <int index, typename... Args>  // mutable access to element `index` of a non-const tuple
+CAMP_HOST_DEVICE constexpr auto get(tuple<Args...>& t) noexcept
+    -> tpl_get_ret<tuple<Args...>, index>&
+{
+  static_assert(sizeof...(Args) > index, "index out of range");
+  return t.tpl_get_store<tuple<Args...>, index>::get_inner();  // call on the storage base for `index`
+}
+
+/// Number of elements of a camp::tuple, exposed as the static member value.
+///
+/// The primary template is only declared.  The specialisations cover
+/// camp::tuple with and without const, both as an object type and as an
+/// lvalue reference: these are the forms a forwarding-reference parameter
+/// (such as the one of invoke) can deduce for a tuple argument.
+template <typename Tuple>
+struct tuple_size;
+
+template <typename... Args>
+struct tuple_size<tuple<Args...>> {
+  static constexpr size_t value = sizeof...(Args);
+};
+
+template <typename... Args>
+struct tuple_size<tuple<Args...>&> {
+  static constexpr size_t value = sizeof...(Args);
+};
+
+template <typename... Args>
+struct tuple_size<const tuple<Args...>> {
+  static constexpr size_t value = sizeof...(Args);
+};
+
+template <typename... Args>
+struct tuple_size<const tuple<Args...>&> {
+  static constexpr size_t value = sizeof...(Args);
+};
+
+template <typename... Args>  // builds a tuple whose element types are special_decay_t of each argument type
+CAMP_HOST_DEVICE constexpr auto make_tuple(Args&&... args)
+    -> tuple<internal::special_decay_t<Args>...>
+{
+  return tuple<internal::special_decay_t<Args>...>{std::forward<Args>(args)...};
+}
+
+template <typename... Args>  // tuple of references (Args&&) to the arguments; nothing is copied
+CAMP_HOST_DEVICE constexpr auto forward_as_tuple(Args&&... args) noexcept
+    -> tuple<Args&&...>
+{
+  return tuple<Args&&...>(std::forward<Args>(args)...);
+}
+
+template <class... Types>  // tuple of lvalue references to the arguments
+CAMP_HOST_DEVICE constexpr tuple<Types&...> tie(Types&... args) noexcept
+{
+  return tuple<Types&...>{args...};
+}
+
+/// Concatenate two tuples into one holding the elements of \a l followed by
+/// the elements of \a r.
+///
+/// The result is constructed directly as the declared return type
+/// tuple<Lelem..., Relem...>.  Going through make_tuple would produce a tuple
+/// of decayed element types, which is a different type whenever an element is
+/// a reference or cv-qualified.
+///
+/// \param l  left tuple; the idx_seq after it lists the indices to take from it
+/// \param r  right tuple; the idx_seq after it lists the indices to take from it
+/// \return   tuple of the selected elements of \a l, then those of \a r
+template <typename... Lelem,
+          typename... Relem,
+          camp::idx_t... Lidx,
+          camp::idx_t... Ridx>
+CAMP_HOST_DEVICE constexpr auto tuple_cat_pair(tuple<Lelem...>&& l,
+                                               camp::idx_seq<Lidx...>,
+                                               tuple<Relem...>&& r,
+                                               camp::idx_seq<Ridx...>) noexcept
+    -> tuple<Lelem..., Relem...>
+{
+  return tuple<Lelem..., Relem...>{get<Lidx>(l)..., get<Ridx>(r)...};
+}
+
+template <typename Fn, camp::idx_t... Sequence, typename TupleLike>  // Sequence: element indices, in call order
+CAMP_HOST_DEVICE constexpr auto invoke_with_order(TupleLike&& t,
+                                                  Fn&& f,
+                                                  camp::idx_seq<Sequence...>)
+    -> decltype(f(get<Sequence>(t)...))
+{
+  return f(get<Sequence>(t)...);  // calls f with the selected elements of t as its arguments
+}
+
+/// Call \a f with all elements of \a t, in index order, as its arguments.
+///
+/// forward is named with its camp:: qualification on purpose.  An unqualified
+/// call is also subject to argument-dependent lookup, which brings in
+/// std::forward as soon as a type from namespace std is involved (for example
+/// as a tuple element) and makes the call ambiguous.
+///
+/// \param t  tuple whose elements become the arguments
+/// \param f  callable to invoke
+/// \return   whatever \a f returns
+template <typename Fn, typename TupleLike>
+CAMP_HOST_DEVICE constexpr auto invoke(TupleLike&& t, Fn&& f) -> decltype(
+    invoke_with_order(camp::forward<TupleLike>(t),
+                      camp::forward<Fn>(f),
+                      camp::make_idx_seq_t<tuple_size<TupleLike>::value>{}))
+{
+  return invoke_with_order(
+      camp::forward<TupleLike>(t),
+      camp::forward<Fn>(f),
+      camp::make_idx_seq_t<tuple_size<TupleLike>::value>{});
+}
+}
+
+namespace internal
+{
+/// Write the elements of \a t to \a os, separated by ", ".
+///
+/// The per-element output is expanded inside a braced initialiser list, whose
+/// clauses are evaluated strictly left to right.  Expanding it as function
+/// arguments would leave the evaluation order, and therefore the order of the
+/// printed elements, unspecified.
+///
+/// \param os  stream to write to
+/// \param t   tuple to print
+template <class Tuple, camp::idx_t... Idxs>
+void print_tuple(std::ostream& os, Tuple const& t, camp::idx_seq<Idxs...>)
+{
+  using expand = int[];
+  // the leading 0 keeps the array non-empty for an empty tuple
+  (void)expand{
+      0, ((void)(os << (Idxs == 0 ? "" : ", ") << camp::get<Idxs>(t)), 0)...};
+}
+}
+
+template <class... Args>  // streams a camp::tuple as "(" + elements via print_tuple + ")"
+auto operator<<(std::ostream& os, camp::tuple<Args...> const& t)
+    -> std::ostream&
+{
+  os << "(";
+  internal::print_tuple(os, t, camp::make_idx_seq_t<sizeof...(Args)>{});  // one index per element
+  return os << ")";
+}
+
+
+#endif /* camp_tuple_HPP__ */
diff --git a/include/camp/value.hpp b/include/camp/value.hpp
new file mode 100644
index 0000000000..80f7969f7b
--- /dev/null
+++ b/include/camp/value.hpp
@@ -0,0 +1,36 @@
+#ifndef CAMP_VALUE_HPP
+#define CAMP_VALUE_HPP
+
+#include "camp/number/number.hpp"
+
+namespace camp
+{
+
+/// \cond
+namespace detail
+{
+  struct nothing;
+}
+/// \endcond
+
+/// Wraps a type as a camp value; the wrapped type is exposed as `type`
+template <typename val = detail::nothing>
+struct value {
+  using type = val;
+};
+
+/// A non-value, in truth tests evaluates to false
+using nil = value<>;
+
+/// Trait behind is_value; the primary template yields camp::t for every type
+template <typename Val>
+struct is_value_s {
+  using type = camp::t;
+};
+
+/// Test whether a type is a valid camp value
+template <typename Val>
+using is_value = typename is_value_s<Val>::type;
+}
+
+#endif /* CAMP_VALUE_HPP */
diff --git a/examples/pi.cpp b/include/camp/value/eval.hpp
similarity index 76%
rename from examples/pi.cpp
rename to include/camp/value/eval.hpp
index 78eccfcba7..1cc3a8a25b 100644
--- a/examples/pi.cpp
+++ b/include/camp/value/eval.hpp
@@ -1,3 +1,6 @@
+#ifndef CAMP_VALUE_EVAL_HPP
+#define CAMP_VALUE_EVAL_HPP
+
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 // Copyright (c) 2016, Lawrence Livermore National Security, LLC.
 //
@@ -40,28 +43,13 @@
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-#include <cstdlib>
-#include <iostream>
-
-#include "RAJA/RAJA.hpp"
-#include "RAJA/util/defines.hpp"
-
-int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
+namespace camp
 {
-  typedef RAJA::seq_reduce reduce_policy;
-  typedef RAJA::seq_exec execute_policy;
-
-  RAJA::Index_type begin = 0;
-  RAJA::Index_type numBins = 512 * 512;
-
-  RAJA::ReduceSum<reduce_policy, double> piSum(0.0);
 
-  RAJA::forall<execute_policy>(begin, numBins, [=](int i) {
-    double x = (double(i) + 0.5) / numBins;
-    piSum += 4.0 / (1.0 + x * x);
-  });
+// Shorthand for the nested `type` member of Val, i.e. `typename Val::type`.
+template <typename Val>
+using eval = typename Val::type;
 
-  std::cout << "PI is ~ " << double(piSum) / numBins << std::endl;
+}  // end namespace camp
 
-  return 0;
-}
+#endif /* CAMP_VALUE_EVAL_HPP */
diff --git a/scripts/bgq_clang-3.9.0.sh b/scripts/bgq_clang-3.9.0.sh
index 19e6707c43..df5d78fd81 100755
--- a/scripts/bgq_clang-3.9.0.sh
+++ b/scripts/bgq_clang-3.9.0.sh
@@ -29,7 +29,7 @@ cmake \
 #  -C ${RAJA_DIR}/host-configs/bgqos/clang_3_9_0.cmake \
 #  -DCMAKE_INSTALL_PREFIX=../install-bgq_clang-3.9.0 \
 #  -DCMAKE_BUILD_TYPE=Release \
-#  -DRAJA_ENABLE_PERFSUITE=On \
-#  -DRAJA_ENABLE_APPLICATIONS=On \
+#  -DENABLE_PERFSUITE=On \
+#  -DENABLE_APPLICATIONS=On \
 #  "$@" \
 #  ${RAJA_DIR}
diff --git a/scripts/bgq_gcc-4.7.2.sh b/scripts/bgq_gcc-4.7.2.sh
index bb60ed43b6..2b5d8b8fb4 100755
--- a/scripts/bgq_gcc-4.7.2.sh
+++ b/scripts/bgq_gcc-4.7.2.sh
@@ -29,7 +29,7 @@ cmake \
 #  -C ${RAJA_DIR}/host-configs/bgqos/gcc_4_7_2.cmake \
 #  -DCMAKE_BUILD_TYPE=Release \
 #  -DCMAKE_INSTALL_PREFIX=../install-bgq_gcc-4.7.2 \
-#  -DRAJA_ENABLE_PERFSUITE=On \
-#  -DRAJA_ENABLE_APPLICATIONS=On \
+#  -DENABLE_PERFSUITE=On \
+#  -DENABLE_APPLICATIONS=On \
 #  "$@" \
 #  ${RAJA_DIR}
diff --git a/scripts/clang-3.8.0.sh b/scripts/clang-3.8.0.sh
index b85df62d4b..0a75cb766a 100755
--- a/scripts/clang-3.8.0.sh
+++ b/scripts/clang-3.8.0.sh
@@ -26,7 +26,7 @@ cmake \
 #cmake \
 #  -C ${RAJA_DIR}/host-configs/chaos/clang_3_8_0.cmake \
 #  -DCMAKE_INSTALL_PREFIX=../install-clang-3.8.0 \
-#  -DRAJA_ENABLE_PERFSUITE=On \
-#  -DRAJA_ENABLE_APPLICATIONS=On \
+#  -DENABLE_PERFSUITE=On \
+#  -DENABLE_APPLICATIONS=On \
 #  "$@" \
 #  ${RAJA_DIR}
diff --git a/scripts/clang-3.9.0.sh b/scripts/clang-3.9.0.sh
index 2a738f1629..a6649d10aa 100755
--- a/scripts/clang-3.9.0.sh
+++ b/scripts/clang-3.9.0.sh
@@ -26,7 +26,7 @@ cmake \
 #cmake \
 #  -C ${RAJA_DIR}/host-configs/chaos/clang_3_9_0.cmake \
 #  -DCMAKE_INSTALL_PREFIX=../install-clang-3.9.0 \
-#  -DRAJA_ENABLE_PERFSUITE=On \
-#  -DRAJA_ENABLE_APPLICATIONS=On \
+#  -DENABLE_PERFSUITE=On \
+#  -DENABLE_APPLICATIONS=On \
 #  "$@" \
 #  ${RAJA_DIR}
diff --git a/scripts/clang-3.9.0_WARN.sh b/scripts/clang-3.9.0_WARN.sh
index 3f4258d981..5d8fa670c0 100755
--- a/scripts/clang-3.9.0_WARN.sh
+++ b/scripts/clang-3.9.0_WARN.sh
@@ -20,8 +20,7 @@ RAJA_DIR=$(git rev-parse --show-toplevel)
 cmake \
   -C ${RAJA_DIR}/host-configs/chaos/clang_3_9_0.cmake \
   -DCMAKE_BUILD_TYPE=Debug \
-  -DRAJA_ENABLE_WARNINGS=On \
-  -DRAJA_ENABLE_APPLICATIONS=On \
-  -DRAJA_ENABLE_WARNINGS=On \
+  -DENABLE_WARNINGS=On \
+  -DENABLE_APPLICATIONS=On \
   "$@" \
   ${RAJA_DIR}
diff --git a/scripts/gcc-4.9.3.sh b/scripts/gcc-4.9.3.sh
index 929f25f3e4..cba0873902 100755
--- a/scripts/gcc-4.9.3.sh
+++ b/scripts/gcc-4.9.3.sh
@@ -26,7 +26,7 @@ cmake \
 #cmake \
 #  -C ${RAJA_DIR}/host-configs/chaos/gcc_4_9_3.cmake \
 #  -DCMAKE_INSTALL_PREFIX=../install-gcc-4.9.3 \
-#  -DRAJA_ENABLE_PERFSUITE=On \
-#  -DRAJA_ENABLE_APPLICATIONS=On \
+#  -DENABLE_PERFSUITE=On \
+#  -DENABLE_APPLICATIONS=On \
 #  "$@" \
 #  ${RAJA_DIR}
diff --git a/scripts/gcc-4.9.3_WARN.sh b/scripts/gcc-4.9.3_WARN.sh
index 6c16885c3f..87462fc522 100755
--- a/scripts/gcc-4.9.3_WARN.sh
+++ b/scripts/gcc-4.9.3_WARN.sh
@@ -20,8 +20,8 @@ RAJA_DIR=$(git rev-parse --show-toplevel)
 cmake \
   -C ${RAJA_DIR}/host-configs/chaos/gcc_4_9_3.cmake \
   -DCMAKE_BUILD_TYPE=Debug \
-  -DRAJA_ENABLE_WARNINGS=On \
-  -DRAJA_ENABLE_APPLICATIONS=On \
-  -DRAJA_ENABLE_PERFSUITE=On \
+  -DENABLE_WARNINGS=On \
+  -DENABLE_APPLICATIONS=On \
+  -DENABLE_PERFSUITE=On \
   "$@" \
   ${RAJA_DIR}
diff --git a/scripts/icpc-16.0.258.sh b/scripts/icpc-16.0.258.sh
index ffc4b058e3..04897ef687 100755
--- a/scripts/icpc-16.0.258.sh
+++ b/scripts/icpc-16.0.258.sh
@@ -27,7 +27,7 @@ cmake \
 #cmake \
 #  -C ${RAJA_DIR}/host-configs/chaos/icpc_16_0_258.cmake \
 #  -DCMAKE_INSTALL_PREFIX=../install-icpc-16.0.258 \
-#  -DRAJA_ENABLE_PERFSUITE=On \
-#  -DRAJA_ENABLE_APPLICATIONS=On \
+#  -DENABLE_PERFSUITE=On \
+#  -DENABLE_APPLICATIONS=On \
 #  "$@" \
 #  ${RAJA_DIR}
diff --git a/scripts/icpc-17.0.174.sh b/scripts/icpc-17.0.174.sh
index d7dc96e041..842c68e066 100755
--- a/scripts/icpc-17.0.174.sh
+++ b/scripts/icpc-17.0.174.sh
@@ -27,7 +27,7 @@ cmake \
 #cmake \
 #  -C ${RAJA_DIR}/host-configs/chaos/icpc_17_0_174.cmake \
 #  -DCMAKE_INSTALL_PREFIX=../install-icpc-1cpc-17.0.174 \
-#  -DRAJA_ENABLE_PERFSUITE=On \
-#  -DRAJA_ENABLE_APPLICATIONS=On \
+#  -DENABLE_PERFSUITE=On \
+#  -DENABLE_APPLICATIONS=On \
 #  "$@" \
 #  ${RAJA_DIR}
diff --git a/scripts/icpc-17.0.174_WARN.sh b/scripts/icpc-17.0.174_WARN.sh
index e241a080b3..b986799dd2 100755
--- a/scripts/icpc-17.0.174_WARN.sh
+++ b/scripts/icpc-17.0.174_WARN.sh
@@ -21,8 +21,8 @@ RAJA_DIR=$(git rev-parse --show-toplevel)
 cmake \
   -C ${RAJA_DIR}/host-configs/chaos/icpc_17_0_174.cmake \
   -DCMAKE_BUILD_TYPE=Debug \
-  -DRAJA_ENABLE_WARNINGS=On \
-  -DRAJA_ENABLE_APPLICATIONS=On \
-  -DRAJA_ENABLE_PERFSUITE=On \
+  -DENABLE_WARNINGS=On \
+  -DENABLE_APPLICATIONS=On \
+  -DENABLE_PERFSUITE=On \
   "$@" \
   ${RAJA_DIR}
diff --git a/scripts/nvcc-8.0_gcc-4.9.3.sh b/scripts/nvcc-8.0_gcc-4.9.3.sh
index 815b4565b2..0d36551545 100755
--- a/scripts/nvcc-8.0_gcc-4.9.3.sh
+++ b/scripts/nvcc-8.0_gcc-4.9.3.sh
@@ -24,5 +24,6 @@ cmake \
   -DRAJA_ENABLE_CUDA=On \
   -DCUDA_TOOLKIT_ROOT_DIR=/opt/cudatoolkit-8.0 \
   -DCMAKE_INSTALL_PREFIX=../install-nvcc-8.0_gcc-4.9.3 \
+  -DRAJA_ENABLE_PERFSUITE=On \
   "$@" \
   ${RAJA_DIR}
diff --git a/scripts/travis_build_and_test.sh b/scripts/travis_build_and_test.sh
index 09ea01d42f..cefe601c2a 100755
--- a/scripts/travis_build_and_test.sh
+++ b/scripts/travis_build_and_test.sh
@@ -15,7 +15,11 @@ or_die mkdir travis-build
 cd travis-build
 if [[ "$DO_BUILD" == "yes" ]] ; then
     or_die cmake -DCMAKE_CXX_COMPILER="${COMPILER}" ${CMAKE_EXTRA_FLAGS} ../
-    or_die make -j 3 VERBOSE=1
+    if [[ ${CMAKE_EXTRA_FLAGS} == *COVERAGE* ]] ; then
+      or_die make -j 3  # no VERBOSE=1 when the extra flags mention COVERAGE
+    else
+      or_die make -j 3 VERBOSE=1
+    fi
     if [[ "${DO_TEST}" == "yes" ]] ; then
         or_die ctest -V
     fi
diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index 3878fd6639..cb10d0da99 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -83,11 +83,11 @@ if (NOT RAJA_CONFIG_LOADED)
   set(RAJA_RT_LIBRARIES "${RT_LIBRARIES}")
 
   set(RAJA_TIMER_TYPE    ${RAJA_TIMER_TYPE})
-  set(RAJA_ENABLE_CUDA   ${RAJA_ENABLE_CUDA})
-  set(RAJA_ENABLE_FT     ${RAJA_ENABLE_FT})
-  set(RAJA_ENABLE_OPENMP ${RAJA_ENABLE_OPENMP})
-  set(RAJA_ENABLE_TARGET_OPENMP ${RAJA_ENABLE_TARGET_OPENMP})
-  set(RAJA_ENABLE_TESTS  ${RAJA_ENABLE_TESTS})
+  set(ENABLE_CUDA   ${ENABLE_CUDA})
+  set(ENABLE_FT     ${ENABLE_FT})
+  set(ENABLE_OPENMP ${ENABLE_OPENMP})
+  set(ENABLE_TARGET_OPENMP ${ENABLE_TARGET_OPENMP})
+  set(ENABLE_TESTS  ${ENABLE_TESTS})
   set(RAJA_REPORT_FT     ${RAJA_REPORT_FT})
   set(RAJA_USE_COMPLEX   ${RAJA_USE_COMPLEX})
   set(RAJA_USE_DOUBLE    ${RAJA_USE_DOUBLE})
diff --git a/src/AlignedRangeIndexSetBuilders.cpp b/src/AlignedRangeIndexSetBuilders.cpp
index 5c8360e3e2..35831dc5c7 100644
--- a/src/AlignedRangeIndexSetBuilders.cpp
+++ b/src/AlignedRangeIndexSetBuilders.cpp
@@ -9,7 +9,7 @@
  */
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -19,34 +19,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
deleted file mode 100644
index f8fbce3384..0000000000
--- a/src/CMakeLists.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-###############################################################################
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-set (raja_sources
-  AlignedRangeIndexSetBuilders.cpp
-  DepGraphNode.cpp
-  LockFreeIndexSetBuilders.cpp
-  MemUtils_CUDA.cpp
-  ThreadUtils_CPU.cpp)
-
-raja_add_library(
-  NAME RAJA
-  SOURCES ${raja_sources})
-
-install(TARGETS RAJA EXPORT RAJA
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-  RUNTIME DESTINATION lib
-)
-
-if (RAJA_ENABLE_TBB)
-  if (RAJA_ENABLE_CUDA)
-    if (RAJA_ENABLE_CLANG_CUDA)
-      target_link_libraries( RAJA ${TBB_LIBRARIES} )
-    else ()
-      target_link_libraries( RAJA PUBLIC ${TBB_LIBRARIES} )
-    endif ()
-  else ()
-    target_link_libraries( RAJA ${TBB_LIBRARIES} )
-  endif ()
-endif ()
-
-install(EXPORT RAJA DESTINATION share/raja/cmake/)
diff --git a/src/DepGraphNode.cpp b/src/DepGraphNode.cpp
index 5c10b65e0c..a0b7573b15 100644
--- a/src/DepGraphNode.cpp
+++ b/src/DepGraphNode.cpp
@@ -9,7 +9,7 @@
  */
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -19,41 +19,15 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-#include "RAJA/internal/DepGraphNode.hpp"
-
+#include <string>
 #include <iostream>
 
+#include "RAJA/internal/DepGraphNode.hpp"
+
 namespace RAJA
 {
 
diff --git a/src/LockFreeIndexSetBuilders.cpp b/src/LockFreeIndexSetBuilders.cpp
index f2dc64efda..4e0fe53870 100644
--- a/src/LockFreeIndexSetBuilders.cpp
+++ b/src/LockFreeIndexSetBuilders.cpp
@@ -9,7 +9,7 @@
  */
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -19,34 +19,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -240,12 +213,12 @@ void buildLockFreeBlockIndexset(RAJA::IndexSet& iset,
  ******************************************************************************
  */
 void buildLockFreeColorIndexset(RAJA::IndexSet& iset,
-                                int const* domainToRange,
+                                Index_type const* domainToRange,
                                 int numEntity,
                                 int numRangePerDomain,
                                 int numEntityRange,
-                                int* elemPermutation,
-                                int* ielemPermutation)
+                                Index_type* elemPermutation,
+                                Index_type* ielemPermutation)
 {
   bool done = false;
   bool* isMarked = new bool[numEntity];
diff --git a/src/MemUtils_CUDA.cpp b/src/MemUtils_CUDA.cpp
index e2bef289a2..5a10fd9743 100644
--- a/src/MemUtils_CUDA.cpp
+++ b/src/MemUtils_CUDA.cpp
@@ -9,12 +9,8 @@
  ******************************************************************************
  */
 
-#include "RAJA/config.hpp"
-
-#if defined(RAJA_ENABLE_CUDA)
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -24,594 +20,49 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
-#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+#include "RAJA/config.hpp"
 
-#include "RAJA/util/types.hpp"
+#if defined(RAJA_ENABLE_CUDA)
 
-#include "RAJA/pattern/reduce.hpp"
+#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 
-#include <cassert>
-#include <iostream>
-#include <string>
-
-#define RAJA_MAX_REDUCE_VARS 8
-
 namespace RAJA
 {
 
-namespace
+namespace cuda
 {
-/*!
- * \brief Number of currently active cuda reduction objects
- */
-int s_cuda_reducer_active_count = 0;
-
-/*!
- * \brief Number of cuda memblocks currently being used.
- */
-int s_cuda_memblock_used_count = 0;
-
-/*!
- * \brief Static array used to keep track of which unique ids
- * for CUDA reduction objects are used and which are not.
- */
-bool s_cuda_reduction_id_used[RAJA_MAX_REDUCE_VARS];
-
-/*!
- * \brief Static array used to keep track of which reduction
- * memblocks are in use.
- */
-bool s_cuda_reduction_memblock_used[RAJA_MAX_REDUCE_VARS];
-
-/*!
- * \brief Pointer to device memory block for RAJA-Cuda reductions.
- */
-CudaReductionDummyBlockType* s_cuda_reduction_mem_block = 0;
-
-/*!
- * \brief Pointer to the tally block on the device.
- *
- * The tally block is a number of contiguous slots in memory where the
- * results of cuda reduction variables are stored. This is done so all
- * results may be copied back to the host with one memcpy.
- */
-CudaReductionDummyTallyType* s_cuda_reduction_tally_block_device = 0;
-
-/*!
- * \brief Pointer to the tally block cache on the host.
- *
- * This cache allows multiple reads from the tally cache on the host to
- * incur only one memcpy from device memory. This cache also allows
- * multiple cuda reduction variables to be initialized without writing to
- * device memory. Changes to this cache are written back to the device
- * tally block in the next forall, before kernel launch so they are visible
- * on the device when needed.
- *
- * Note: This buffer must be allocated in pageable memory (not pinned).
- * CudaMemcpyAsync is always asynchronous with respect to managed memory.
- * However, while cudaMemcpyAsync is asynchronous to the host when used with
- * pinned or managed memory, it is synchronous to the host if the target
- * buffer is host pageable memory. Due to overheads associated with
- * synchronization of managed memory, using cudaMemcpyAsync with pageable
- * memory takes less time overall than using a synchronous routine. If
- * synchronizing managed memory incurs a smaller penalty inthe future, then
- * using other memory types could take less time.
- */
-CudaReductionDummyTallyType* s_cuda_reduction_tally_block_host = 0;
-
-//
-/////////////////////////////////////////////////////////////////////////////
-//
-// Variables representing the state of the tally block cache on the host.
-//
-/////////////////////////////////////////////////////////////////////////////
-//
-
-/*!
- * \brief Validity of host tally block cache.
- *
- * Valid means that all slots are up to date and can be read from the cache.
- * Invalid means that only dirty slots are up to date.
- */
-bool s_tally_valid = true;
-/*
- * \brief The number of slots that should be written back to the device
- *        tally block.
- */
-int s_tally_dirty = 0;
-/*
- * \brief Holds the dirty status of each slot.
- *
- * True indicates a slot written to by the host, but not copied back to
- * the device tally block.
- */
-bool s_tally_block_dirty[RAJA_CUDA_REDUCE_TALLY_LENGTH] = {false};
 
+namespace detail
+{
 //
 /////////////////////////////////////////////////////////////////////////////
 //
-// Variables representing the state of dynamic shared memory usage.
+// Variables representing the state of execution.
 //
 /////////////////////////////////////////////////////////////////////////////
 //
 
-/*!
- * \brief State of the host code, whether it is currently in a raja
- *        cuda forall function or not.
- */
-int s_raja_cuda_forall_level = 0;
-/*!
- * \brief The amount of shared memory currently earmarked for use in
- *        the current forall.
- */
-int s_shared_memory_amount_total = 0;
-/*!
- * \brief shared_memory_offsets holds the byte offsets into dynamic shared
- *        memory for each reduction variable.
- *
- * Note: -1 indicates a reduction variable that is not participating in
- * the current forall.
- */
-int s_shared_memory_offsets[RAJA_MAX_REDUCE_VARS] = {-1};
-/*!
- * \brief Holds the number of threads expected by each reduction variable.
- *
- * This is used to check the execution policy against the reduction policies
- * of participating reduction varaibles.
- *
- * Note: -1 indicates a reduction variable that is not participating in the
- * current forall and 0 represents a reduction variable whose execution does
- * not depend on the number of threads used by the execution policy.
- */
-int s_cuda_reduction_num_threads[RAJA_MAX_REDUCE_VARS] = {-1};
-}
-
-/*
-*******************************************************************************
-*
-* Return number of active cuda reducer objects.
-*
-*******************************************************************************
-*/
-int getCudaReducerActiveCount() { return s_cuda_reducer_active_count; }
-
-/*
-*******************************************************************************
-*
-* Return number of active cuda memblocks.
-*
-*******************************************************************************
-*/
-int getCudaMemblockUsedCount() { return s_cuda_memblock_used_count; }
-
-/*
-*******************************************************************************
-*
-* Return next available valid reduction id, or complain and exit if
-* no valid id is available.
-*
-*******************************************************************************
-*/
-int getCudaReductionId()
-{
-  static int first_time_called = true;
-
-  if (first_time_called) {
-    s_cuda_reducer_active_count = 0;
-    s_cuda_memblock_used_count = 0;
-
-    for (int id = 0; id < RAJA_MAX_REDUCE_VARS; ++id) {
-      s_cuda_reduction_id_used[id] = false;
-    }
-
-    first_time_called = false;
-  }
-
-  int id = 0;
-  while (id < RAJA_MAX_REDUCE_VARS && s_cuda_reduction_id_used[id]) {
-    id++;
-  }
-
-  if (id >= RAJA_MAX_REDUCE_VARS) {
-    std::cerr << "\n Exceeded allowable RAJA CUDA reduction count, "
-              << "FILE: " << __FILE__ << " line: " << __LINE__ << std::endl;
-    exit(1);
-  }
-
-  s_cuda_reducer_active_count++;
-  s_cuda_reduction_id_used[id] = true;
-
-  return id;
-}
-
-/*
-*******************************************************************************
-*
-* Release given reduction id and make inactive.
-*
-*******************************************************************************
-*/
-void releaseCudaReductionId(int id)
-{
-  if (id < RAJA_MAX_REDUCE_VARS) {
-    s_cuda_reducer_active_count--;
-    s_cuda_reduction_id_used[id] = false;
-    if (s_cuda_reduction_memblock_used[id]) {
-      s_cuda_memblock_used_count--;
-      s_cuda_reduction_memblock_used[id] = false;
-    }
-  }
-}
-
-/*
-*******************************************************************************
-*
-* Return pointer into RAJA-CUDA reduction device memory block
-* for reducer object with given id. Allocate block if not already allocated.
-*
-*******************************************************************************
-*/
-void getCudaReductionMemBlock(int id, void** device_memblock)
-{
-  if (s_cuda_reduction_mem_block == 0) {
-    cudaErrchk(
-        cudaMalloc((void**)&s_cuda_reduction_mem_block,
-                   sizeof(CudaReductionDummyBlockType) * RAJA_MAX_REDUCE_VARS));
-
-    for (int i = 0; i < RAJA_MAX_REDUCE_VARS; ++i) {
-      s_cuda_reduction_memblock_used[i] = false;
-    }
-
-    atexit(freeCudaReductionMemBlock);
-  }
-
-  s_cuda_memblock_used_count++;
-  s_cuda_reduction_memblock_used[id] = true;
-
-  *device_memblock = &(s_cuda_reduction_mem_block[id]);
-}
-
-/*
-*******************************************************************************
-*
-* Free device memory blocks used in RAJA-Cuda reductions.
-*
-*******************************************************************************
-*/
-void freeCudaReductionMemBlock()
-{
-  if (s_cuda_reduction_mem_block != 0) {
-    cudaErrchk(cudaFree(s_cuda_reduction_mem_block));
-    s_cuda_reduction_mem_block = 0;
-  }
-}
-
-
-/*
-*******************************************************************************
-*
-* Return pointer into RAJA-CUDA reduction host tally block cache
-* and device tally block for reducer object with given id.
-* Allocate blocks if not already allocated.
-*
-*******************************************************************************
-*/
-void getCudaReductionTallyBlock(int id, void** host_tally, void** device_tally)
-{
-  if (s_cuda_reduction_tally_block_host == 0) {
-    s_cuda_reduction_tally_block_host =
-        new CudaReductionDummyTallyType[RAJA_CUDA_REDUCE_TALLY_LENGTH];
-
-    cudaErrchk(cudaMalloc((void**)&s_cuda_reduction_tally_block_device,
-                          sizeof(CudaReductionDummyTallyType)
-                              * RAJA_CUDA_REDUCE_TALLY_LENGTH));
-
-    s_tally_valid = true;
-    s_tally_dirty = 0;
-    for (int i = 0; i < RAJA_CUDA_REDUCE_TALLY_LENGTH; ++i) {
-      s_tally_block_dirty[i] = false;
-    }
-
-    atexit(freeCudaReductionTallyBlock);
-  }
-
-  s_tally_dirty += 1;
-  // set block dirty
-  s_tally_block_dirty[id] = true;
-
-  *host_tally = &(s_cuda_reduction_tally_block_host[id]);
-  *device_tally = &(s_cuda_reduction_tally_block_device[id]);
-}
-
-/*
-*******************************************************************************
-*
-* Write back dirty tally blocks to device tally blocks.
-* Can be called before tally blocks have been allocated.
-*
-*******************************************************************************
-*/
-static void writeBackCudaReductionTallyBlock()
-{
-  if (s_tally_dirty > 0) {
-    int first = 0;
-    while (first < RAJA_CUDA_REDUCE_TALLY_LENGTH) {
-      if (s_tally_block_dirty[first]) {
-        int end = first + 1;
-        while (end < RAJA_CUDA_REDUCE_TALLY_LENGTH
-               && s_tally_block_dirty[end]) {
-          end++;
-        }
-        int len = (end - first);
-        cudaErrchk(cudaMemcpyAsync(&s_cuda_reduction_tally_block_device[first],
-                                   &s_cuda_reduction_tally_block_host[first],
-                                   sizeof(CudaReductionDummyTallyType) * len,
-                                   cudaMemcpyHostToDevice));
-
-        for (int i = first; i < end; ++i) {
-          s_tally_block_dirty[i] = false;
-        }
-        first = end + 1;
-      } else {
-        first++;
-      }
-    }
-    s_tally_dirty = 0;
-  }
-}
-
-/*
-*******************************************************************************
-*
-* Read tally block from device if invalid on host.
-* Must be called after tally blocks have been allocated.
-* The Async version is synchronous on the host if
-* s_cuda_reduction_tally_block_host is allocated as pageable host memory
-* and not if allocated as pinned host memory or managed memory.
-*
-*******************************************************************************
-*/
-static void readCudaReductionTallyBlockAsync()
-{
-  if (!s_tally_valid) {
-    cudaErrchk(cudaMemcpyAsync(&s_cuda_reduction_tally_block_host[0],
-                               &s_cuda_reduction_tally_block_device[0],
-                               sizeof(CudaReductionDummyTallyType)
-                                   * RAJA_CUDA_REDUCE_TALLY_LENGTH,
-                               cudaMemcpyDeviceToHost));
-    s_tally_valid = true;
-  }
-}
-static void readCudaReductionTallyBlock()
-{
-  if (!s_tally_valid) {
-    cudaErrchk(cudaMemcpy(&s_cuda_reduction_tally_block_host[0],
-                          &s_cuda_reduction_tally_block_device[0],
-                          sizeof(CudaReductionDummyTallyType)
-                              * RAJA_CUDA_REDUCE_TALLY_LENGTH,
-                          cudaMemcpyDeviceToHost));
-    s_tally_valid = true;
-  }
-}
-
-/*
-*******************************************************************************
-*
-* Must be called before each RAJA cuda kernel, and before the copy of the
-* loop body to setup state of the dynamic shared memory variables.
-* Ensures that all updates to the tally block are visible on the device by
-* writing back dirty cache lines; this invalidates the tally cache on the host.
-*
-*******************************************************************************
-*/
-void beforeCudaKernelLaunch()
-{
-  s_raja_cuda_forall_level++;
-  if (s_raja_cuda_forall_level == 1) {
-    if (s_cuda_reducer_active_count > 0) {
-      s_shared_memory_amount_total = 0;
-      for (int i = 0; i < RAJA_MAX_REDUCE_VARS; ++i) {
-        s_shared_memory_offsets[i] = -1;
-      }
-      for (int i = 0; i < RAJA_MAX_REDUCE_VARS; ++i) {
-        s_cuda_reduction_num_threads[i] = -1;
-      }
-
-      s_tally_valid = false;
-      writeBackCudaReductionTallyBlock();
-    }
-  }
-}
-
-/*
-*******************************************************************************
-*
-* Must be called after each RAJA cuda kernel.
-* This resets the state of the dynamic shared memory variables.
-*
-*******************************************************************************
-*/
-void afterCudaKernelLaunch() { s_raja_cuda_forall_level--; }
-
-/*
-*******************************************************************************
-*
-* Must be called before reading the tally block cache on the host.
-* Ensures that the host tally block cache for cuda reduction variable id can
-* be read.
-* Writes any host changes to the tally block cache to the device before
-* updating the host tally blocks with the values on the GPU.
-* The Async version is only asynchronous with regards to managed memory and
-* is synchronous to host code.
-*
-*******************************************************************************
-*/
-void beforeCudaReadTallyBlockAsync(int id)
-{
-  if (!s_tally_block_dirty[id]) {
-    writeBackCudaReductionTallyBlock();
-    readCudaReductionTallyBlockAsync();
-  }
-}
-///
-void beforeCudaReadTallyBlockSync(int id)
-{
-  if (!s_tally_block_dirty[id]) {
-    writeBackCudaReductionTallyBlock();
-    readCudaReductionTallyBlock();
-  }
-}
-
-/*
-*******************************************************************************
-*
-* Release tally block of reduction variable with id.
-*
-*******************************************************************************
-*/
-void releaseCudaReductionTallyBlock(int id)
-{
-  if (s_tally_block_dirty[id]) {
-    s_tally_block_dirty[id] = false;
-    s_tally_dirty -= 1;
-  }
-}
-
-/*
-*******************************************************************************
-*
-* Free managed memory blocks used in RAJA-Cuda reductions.
-*
-*******************************************************************************
-*/
-void freeCudaReductionTallyBlock()
-{
-  if (s_cuda_reduction_tally_block_host != 0) {
-    delete[] s_cuda_reduction_tally_block_host;
-    cudaErrchk(cudaFree(s_cuda_reduction_tally_block_device));
-    s_cuda_reduction_tally_block_host = 0;
-  }
-}
-
-/*
-*******************************************************************************
-*
-* Earmark num_threads * size bytes of dynamic shared memory and get the byte
-* offset.
-*
-*******************************************************************************
-*/
-int getCudaSharedmemOffset(int id, dim3 reductionBlockDim, int size)
-{
-  assert(id < RAJA_MAX_REDUCE_VARS);
-
-  if (s_raja_cuda_forall_level > 0) {
-    if (s_shared_memory_offsets[id] < 0) {
-      // in a forall and have not yet gotten shared memory
-
-      s_shared_memory_offsets[id] = s_shared_memory_amount_total;
-
-      int num_threads =
-          reductionBlockDim.x * reductionBlockDim.y * reductionBlockDim.z;
-
-      // ignore reduction variables that don't use dynamic shared memory
-      s_cuda_reduction_num_threads[id] = (size > 0) ? num_threads : 0;
-
-      s_shared_memory_amount_total += num_threads * size;
-    }
-    return s_shared_memory_offsets[id];
-  } else {
-    return -1;
-  }
-}
-
-/*
-*******************************************************************************
-*
-* Get size in bytes of dynamic shared memory.
-* Check that the number of blocks launched is consistent with the max number of
-* blocks reduction variables can handle.
-* Check that execution policy num_threads is consistent with active reduction
-* policy num_threads.
-*
-*******************************************************************************
-*/
-int getCudaSharedmemAmount(dim3 launchGridDim, dim3 launchBlockDim)
-{
-  if (s_cuda_reducer_active_count > 0) {
-    int launch_num_blocks = launchGridDim.x * launchGridDim.y * launchGridDim.z;
-
-    int launch_num_threads =
-        launchBlockDim.x * launchBlockDim.y * launchBlockDim.z;
+//! State of the host code globally
+cudaInfo g_status;
 
-    for (int i = 0; i < RAJA_MAX_REDUCE_VARS; ++i) {
-      int reducer_num_threads = s_cuda_reduction_num_threads[i];
+//! State of the host code in this thread
+cudaInfo tl_status;
+#if defined(RAJA_ENABLE_OPENMP) && defined(_OPENMP)
+#pragma omp threadprivate(tl_status)
+#endif
 
-      // check if reducer is active
-      if (reducer_num_threads >= 0) {
+//! State of RAJA CUDA stream synchronization for CUDA reducer objects
+std::unordered_map<cudaStream_t, bool> g_stream_info_map{ {cudaStream_t(0), true} };
 
-        // check if reducer cares about number of blocks
-        if (s_cuda_reduction_memblock_used[i]
-            && launch_num_blocks > RAJA_CUDA_MAX_NUM_BLOCKS) {
-          std::cerr << "\n Cuda execution error: "
-                    << "Can't launch " << launch_num_blocks << " blocks, "
-                    << "RAJA_CUDA_MAX_NUM_BLOCKS = " << RAJA_CUDA_MAX_NUM_BLOCKS
-                    << ", "
-                    << "FILE: " << __FILE__ << " line: " << __LINE__
-                    << std::endl;
-          exit(1);
-        }
+}  // closing brace for detail namespace
 
-        // check if reducer cares about number of threads
-        if (reducer_num_threads > 0
-            && launch_num_threads > reducer_num_threads) {
-          std::cerr << "\n Cuda execution, reduction policy mismatch: "
-                    << "reduction policy with BLOCK_SIZE "
-                    << reducer_num_threads
-                    << " can't be used with execution policy with BLOCK_SIZE "
-                    << launch_num_threads << ", "
-                    << "FILE: " << __FILE__ << " line: " << __LINE__
-                    << std::endl;
-          exit(1);
-        }
-      }
-    }
-  }
-  return s_shared_memory_amount_total;
-}
+}  // closing brace for cuda namespace
 
 }  // closing brace for RAJA namespace
 
diff --git a/src/ThreadUtils_CPU.cpp b/src/ThreadUtils_CPU.cpp
index dcb9644fbc..5528cf4188 100644
--- a/src/ThreadUtils_CPU.cpp
+++ b/src/ThreadUtils_CPU.cpp
@@ -10,7 +10,7 @@
  */
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -20,34 +20,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e2e7056bb2..6819d4560d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -10,34 +10,7 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
diff --git a/test/include/RAJA_gtest.hpp b/test/include/RAJA_gtest.hpp
index a158af3caa..6793280d15 100644
--- a/test/include/RAJA_gtest.hpp
+++ b/test/include/RAJA_gtest.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef RAJA_gtest_HPP
-#define RAJA_gtest_HPP
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef RAJA_gtest_HPP
+#define RAJA_gtest_HPP
+
 #include "gtest/gtest.h"
 
 #define CUDA_TEST(X, Y)                 \
@@ -71,48 +44,46 @@
   }                                                           \
   static void cuda_test_f_##test_fixture##_##test_name()
 
-#define CUDA_TEST_P(test_case_name, test_name)                                 \
-  template <typename Invocable>                                                \
-  static void gtest_cuda_##test_case_name##_##test_name(Invocable &&);         \
-  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                      \
-      : public test_case_name                                                  \
-  {                                                                            \
-  public:                                                                      \
-    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                     \
-    virtual void TestBody()                                                    \
-    {                                                                          \
-      gtest_cuda_##test_case_name##_##test_name([&] { return GetParam(); });   \
-    }                                                                          \
-                                                                               \
-  private:                                                                     \
-    static int AddToRegistry()                                                 \
-    {                                                                          \
-      ::testing::UnitTest::GetInstance()                                       \
-          ->parameterized_test_registry()                                      \
-          .GetTestCasePatternHolder<test_case_name>(                           \
-              #test_case_name,                                                 \
-              ::testing::internal::CodeLocation(__FILE__, __LINE__))           \
-          ->AddTestPattern(                                                    \
-              #test_case_name,                                                 \
-              #test_name,                                                      \
-              new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
-                  test_case_name, test_name)>());                              \
-      return 0;                                                                \
-    }                                                                          \
-    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;               \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,     \
-                                                           test_name));        \
-  };                                                                           \
-  int GTEST_TEST_CLASS_NAME_(test_case_name,                                   \
-                             test_name)::gtest_registering_dummy_ =            \
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry();      \
-  template <typename Invocable>                                                \
+#define CUDA_TEST_P(test_case_name, test_name)                               \
+  template <typename Invocable>                                              \
+  static void gtest_cuda_##test_case_name##_##test_name(Invocable &&);       \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                    \
+      : public test_case_name                                                \
+  {                                                                          \
+  public:                                                                    \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                   \
+    virtual void TestBody()                                                  \
+    {                                                                        \
+      gtest_cuda_##test_case_name##_##test_name([&] { return GetParam(); }); \
+    }                                                                        \
+                                                                             \
+  private:                                                                   \
+    static int AddToRegistry()                                               \
+    {                                                                        \
+      ::testing::UnitTest::GetInstance()                                     \
+          ->parameterized_test_registry()                                    \
+          .GetTestCasePatternHolder<test_case_name>(                         \
+              #test_case_name,                                               \
+              ::testing::internal::CodeLocation(__FILE__, __LINE__))         \
+          ->AddTestPattern(                                                  \
+              #test_case_name,                                               \
+              #test_name,                                                    \
+              new ::testing::internal::TestMetaFactory<                      \
+                  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>());     \
+      return 0;                                                              \
+    }                                                                        \
+    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;             \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,   \
+                                                           test_name));      \
+  };                                                                         \
+  int GTEST_TEST_CLASS_NAME_(test_case_name,                                 \
+                             test_name)::gtest_registering_dummy_ =          \
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry();    \
+  template <typename Invocable>                                              \
   static void gtest_cuda_##test_case_name##_##test_name(Invocable &&GetParam)
 
 
 #define CUDA_TYPED_TEST_P(CaseName, TestName)                            \
-  template <typename TypeParam>                                          \
-  static void cuda_typed_test_p_##CaseName##_##TestName();               \
   namespace GTEST_CASE_NAMESPACE_(CaseName)                              \
   {                                                                      \
     template <typename gtest_TypeParam_>                                 \
@@ -121,10 +92,9 @@
     private:                                                             \
       typedef CaseName<gtest_TypeParam_> TestFixture;                    \
       typedef gtest_TypeParam_ TypeParam;                                \
-      virtual void TestBody()                                            \
-      {                                                                  \
-        cuda_typed_test_p_##CaseName##_##TestName<gtest_TypeParam_>();   \
-      }                                                                  \
+                                                                         \
+    public:                                                              \
+      virtual void TestBody();                                           \
     };                                                                   \
     static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ =    \
         GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(__FILE__,   \
@@ -133,6 +103,6 @@
                                                              #TestName); \
   }                                                                      \
   template <typename TypeParam>                                          \
-  static void cuda_typed_test_p_##CaseName##_##TestName()
+  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<TypeParam>::TestBody()
 
 #endif  // closing endif for header file include guard
diff --git a/test/include/type_helper.hpp b/test/include/type_helper.hpp
index e9b2d4aa75..5c6a7fe0a5 100644
--- a/test/include/type_helper.hpp
+++ b/test/include/type_helper.hpp
@@ -8,11 +8,8 @@
  ******************************************************************************
  */
 
-#ifndef _TYPE_HELPER_HPP_
-#define _TYPE_HELPER_HPP_
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -22,37 +19,13 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/LICENSE.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
+#ifndef _TYPE_HELPER_HPP_
+#define _TYPE_HELPER_HPP_
+
 #include "gtest/gtest.h"
 
 #include <tuple>
diff --git a/test/integration/CMakeLists.txt b/test/integration/CMakeLists.txt
index 474a20ac61..0ba36a3ad9 100644
--- a/test/integration/CMakeLists.txt
+++ b/test/integration/CMakeLists.txt
@@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -10,37 +10,10 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
-if (RAJA_ENABLE_CHAI)
+if (ENABLE_CHAI)
   add_subdirectory(chai)
-endif(RAJA_ENABLE_CHAI)
+endif(ENABLE_CHAI)
diff --git a/test/integration/chai/CMakeLists.txt b/test/integration/chai/CMakeLists.txt
index a303b7ee88..c4a0131912 100644
--- a/test/integration/chai/CMakeLists.txt
+++ b/test/integration/chai/CMakeLists.txt
@@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -10,42 +10,15 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
-if (RAJA_ENABLE_CUDA)
+if (ENABLE_CUDA)
   raja_add_test(
     NAME test-chai-nested
     SOURCES chai-nested.cpp)
-endif(RAJA_ENABLE_CUDA)
+endif(ENABLE_CUDA)
 
 raja_add_test(
   NAME test-chai-policy
diff --git a/test/integration/chai/chai-nested.cpp b/test/integration/chai/chai-nested.cpp
index e433bcd064..723f16b4a4 100644
--- a/test/integration/chai/chai-nested.cpp
+++ b/test/integration/chai/chai-nested.cpp
@@ -1,12 +1,21 @@
-/*
- * Copyright (c) 2016, Lawrence Livermore National Security, LLC.
- * Produced at the Lawrence Livermore National Laboratory.
- *
- * All rights reserved.
- *
- * This source code cannot be distributed without permission and
- * further review from Lawrence Livermore National Laboratory.
- */
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// Source file containing tests for CHAI in RAJA nested loops.
+///
 
 #include <time.h>
 #include <cfloat>
diff --git a/test/integration/chai/chai-policy-tests.cpp b/test/integration/chai/chai-policy-tests.cpp
index cfb8a0ccd8..430b094b13 100644
--- a/test/integration/chai/chai-policy-tests.cpp
+++ b/test/integration/chai/chai-policy-tests.cpp
@@ -1,3 +1,22 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// Source file containing tests for CHAI with different RAJA policies
+///
+
 #include "gtest/gtest.h"
 
 #include "chai/ExecutionSpaces.hpp"
@@ -26,7 +45,6 @@ static_assert(RAJA::detail::get_space<RAJA::NestedPolicy< RAJA::ExecList< RAJA::
 #endif
 
 TEST(ChaiPolicyTest, Default) {
-
 #if defined(RAJA_ENABLE_CUDA)
   std::cout << RAJA::detail::get_space<RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<128> > >::value << std::endl;
 #else
diff --git a/test/integration/chai/chai-tests.cpp b/test/integration/chai/chai-tests.cpp
index 4cb850ce9e..210eb86992 100644
--- a/test/integration/chai/chai-tests.cpp
+++ b/test/integration/chai/chai-tests.cpp
@@ -1,3 +1,22 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// Source file containing tests for CHAI with basic RAJA constructs
+///
+
 #include "chai/ManagedArray.hpp"
 
 #include "RAJA/RAJA.hpp"
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 37f7f12051..9b3e68b75e 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -10,34 +10,7 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
@@ -73,16 +46,20 @@ raja_add_test(
   NAME test-span
   SOURCES test-span.cpp)
 
+raja_add_test(
+  NAME nested
+  SOURCES nested.cpp)
+
 raja_add_test(
   NAME test-multipolicy
   SOURCES test-multipolicy.cpp)
 
 add_subdirectory(cpu)
 
-if(RAJA_ENABLE_CUDA)
+if(ENABLE_CUDA)
   add_subdirectory(cuda)
-endif(RAJA_ENABLE_CUDA)
+endif(ENABLE_CUDA)
 
-if(RAJA_ENABLE_TARGET_OPENMP)
+if(ENABLE_TARGET_OPENMP)
   add_subdirectory(omp-target)
-endif(RAJA_ENABLE_TARGET_OPENMP)
+endif(ENABLE_TARGET_OPENMP)
diff --git a/test/unit/README b/test/unit/README
deleted file mode 100644
index ed23da5803..0000000000
--- a/test/unit/README
+++ /dev/null
@@ -1,23 +0,0 @@
-#
-# The subdirectories in this directory contain simple tests of RAJA
-# traversal and reduction operations. These codes illustrate basic 
-# RAJA usage and are also a good sanity check to make sure that the
-# cod is built correctly and works properly.
-#
-# The subdirectory CPUtests contains two files for testing CPU execution, 
-# one for traversals and one for traversals with reductions. The tests 
-# use RAJA IndexSets with multiple Range and List Segments. Tests for the
-# nested loop RAJA constructs are also included. When RAJA is compiled, 
-# all execution policy variants available for the compiler are generated. 
-#
-# The subdirectory GPUtests contains four files for testing CUDA GPU 
-# execution, one for traversals and three for traversals with reductions 
-# (min, max, and sum). The tests use raw index ranges and RAJA IndexSets 
-# with multiple Range Segments. RAJA must be built with CUDA enabled to
-# generate executables for these tests.
-#
-# NOTE: When running the CUDA tests, we advise you to set the
-#       environment variable CUDA_VISIBLE_DEVICES to zero before running.
-#       We are using CUDA Unified Memory and we find that this setting 
-#       greatly improves performance.
-#
diff --git a/test/unit/cpu/CMakeLists.txt b/test/unit/cpu/CMakeLists.txt
index 93745f19f7..7c8a441890 100644
--- a/test/unit/cpu/CMakeLists.txt
+++ b/test/unit/cpu/CMakeLists.txt
@@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -10,40 +10,14 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
-raja_add_library(
+blt_add_library(
   NAME bis
-  SOURCES buildIndexSet.cpp)
+  SOURCES buildIndexSet.cpp
+  DEPENDS_ON RAJA ${raja_depends})
 
 raja_add_test(
   NAME test-reduce
diff --git a/test/unit/cpu/buildIndexSet.cpp b/test/unit/cpu/buildIndexSet.cpp
index fe1a0e1c96..14325ded00 100644
--- a/test/unit/cpu/buildIndexSet.cpp
+++ b/test/unit/cpu/buildIndexSet.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/cpu/buildIndexSet.hpp b/test/unit/cpu/buildIndexSet.hpp
index 2c34a4bb36..da26672d17 100644
--- a/test/unit/cpu/buildIndexSet.hpp
+++ b/test/unit/cpu/buildIndexSet.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/cpu/test-forall.cpp b/test/unit/cpu/test-forall.cpp
index 93ee4a06ab..f69c0fb297 100644
--- a/test/unit/cpu/test-forall.cpp
+++ b/test/unit/cpu/test-forall.cpp
@@ -1,3 +1,18 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-689114
+//
+// All rights reserved.
+//
+// This file is part of RAJA.
+//
+// For details about use and distribution, please read RAJA/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
 #include <cstdlib>
 
 #include <string>
@@ -92,6 +107,7 @@ REGISTER_TYPED_TEST_CASE_P(ForallTest, BasicForall, BasicForallIcount);
 
 using SequentialTypes = ::testing::Types<
     ExecPolicy<seq_segit, seq_exec>,
+    ExecPolicy<seq_segit, loop_exec>,
     ExecPolicy<seq_segit, simd_exec> >;
 
 INSTANTIATE_TYPED_TEST_CASE_P(Sequential, ForallTest, SequentialTypes);
@@ -101,7 +117,7 @@ INSTANTIATE_TYPED_TEST_CASE_P(Sequential, ForallTest, SequentialTypes);
 using OpenMPTypes = ::testing::Types<
     ExecPolicy<seq_segit, omp_parallel_for_exec>,
     ExecPolicy<omp_parallel_for_segit, seq_exec>,
-    ExecPolicy<omp_parallel_for_segit, simd_exec> >;
+    ExecPolicy<omp_parallel_for_segit, loop_exec> >;
 
 INSTANTIATE_TYPED_TEST_CASE_P(OpenMP, ForallTest, OpenMPTypes);
 #endif
@@ -110,10 +126,10 @@ INSTANTIATE_TYPED_TEST_CASE_P(OpenMP, ForallTest, OpenMPTypes);
 using TBBTypes = ::testing::Types<
     ExecPolicy<seq_segit, tbb_for_exec>,
     ExecPolicy<tbb_for_exec, seq_exec>,
-    ExecPolicy<tbb_for_exec, simd_exec>,
+    ExecPolicy<tbb_for_exec, loop_exec>,
     ExecPolicy<seq_segit, tbb_for_dynamic>,
     ExecPolicy<tbb_for_dynamic, seq_exec>,
-    ExecPolicy<tbb_for_dynamic, simd_exec>
+    ExecPolicy<tbb_for_dynamic, loop_exec>
     >;
 
 INSTANTIATE_TYPED_TEST_CASE_P(TBB, ForallTest, TBBTypes);
diff --git a/test/unit/cpu/test-indexsets.cpp b/test/unit/cpu/test-indexsets.cpp
index ae7875b009..6610dd8c7d 100644
--- a/test/unit/cpu/test-indexsets.cpp
+++ b/test/unit/cpu/test-indexsets.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/cpu/test-nested-reduce.cpp b/test/unit/cpu/test-nested-reduce.cpp
index d76d57f3a7..3b0865ccc6 100644
--- a/test/unit/cpu/test-nested-reduce.cpp
+++ b/test/unit/cpu/test-nested-reduce.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/cpu/test-nested.cpp b/test/unit/cpu/test-nested.cpp
index 2080b69630..682c3bc565 100644
--- a/test/unit/cpu/test-nested.cpp
+++ b/test/unit/cpu/test-nested.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -104,21 +77,18 @@ using TRANSFORMS =
 template <typename TRANSFORMS>
 using POLICIES =
     std::tuple<ExecInfo<TRANSFORMS, seq_exec, seq_exec>
-               //,ExecInfo<TRANSFORMS, seq_exec, simd_exec> //Need to fix... 
-               //ExecInfo<TRANSFORMS, simd_exec, simd_exec>
+               ,ExecInfo<TRANSFORMS, seq_exec, loop_exec>
+               ,ExecInfo<TRANSFORMS, loop_exec, loop_exec>
 #if defined(RAJA_ENABLE_OPENMP)
-               //,
-               // ExecInfo<TRANSFORMS, seq_exec, omp_parallel_for_exec>,
-               //OMPExecInfo<TRANSFORMS, simd_exec, omp_for_nowait_exec>,
-               //OMPExecInfo<TRANSFORMS, simd_exec, omp_for_nowait_exec>
+               ,ExecInfo<TRANSFORMS, seq_exec, omp_parallel_for_exec>
+               ,OMPExecInfo<TRANSFORMS, loop_exec, omp_for_nowait_exec>
 #endif
 #if defined(RAJA_ENABLE_TBB)
-               //,
-               ,ExecInfo<TRANSFORMS, seq_exec, tbb_for_exec>
-               //,ExecInfo<TRANSFORMS, simd_exec, tbb_for_exec>
+               ,ExecInfo<TRANSFORMS, loop_exec, tbb_for_exec>
 #endif
                >;
 
+
 using InstPolicies =
     ForTesting<tt::apply_t<POLICIES, tt::apply_t<TRANSFORMS, PERMS>>>;
 
@@ -147,7 +117,8 @@ TYPED_TEST_P(NestedTest, Nested2DTest)
 
     std::vector<Index_type> v(size_i * size_j, 1);
     View view(v.data(),
-              make_permuted_layout({{size_i, size_j}}, POL::PERM::value));
+              make_permuted_layout({{size_i, size_j}},
+                                   RAJA::as_array<typename POL::PERM>::get()));
 
     forallN<Policy>(RangeSegment(1, size_i),
                     RangeSegment(0, size_j),
@@ -267,15 +238,12 @@ struct PolLTimesF_TBB : PolLTimesCommon {
   using ELL_PERM = PERM_IJ;
 };
 
-// Same as D, but with tiling on zones and TBB 2D blocked range on groups and zones
+// Parallel on zones, loop nesting: Zones, Groups, Moments, Directions
 struct PolLTimesG_TBB : PolLTimesCommon {
   // Loops: Moments, Directions, Groups, Zones
-  using EXEC = NestedPolicy<
-      ExecList<seq_exec, seq_exec, tbb_for_exec, tbb_for_exec>,
-      Tile<TileList<tile_none, tile_none, tile_none, tile_fixed<16>>,
-           Permute<PERM_LKIJ,
-                   Execute  // implicit
-                   >>>;
+  using EXEC =
+      NestedPolicy<ExecList<seq_exec, seq_exec, seq_exec, tbb_for_dynamic>,
+                   Permute<PERM_LKIJ>>;
   using PSI_PERM = PERM_KJI;
   using PHI_PERM = PERM_KJI;
   using ELL_PERM = PERM_IJ;
@@ -339,18 +307,18 @@ TYPED_TEST_P(LTimesTest, LTimesNestedTest)
     }
 
     // create views on data
-    typename POL::ELL_VIEW ell(&ell_data[0],
-                               make_permuted_layout({num_moments,
-                                                     num_directions},
-                                                    POL::ELL_PERM::value));
+    typename POL::ELL_VIEW ell(
+        &ell_data[0],
+        make_permuted_layout({num_moments, num_directions},
+                             RAJA::as_array<typename POL::ELL_PERM>::get()));
     typename POL::PSI_VIEW psi(
         &psi_data[0],
         make_permuted_layout({num_directions, num_groups, num_zones},
-                             POL::PSI_PERM::value));
+                             RAJA::as_array<typename POL::PSI_PERM>::get()));
     typename POL::PHI_VIEW phi(
         &phi_data[0],
         make_permuted_layout({num_moments, num_groups, num_zones},
-                             POL::PHI_PERM::value));
+                             RAJA::as_array<typename POL::PHI_PERM>::get()));
 
     // get execution policy
     using EXEC = typename POL::EXEC;
diff --git a/test/unit/cpu/test-reduce.cpp b/test/unit/cpu/test-reduce.cpp
index 59f32e0d55..4a5c42ac33 100644
--- a/test/unit/cpu/test-reduce.cpp
+++ b/test/unit/cpu/test-reduce.cpp
@@ -1,6 +1,5 @@
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -10,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -66,17 +38,16 @@ using namespace RAJA;
 
 using TestingTypes = ::testing::
     Types<
-  std::tuple<ExecPolicy<seq_segit, seq_exec>, seq_reduce> 
-  //std::tuple<ExecPolicy<seq_segit, simd_exec>, seq_reduce> //code breaks if we use simd with minloc test)
+  std::tuple<ExecPolicy<seq_segit, seq_exec>, seq_reduce>,
+  std::tuple<ExecPolicy<seq_segit, loop_exec>, loop_reduce>
 #ifdef RAJA_ENABLE_OPENMP
-  //        ,
-  //std::tuple<ExecPolicy<omp_parallel_for_segit, simd_exec>, omp_reduce>,
-  //std::tuple<ExecPolicy<omp_parallel_for_segit, simd_exec>,
-  //omp_reduce_ordered>
+  // OpenMP variants (compiled only when RAJA_ENABLE_OPENMP is defined)
+  ,std::tuple<ExecPolicy<omp_parallel_for_segit, loop_exec>, omp_reduce>
+  ,std::tuple<ExecPolicy<omp_parallel_for_segit, loop_exec>, omp_reduce_ordered>
 #endif
 #ifdef RAJA_ENABLE_TBB
           ,std::tuple<ExecPolicy<seq_segit, tbb_for_exec>, tbb_reduce>
-        //,std::tuple<ExecPolicy<tbb_for_exec, simd_exec>, tbb_reduce>
+          ,std::tuple<ExecPolicy<tbb_for_exec, loop_exec>, tbb_reduce>
 #endif
         >;
 
diff --git a/test/unit/cpu/test-reductions.cpp b/test/unit/cpu/test-reductions.cpp
index 10296ddba1..d57eda1c16 100644
--- a/test/unit/cpu/test-reductions.cpp
+++ b/test/unit/cpu/test-reductions.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -257,7 +230,7 @@ REGISTER_TYPED_TEST_CASE_P(ReductionCorrectnessTest,
 
 using types = ::testing::Types<
     std::tuple<RAJA::seq_exec, RAJA::seq_reduce>,
-    std::tuple<RAJA::simd_exec, RAJA::seq_reduce>
+    std::tuple<RAJA::loop_exec, RAJA::seq_reduce>
 #if defined(RAJA_ENABLE_OPENMP)
     ,
     std::tuple<RAJA::omp_parallel_for_exec, RAJA::omp_reduce>,
diff --git a/test/unit/cpu/test-scan.cpp b/test/unit/cpu/test-scan.cpp
index 2b6a283daf..bce888e54d 100644
--- a/test/unit/cpu/test-scan.cpp
+++ b/test/unit/cpu/test-scan.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/cpu/test-segments.cpp b/test/unit/cpu/test-segments.cpp
index 4a15b59690..ff8a54ae79 100644
--- a/test/unit/cpu/test-segments.cpp
+++ b/test/unit/cpu/test-segments.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/cuda/CMakeLists.txt b/test/unit/cuda/CMakeLists.txt
index ae130f32b4..b0aed3eb6b 100644
--- a/test/unit/cuda/CMakeLists.txt
+++ b/test/unit/cuda/CMakeLists.txt
@@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -10,34 +10,7 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
@@ -45,7 +18,7 @@ raja_add_test(
   NAME test-cuda-nested
   SOURCES test-nested.cpp)
 
-if (NOT RAJA_ENABLE_CLANG_CUDA)
+if (NOT ENABLE_CLANG_CUDA)
   raja_add_test(
     NAME test-cuda-scan
     SOURCES test-scan.cpp)
@@ -61,7 +34,7 @@ raja_add_test(
 
 raja_add_test(
   NAME test-cuda-reduce-loc
-  SOURCES test-reduce-minloc.cpp)
+  SOURCES test-reduce-loc.cpp)
 
 raja_add_test(
   NAME test-cuda-reduce-max
diff --git a/test/unit/cuda/test-forall.cpp b/test/unit/cuda/test-forall.cpp
index f696d2d221..19a615d6df 100644
--- a/test/unit/cuda/test-forall.cpp
+++ b/test/unit/cuda/test-forall.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/cuda/test-nested-strided.cpp b/test/unit/cuda/test-nested-strided.cpp
index 693ad58991..94dde9e9a6 100644
--- a/test/unit/cuda/test-nested-strided.cpp
+++ b/test/unit/cuda/test-nested-strided.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -56,6 +29,7 @@
 
 #include <gtest/gtest.h>
 #include <RAJA/RAJA.hpp>
+#include "RAJA_gtest.hpp"
 
 static const int x = 500, y = 300, z = 70;
 
@@ -70,14 +44,14 @@ static void stride_test(int stride, bool reverse = false)
   RangeStrideSegment seg_x( reverse ? x-1     : 0,
                             reverse ? -1      : x,
                             reverse ? -stride : stride);
-                            
+
   RangeStrideSegment seg_y( reverse ? y-1     : 0,
                             reverse ? -1      : y,
                             reverse ? -stride : stride);
-                            
+
   RangeStrideSegment seg_z( reverse ? z-1     : 0,
                             reverse ? -1      : z,
-                            reverse ? -stride : stride);                            
+                            reverse ? -stride : stride);
 
   forallN<NestedPolicy<ExecList<seq_exec,
                                 cuda_block_x_exec,
@@ -90,14 +64,14 @@ static void stride_test(int stride, bool reverse = false)
                                              arr[val] = val;
                                            });
   cudaDeviceSynchronize();
-  
-  
+
+
   for (Index_type i : RangeSegment(0,x)) {
     for (Index_type j : RangeSegment(0,y)) {
       for (Index_type k : RangeSegment(0,z)) {
-      
+
         Index_type val = (i*y*z) + (j*z) + k;
-        
+
         // Determine if this i,j,k was in the iteration space
         bool inclusive;
         if(reverse){
@@ -106,10 +80,10 @@ static void stride_test(int stride, bool reverse = false)
         else{
           inclusive = (i%stride==0) && (j%stride==0) && (k%stride==0);
         }
-        
+
         // Determine expected value
-        int expected_value = inclusive ? val : 0;        
-        
+        int expected_value = inclusive ? val : 0;
+
         ASSERT_EQ(expected_value, arr[val]);
       }
     }
@@ -118,43 +92,43 @@ static void stride_test(int stride, bool reverse = false)
 }
 
 
-TEST(forallN, rangeStrides1)
+CUDA_TEST(forallN, rangeStrides1)
 {
   stride_test(1, false);
 }
 
-TEST(forallN, rangeStrides2)
+CUDA_TEST(forallN, rangeStrides2)
 {
   stride_test(2, false);
 }
 
-TEST(forallN, rangeStrides3)
+CUDA_TEST(forallN, rangeStrides3)
 {
   stride_test(3, false);
 }
 
-TEST(forallN, rangeStrides4)
+CUDA_TEST(forallN, rangeStrides4)
 {
   stride_test(4, false);
 }
 
 
-TEST(forallN, rangeStrides1_reverse)
+CUDA_TEST(forallN, rangeStrides1_reverse)
 {
   stride_test(1, true);
 }
 
-TEST(forallN, rangeStrides2_reverse)
+CUDA_TEST(forallN, rangeStrides2_reverse)
 {
   stride_test(2, true);
 }
 
-TEST(forallN, rangeStrides3_reverse)
+CUDA_TEST(forallN, rangeStrides3_reverse)
 {
   stride_test(3, true);
 }
 
-TEST(forallN, rangeStrides4_reverse)
+CUDA_TEST(forallN, rangeStrides4_reverse)
 {
   stride_test(4, true);
 }
diff --git a/test/unit/cuda/test-nested.cpp b/test/unit/cuda/test-nested.cpp
index a6aff174cc..438823ddac 100644
--- a/test/unit/cuda/test-nested.cpp
+++ b/test/unit/cuda/test-nested.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -139,17 +112,18 @@ static void runLTimesTest(Index_type num_moments,
              cudaMemcpyHostToDevice);
 
   // create views on data
-  typename POL::ELL_VIEW ell(d_ell,
-                             make_permuted_layout({num_moments, num_directions},
-                                                  POL::ELL_PERM::value));
+  typename POL::ELL_VIEW ell(
+      d_ell,
+      make_permuted_layout({num_moments, num_directions},
+                           as_array<typename POL::ELL_PERM>::get()));
   typename POL::PSI_VIEW psi(
       d_psi,
       make_permuted_layout({num_directions, num_groups, num_zones},
-                           POL::PSI_PERM::value));
+                           as_array<typename POL::PSI_PERM>::get()));
   typename POL::PHI_VIEW phi(
       d_phi,
       make_permuted_layout({num_moments, num_groups, num_zones},
-                           POL::PHI_PERM::value));
+                           as_array<typename POL::PHI_PERM>::get()));
 
   // get execution policy
   using EXEC = typename POL::EXEC;
@@ -247,11 +221,9 @@ struct PolLTimesA_GPU {
 // Use thread and block mappings
 struct PolLTimesB_GPU {
   // Loops: Moments, Directions, Groups, Zones
-  typedef NestedPolicy<ExecList<seq_exec,
-                                seq_exec,
-                                cuda_thread_z_exec,
-                                cuda_block_y_exec>,
-                       Permute<PERM_IJKL>>
+  typedef NestedPolicy<
+      ExecList<seq_exec, seq_exec, cuda_thread_z_exec, cuda_block_y_exec>,
+      Permute<PERM_IJKL>>
       EXEC;
 
   // psi[direction, group, zone]
@@ -343,7 +315,7 @@ class NestedCUDA : public ::testing::Test
 
 TYPED_TEST_CASE_P(NestedCUDA);
 
-TYPED_TEST_P(NestedCUDA, LTimes)
+CUDA_TYPED_TEST_P(NestedCUDA, LTimes)
 {
   runLTimesTest<TypeParam>(2, 0, 7, 3);
   runLTimesTest<TypeParam>(2, 3, 7, 3);
@@ -367,8 +339,8 @@ CUDA_TEST(NestedCUDA, NegativeRange)
     host_data[i] = i * 1.0;
   }
 
-  forallN<NestedPolicy<ExecList<cuda_threadblock_y_exec<16>,
-                                cuda_threadblock_x_exec<16>>>>(
+  forallN<NestedPolicy<
+      ExecList<cuda_threadblock_y_exec<16>, cuda_threadblock_x_exec<16>>>>(
       RangeSegment(-2, 8), RangeSegment(-2, 8), [=] RAJA_DEVICE(int k, int j) {
         const int idx = ((k - -2) * 10) + (j - -2);
         data[idx] = idx * 1.0;
diff --git a/test/unit/cuda/test-reduce-minloc.cpp b/test/unit/cuda/test-reduce-loc.cpp
similarity index 65%
rename from test/unit/cuda/test-reduce-minloc.cpp
rename to test/unit/cuda/test-reduce-loc.cpp
index 6cdf04d2ab..5fa6e20280 100644
--- a/test/unit/cuda/test-reduce-minloc.cpp
+++ b/test/unit/cuda/test-reduce-loc.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -79,9 +52,19 @@ template <typename T, typename U>
 struct reduce_applier<ReduceMinLoc<T, U>> {
   static U def() { return DBL_MAX; }
   static U big() { return -500.0; }
+  template <bool B>
+  static void updatedvalue(U* dvalue,
+                                reduce::detail::ValueLoc<U, B>& randval,
+                                reduce::detail::ValueLoc<U, B>& dcurrent)
+  {
+    if (dvalue[randval.loc] > randval.val) {
+      dvalue[randval.loc] = randval.val;
+      apply(dcurrent, randval);
+    }
+  }
   RAJA_DEVICE static void apply(ReduceMinLoc<T, U> const& r,
-                                     U const& val,
-                                     Index_type i)
+                                U const& val,
+                                Index_type i)
   {
     r.minloc(val, i);
   }
@@ -92,8 +75,8 @@ struct reduce_applier<ReduceMinLoc<T, U>> {
     l = l > r ? r : l;
   }
   template <bool B>
-  RAJA_HOST_DEVICE static void cmp(ReduceMinLoc<T, U>& l,
-                                   reduce::detail::ValueLoc<U, B> const& r)
+  static void cmp(ReduceMinLoc<T, U>& l,
+                  reduce::detail::ValueLoc<U, B> const& r)
   {
     ASSERT_FLOAT_EQ(r.val, l.get());
     ASSERT_EQ(r.loc, l.getLoc());
@@ -103,9 +86,19 @@ template <typename T, typename U>
 struct reduce_applier<ReduceMaxLoc<T, U>> {
   static U def() { return -DBL_MAX; }
   static U big() { return 500.0; }
+  template <bool B>
+  static void updatedvalue(U* dvalue,
+                                reduce::detail::ValueLoc<U, B>& randval,
+                                reduce::detail::ValueLoc<U, B>& dcurrent)
+  {
+    if (randval.val > dvalue[randval.loc]) {
+      dvalue[randval.loc] = randval.val;
+      apply(dcurrent, randval);
+    }
+  }
   RAJA_DEVICE static void apply(ReduceMaxLoc<T, U> const& r,
-                                     U const& val,
-                                     Index_type i)
+                                U const& val,
+                                Index_type i)
   {
     r.maxloc(val, i);
   }
@@ -116,7 +109,7 @@ struct reduce_applier<ReduceMaxLoc<T, U>> {
     l = l > r ? l : r;
   }
   template <bool B>
-  RAJA_HOST_DEVICE static void cmp(ReduceMaxLoc<T, U>& l,
+  static void cmp(ReduceMaxLoc<T, U>& l,
                                    reduce::detail::ValueLoc<U, B> const& r)
   {
     ASSERT_FLOAT_EQ(r.val, l.get());
@@ -155,7 +148,7 @@ CUDA_TYPED_TEST_P(ReduceCUDA, generic)
   double* dvalue = reducer::dvalue;
   reset(dvalue, TEST_VEC_LEN, applier::def());
 
-  reduce::detail::ValueLoc<double> dcurrentMin(applier::def(), -1);
+  reduce::detail::ValueLoc<double> dcurrent(applier::def(), -1);
 
   for (int tcount = 0; tcount < test_repeat; ++tcount) {
 
@@ -169,9 +162,8 @@ CUDA_TYPED_TEST_P(ReduceCUDA, generic)
 
       double droll = dist(mt);
       int index = int(dist2(mt));
-      reduce::detail::ValueLoc<double> lmin{droll, index};
-      dvalue[index] = droll;
-      applier::apply(dcurrentMin, lmin);
+      reduce::detail::ValueLoc<double> randval(droll, index);
+      applier::updatedvalue(dvalue, randval, dcurrent);
 
       forall<cuda_exec<block_size>>(0, TEST_VEC_LEN, [=] __device__(int i) {
         applier::apply(dmin0, dvalue[i], i);
@@ -179,10 +171,10 @@ CUDA_TYPED_TEST_P(ReduceCUDA, generic)
         applier::apply(dmin2, dvalue[i], i);
       });
 
-      applier::cmp(dmin0, dcurrentMin);
+      applier::cmp(dmin0, dcurrent);
 
-      ASSERT_FLOAT_EQ(dcurrentMin.val * 2, dmin1.get());
-      ASSERT_EQ(dcurrentMin.getLoc(), dmin1.getLoc());
+      ASSERT_FLOAT_EQ(dcurrent.val * 2, dmin1.get());
+      ASSERT_EQ(dcurrent.getLoc(), dmin1.getLoc());
       ASSERT_FLOAT_EQ(applier::big(), dmin2.get());
     }
   }
@@ -203,12 +195,12 @@ CUDA_TYPED_TEST_P(ReduceCUDA, indexset_align)
 
   reset(dvalue, TEST_VEC_LEN, applier::def());
 
-  reduce::detail::ValueLoc<double> dcurrentMin(applier::def(), -1);
+  reduce::detail::ValueLoc<double> dcurrent(applier::def(), -1);
 
   for (int tcount = 0; tcount < test_repeat; ++tcount) {
 
     RangeSegment seg0(0, TEST_VEC_LEN / 2);
-    RangeSegment seg1(TEST_VEC_LEN / 2 + 1, TEST_VEC_LEN);
+    RangeSegment seg1(TEST_VEC_LEN / 2, TEST_VEC_LEN);
 
     IndexSet iset;
     iset.push_back(seg0);
@@ -217,13 +209,10 @@ CUDA_TYPED_TEST_P(ReduceCUDA, indexset_align)
     TypeParam dmin0(applier::def(), -1);
     TypeParam dmin1(applier::def(), -1);
 
-    int index = int(dist2(mt));
-
     double droll = dist(mt);
-    dvalue[index] = droll;
-    reduce::detail::ValueLoc<double> lmin{droll, index};
-    dvalue[index] = droll;
-    applier::apply(dcurrentMin, lmin);
+    int index = int(dist2(mt));
+    reduce::detail::ValueLoc<double> randval(droll, index);
+    applier::updatedvalue(dvalue, randval, dcurrent);
 
     forall<ExecPolicy<seq_segit, cuda_exec<block_size>>>(
         iset, [=] __device__(int i) {
@@ -231,10 +220,10 @@ CUDA_TYPED_TEST_P(ReduceCUDA, indexset_align)
           applier::apply(dmin1, 2 * dvalue[i], i);
         });
 
-    ASSERT_FLOAT_EQ(double(dcurrentMin), double(dmin0));
-    ASSERT_FLOAT_EQ(2 * double(dcurrentMin), double(dmin1));
-    ASSERT_EQ(dcurrentMin.getLoc(), dmin0.getLoc());
-    ASSERT_EQ(dcurrentMin.getLoc(), dmin1.getLoc());
+    ASSERT_FLOAT_EQ(double(dcurrent), double(dmin0));
+    ASSERT_FLOAT_EQ(2 * double(dcurrent), double(dmin1));
+    ASSERT_EQ(dcurrent.getLoc(), dmin0.getLoc());
+    ASSERT_EQ(dcurrent.getLoc(), dmin1.getLoc());
   }
 }
 
@@ -267,7 +256,7 @@ CUDA_TYPED_TEST_P(ReduceCUDA, indexset_noalign)
 
     reset(dvalue, TEST_VEC_LEN, applier::def());
 
-    reduce::detail::ValueLoc<double> dcurrentMin(applier::def(), -1);
+    reduce::detail::ValueLoc<double> dcurrent(applier::def(), -1);
 
     TypeParam dmin0(applier::def(), -1);
     TypeParam dmin1(applier::def(), -1);
@@ -279,11 +268,8 @@ CUDA_TYPED_TEST_P(ReduceCUDA, indexset_noalign)
     if (tcount % 4 == 0) index = 3457;  // seg 3
 
     double droll = dist(mt);
-    dvalue[index] = droll;
-
-    reduce::detail::ValueLoc<double> lmin{droll, index};
-    dvalue[index] = droll;
-    applier::apply(dcurrentMin, lmin);
+    reduce::detail::ValueLoc<double> randval(droll, index);
+    applier::updatedvalue(dvalue, randval, dcurrent);
 
     forall<ExecPolicy<seq_segit, cuda_exec<block_size>>>(
         iset, [=] __device__(int i) {
@@ -291,10 +277,10 @@ CUDA_TYPED_TEST_P(ReduceCUDA, indexset_noalign)
           applier::apply(dmin1, 2 * dvalue[i], i);
         });
 
-    ASSERT_FLOAT_EQ(dcurrentMin.val, double(dmin0));
-    ASSERT_FLOAT_EQ(2 * dcurrentMin.val, double(dmin1));
-    ASSERT_EQ(dcurrentMin.getLoc(), dmin0.getLoc());
-    ASSERT_EQ(dcurrentMin.getLoc(), dmin1.getLoc());
+    ASSERT_FLOAT_EQ(dcurrent.val, double(dmin0));
+    ASSERT_FLOAT_EQ(2 * dcurrent.val, double(dmin1));
+    ASSERT_EQ(dcurrent.getLoc(), dmin0.getLoc());
+    ASSERT_EQ(dcurrent.getLoc(), dmin1.getLoc());
   }
 }
 
diff --git a/test/unit/cuda/test-reduce-max.cpp b/test/unit/cuda/test-reduce-max.cpp
index 1adf9bdf76..4d27e1b528 100644
--- a/test/unit/cuda/test-reduce-max.cpp
+++ b/test/unit/cuda/test-reduce-max.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -111,9 +84,10 @@ CUDA_TEST_F(ReduceMaxCUDA, generic)
 
       double droll = dist(mt);
       int index = int(dist2(mt));
-      double lmax = droll;
-      dvalue[index] = droll;
-      dcurrentMax = RAJA_MAX(dcurrentMax, lmax);
+      if (droll > dvalue[index]) {
+        dvalue[index] = droll;
+        dcurrentMax = RAJA_MAX(dcurrentMax, droll);
+      }
 
       forall<cuda_exec<block_size> >(0, TEST_VEC_LEN, [=] __device__(int i) {
         dmax0.max(dvalue[i]);
@@ -147,7 +121,7 @@ CUDA_TEST_F(ReduceMaxCUDA, indexset_align)
   for (int tcount = 0; tcount < test_repeat; ++tcount) {
 
     RangeSegment seg0(0, TEST_VEC_LEN / 2);
-    RangeSegment seg1(TEST_VEC_LEN / 2 + 1, TEST_VEC_LEN);
+    RangeSegment seg1(TEST_VEC_LEN / 2, TEST_VEC_LEN);
 
     IndexSet iset;
     iset.push_back(seg0);
@@ -156,13 +130,13 @@ CUDA_TEST_F(ReduceMaxCUDA, indexset_align)
     ReduceMax<cuda_reduce<block_size>, double> dmax0(DEFAULT_VAL);
     ReduceMax<cuda_reduce<block_size>, double> dmax1(DEFAULT_VAL);
 
-    int index = int(dist2(mt));
 
     double droll = dist(mt);
-    dvalue[index] = droll;
-    double lmax = droll;
-    dvalue[index] = droll;
-    dcurrentMax = RAJA_MAX(dcurrentMax, lmax);
+    int index = int(dist2(mt));
+    if (droll > dvalue[index]) {
+      dvalue[index] = droll;
+      dcurrentMax = RAJA_MAX(dcurrentMax, droll);
+    }
 
     forall<ExecPolicy<seq_segit, cuda_exec<block_size> > >(
         iset, [=] __device__(int i) {
@@ -215,11 +189,10 @@ CUDA_TEST_F(ReduceMaxCUDA, indexset_noalign)
     if (tcount % 4 == 0) index = 29457;  // seg 3
 
     double droll = dist(mt);
-    dvalue[index] = droll;
-
-    double lmax = droll;
-    dvalue[index] = droll;
-    dcurrentMax = RAJA_MAX(dcurrentMax, lmax);
+    if (droll > dvalue[index]) {
+      dvalue[index] = droll;
+      dcurrentMax = RAJA_MAX(dcurrentMax, droll);
+    }
 
     forall<ExecPolicy<seq_segit, cuda_exec<block_size> > >(
         iset, [=] __device__(int i) {
diff --git a/test/unit/cuda/test-reduce-min.cpp b/test/unit/cuda/test-reduce-min.cpp
index cc1fc17d06..43e9d9981a 100644
--- a/test/unit/cuda/test-reduce-min.cpp
+++ b/test/unit/cuda/test-reduce-min.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -111,9 +84,10 @@ CUDA_TEST_F(ReduceMinCUDA, generic)
 
       double droll = dist(mt);
       int index = int(dist2(mt));
-      double lmin = droll;
-      dvalue[index] = droll;
-      dcurrentMin = RAJA_MIN(dcurrentMin, lmin);
+      if (dvalue[index] > droll) {
+        dvalue[index] = droll;
+        dcurrentMin = RAJA_MIN(dcurrentMin, droll);
+      }
 
       forall<cuda_exec<block_size> >(0, TEST_VEC_LEN, [=] __device__(int i) {
         dmin0.min(dvalue[i]);
@@ -147,7 +121,7 @@ CUDA_TEST_F(ReduceMinCUDA, indexset_align)
   for (int tcount = 0; tcount < test_repeat; ++tcount) {
 
     RangeSegment seg0(0, TEST_VEC_LEN / 2);
-    RangeSegment seg1(TEST_VEC_LEN / 2 + 1, TEST_VEC_LEN);
+    RangeSegment seg1(TEST_VEC_LEN / 2, TEST_VEC_LEN);
 
     IndexSet iset;
     iset.push_back(seg0);
@@ -156,13 +130,12 @@ CUDA_TEST_F(ReduceMinCUDA, indexset_align)
     ReduceMin<cuda_reduce<block_size>, double> dmin0(DEFAULT_VAL);
     ReduceMin<cuda_reduce<block_size>, double> dmin1(DEFAULT_VAL);
 
-    int index = int(dist2(mt));
-
     double droll = dist(mt);
-    dvalue[index] = droll;
-    double lmin = droll;
-    dvalue[index] = droll;
-    dcurrentMin = RAJA_MIN(dcurrentMin, lmin);
+    int index = int(dist2(mt));
+    if (dvalue[index] > droll) {
+      dvalue[index] = droll;
+      dcurrentMin = RAJA_MIN(dcurrentMin, droll);
+    }
 
     forall<ExecPolicy<seq_segit, cuda_exec<block_size> > >(
         iset, [=] __device__(int i) {
@@ -215,11 +188,10 @@ CUDA_TEST_F(ReduceMinCUDA, indexset_noalign)
     if (tcount % 4 == 0) index = 29457;  // seg 3
 
     double droll = dist(mt);
-    dvalue[index] = droll;
-
-    double lmin = droll;
-    dvalue[index] = droll;
-    dcurrentMin = RAJA_MIN(dcurrentMin, lmin);
+    if (dvalue[index] > droll) {
+      dvalue[index] = droll;
+      dcurrentMin = RAJA_MIN(dcurrentMin, droll);
+    }
 
     forall<ExecPolicy<seq_segit, cuda_exec<block_size> > >(
         iset, [=] __device__(int i) {
diff --git a/test/unit/cuda/test-reduce-sum.cpp b/test/unit/cuda/test-reduce-sum.cpp
index 74637ce1e9..7382205396 100644
--- a/test/unit/cuda/test-reduce-sum.cpp
+++ b/test/unit/cuda/test-reduce-sum.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -153,7 +126,7 @@ CUDA_TEST_F(ReduceSumCUDA, indexset_aligned)
   double* dvalue = ReduceSumCUDA::dvalue;
   int* ivalue = ReduceSumCUDA::ivalue;
 
-    RangeSegment seg0(0, TEST_VEC_LEN / 2);
+  RangeSegment seg0(0, TEST_VEC_LEN / 2);
   RangeSegment seg1(TEST_VEC_LEN / 2, TEST_VEC_LEN);
 
   IndexSet iset;
@@ -269,3 +242,23 @@ CUDA_TEST_F(ReduceSumCUDA, atomic_reduce)
     ASSERT_FLOAT_EQ(dsumP.get(), pos_chk_val);
   }
 }
+
+// For each length that is a multiple of block_size (up to TEST_VEC_LEN), sum
+// dvalue[0, len) on the device and compare with dinit_val * len + seed.
+CUDA_TEST_F(ReduceSumCUDA, increasing_size)
+{
+  double* const values = ReduceSumCUDA::dvalue;
+  const double seed = 5.0;  // initial value handed to every reducer
+
+  for (int len = block_size; len <= TEST_VEC_LEN; len += block_size) {
+    ReduceSum<cuda_reduce<block_size, true>, double> total(seed);
+
+    forall<cuda_exec<block_size, true> >(0, len, [=] __device__(int idx) {
+      total += values[idx];
+    });
+
+    // Assumes the fixture stored dinit_val in every element -- confirm.
+    const double expected = dinit_val * double(len);
+    ASSERT_FLOAT_EQ(expected + seed, total.get());
+  }
+}
diff --git a/test/unit/cuda/test-scan.cpp b/test/unit/cuda/test-scan.cpp
index 7593005e6d..1d52635a61 100644
--- a/test/unit/cuda/test-scan.cpp
+++ b/test/unit/cuda/test-scan.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -133,7 +106,7 @@ ::testing::AssertionResult check_exclusive(const T* actual,
   return ::testing::AssertionSuccess();
 }
 
-TYPED_TEST_P(ScanCUDA, inclusive)
+CUDA_TYPED_TEST_P(ScanCUDA, inclusive)
 {
   using T = typename Info<TypeParam>::data_type;
   using Function = typename Info<TypeParam>::function;
@@ -151,7 +124,7 @@ TYPED_TEST_P(ScanCUDA, inclusive)
   cudaFree(out);
 }
 
-TYPED_TEST_P(ScanCUDA, inclusive_inplace)
+CUDA_TYPED_TEST_P(ScanCUDA, inclusive_inplace)
 {
   using T = typename Info<TypeParam>::data_type;
   using Function = typename Info<TypeParam>::function;
@@ -169,7 +142,7 @@ TYPED_TEST_P(ScanCUDA, inclusive_inplace)
   cudaFree(data);
 }
 
-TYPED_TEST_P(ScanCUDA, exclusive)
+CUDA_TYPED_TEST_P(ScanCUDA, exclusive)
 {
   using T = typename Info<TypeParam>::data_type;
   using Function = typename Info<TypeParam>::function;
@@ -187,7 +160,7 @@ TYPED_TEST_P(ScanCUDA, exclusive)
   cudaFree(out);
 }
 
-TYPED_TEST_P(ScanCUDA, exclusive_inplace)
+CUDA_TYPED_TEST_P(ScanCUDA, exclusive_inplace)
 {
   using T = typename Info<TypeParam>::data_type;
   using Function = typename Info<TypeParam>::function;
@@ -205,7 +178,7 @@ TYPED_TEST_P(ScanCUDA, exclusive_inplace)
   cudaFree(data);
 }
 
-TYPED_TEST_P(ScanCUDA, exclusive_offset)
+CUDA_TYPED_TEST_P(ScanCUDA, exclusive_offset)
 {
   using T = typename Info<TypeParam>::data_type;
   using Function = typename Info<TypeParam>::function;
@@ -224,7 +197,7 @@ TYPED_TEST_P(ScanCUDA, exclusive_offset)
   cudaFree(out);
 }
 
-TYPED_TEST_P(ScanCUDA, exclusive_inplace_offset)
+CUDA_TYPED_TEST_P(ScanCUDA, exclusive_inplace_offset)
 {
   using T = typename Info<TypeParam>::data_type;
   using Function = typename Info<TypeParam>::function;
diff --git a/test/unit/nested.cpp b/test/unit/nested.cpp
new file mode 100644
index 0000000000..f3168aa4af
--- /dev/null
+++ b/test/unit/nested.cpp
@@ -0,0 +1,176 @@
+#include "RAJA/RAJA.hpp"
+#include "RAJA_gtest.hpp"
+
+#include <cstdio>
+
+#if defined(RAJA_ENABLE_CUDA)
+#include <cuda_runtime.h>
+#endif
+
+using RAJA::Index_type;
+using RAJA::View;
+using RAJA::Layout;
+using layout_2d = Layout<2, RAJA::Index_type>;
+using view_2d = View<Index_type, layout_2d>;
+static constexpr Index_type x_len = 5;
+static constexpr Index_type y_len = 5;
+
+RAJA_INDEX_VALUE(TypedIndex, "TypedIndex");
+
+template <typename NestedPolicy>
+class Nested : public ::testing::Test
+{
+protected:
+  Index_type* data = nullptr;  // x_len * y_len elements; null until SetUp runs
+  view_2d view{nullptr, x_len, y_len};
+  // Allocate the buffer (CUDA managed memory when enabled) and bind the view.
+  virtual void SetUp()
+  {
+#if defined(RAJA_ENABLE_CUDA)
+    const cudaError_t err = cudaMallocManaged(
+        &data, sizeof(Index_type) * x_len * y_len, cudaMemAttachGlobal);
+    ASSERT_EQ(cudaSuccess, err);  // fail here rather than use a bad buffer
+#else
+    data = new Index_type[x_len * y_len];
+#endif
+    view.set_data(data);
+  }
+  // Release the buffer with the deallocator that matches SetUp.
+  virtual void TearDown()
+  {
+#if defined(RAJA_ENABLE_CUDA)
+    cudaFree(data);
+#else
+    delete[] data;
+#endif
+  }
+};
+TYPED_TEST_CASE_P(Nested);
+
+RAJA_HOST_DEVICE constexpr Index_type get_val(Index_type v) noexcept
+{
+  return v;  // plain Index_type: already the raw value
+}
+template <typename T>
+RAJA_HOST_DEVICE constexpr Index_type get_val(T v) noexcept
+{
+  return *v;  // any other type (e.g. TypedIndex): unwrap through operator*
+}
+CUDA_TYPED_TEST_P(Nested, Basic)  // fill view + reducer, then verify on host
+{
+  using camp::at_v;  // TypeParam is list<policy, index types, reduce policy>
+  using Pol = at_v<TypeParam, 0>;         // [0] nested execution policy
+  using IndexTypes = at_v<TypeParam, 1>;  // [1] index types of the lambda
+  using Idx0 = at_v<IndexTypes, 0>;
+  using Idx1 = at_v<IndexTypes, 1>;
+  RAJA::ReduceSum<at_v<TypeParam, 2>, RAJA::Real_type> tsum(0.0);
+  RAJA::Real_type total{0.0};  // reference sum accumulated on the host
+  auto ranges = camp::make_tuple(RAJA::RangeSegment(0, x_len),
+                                 RAJA::RangeSegment(0, y_len));
+  auto v = this->view;  // local copy that the [=] lambda can capture
+  using namespace RAJA::nested;
+  RAJA::nested::forall(Pol{}, ranges, [=] RAJA_HOST_DEVICE(Idx0 i, Idx1 j) {
+    // Write i * x_len + j into the view; add i * 1.1 + j to the reducer.
+    v(get_val(i), j) = get_val(i) * x_len + j;
+    tsum += get_val(i) * 1.1 + j;
+  });
+  for (Index_type i = 0; i < x_len; ++i) {
+    for (Index_type j = 0; j < y_len; ++j) {
+      ASSERT_EQ(this->view(i, j), i * x_len + j);
+      total += i * 1.1 + j;
+    }
+  }
+  ASSERT_FLOAT_EQ(total, tsum.get());
+}
+
+REGISTER_TYPED_TEST_CASE_P(Nested, Basic);
+
+using namespace RAJA::nested;
+using camp::list;
+using s = RAJA::seq_exec;  // shorthand used in the policy lists
+using TestTypes =  // entries: list<policy, list<index types>, reduce policy>
+    ::testing::Types<list<Policy<For<1, s>, TypedFor<0, s, TypedIndex>>,
+                          list<TypedIndex, Index_type>,
+                          RAJA::seq_reduce>,
+                     list<Policy<Tile<1, tile_s<2>, RAJA::loop_exec>,
+                                 Tile<0, tile<2>, RAJA::loop_exec>,
+                                 For<0, s>,
+                                 For<1, s>>,
+                          list<Index_type, Index_type>,
+                          RAJA::seq_reduce>,
+                     list<Policy<Collapse<s, For<0>, For<1>>>,
+                          list<Index_type, Index_type>,
+                          RAJA::seq_reduce>>;
+// Instantiate the registered Nested tests for each entry ("Sequential" prefix).
+INSTANTIATE_TYPED_TEST_CASE_P(Sequential, Nested, TestTypes);
+
+#if defined(RAJA_ENABLE_OPENMP)
+using OMPTypes = ::testing::Types<  // list<policy, index types, reduce policy>
+    list<
+        Policy<For<1, RAJA::omp_parallel_for_exec>, TypedFor<0, s, TypedIndex>>,
+        list<TypedIndex, Index_type>,
+        RAJA::omp_reduce>,
+    list<Policy<Tile<1, tile_s<2>, RAJA::omp_parallel_for_exec>,
+                For<1, RAJA::loop_exec>,
+                TypedFor<0, s, TypedIndex>>,
+         list<TypedIndex, Index_type>,
+         RAJA::omp_reduce>>;
+INSTANTIATE_TYPED_TEST_CASE_P(OpenMP, Nested, OMPTypes);
+#endif  // RAJA_ENABLE_OPENMP
+#if defined(RAJA_ENABLE_TBB)
+using TBBTypes = ::testing::Types<
+    list<Policy<For<1, RAJA::tbb_for_exec>, TypedFor<0, s, TypedIndex>>,
+         list<TypedIndex, Index_type>,
+         RAJA::tbb_reduce>>;
+INSTANTIATE_TYPED_TEST_CASE_P(TBB, Nested, TBBTypes);
+#endif  // RAJA_ENABLE_TBB
+#if defined(RAJA_ENABLE_CUDA)
+using CUDATypes = ::testing::Types<
+    list<Policy<For<1, s>, TypedFor<0, RAJA::cuda_exec<128>, TypedIndex>>,
+         list<TypedIndex, Index_type>,
+         RAJA::cuda_reduce<128>>>;
+INSTANTIATE_TYPED_TEST_CASE_P(CUDA, Nested, CUDATypes);
+#endif  // RAJA_ENABLE_CUDA
+
+// Checks the lambda visit order when dimension 1 uses a runtime tile size.
+TEST(Nested, TileDynamic)
+{
+  camp::idx_t count = 0;
+  camp::idx_t length = 5;
+  camp::idx_t tile_size = 3;
+  RAJA::nested::forall(
+      camp::make_tuple(Tile<1, tile<2>, RAJA::seq_exec>{tile_size},
+                       For<0, RAJA::seq_exec>{}, For<1, RAJA::seq_exec>{}),
+      camp::make_tuple(RAJA::RangeSegment(0, length),
+                       RAJA::RangeSegment(0, length)),
+      [=, &count](Index_type i, Index_type j) {
+        std::cerr << "i: " << get_val(i) << " j: " << j << " count: " << count
+                  << std::endl;
+        // Expected: the first length * tile_size visits cover j < tile_size.
+        ASSERT_EQ(count,
+                  count < (length * tile_size)
+                      ? (i * tile_size + j)
+                      : (length * tile_size)
+                            + (i * (length - tile_size) + j - tile_size));
+        count++;
+      });
+}
+
+
+
+#if defined(RAJA_ENABLE_CUDA)
+CUDA_TEST(Nested, CudaCollapse)  // smoke test: prints every (i, j) visited
+{
+  camp::idx_t length = 5;
+  RAJA::nested::forall(
+      camp::make_tuple(RAJA::nested::Collapse<
+                         RAJA::nested::cuda_collapse_exec,
+                         RAJA::nested::For<0, RAJA::cuda_exec<32>>,
+                         RAJA::nested::For<1, RAJA::cuda_exec<32>>>{}),
+      camp::make_tuple(RAJA::RangeSegment(0, length),
+                       RAJA::RangeSegment(0, length)),
+      [=] RAJA_HOST_DEVICE (Index_type i, Index_type j) {
+        printf("(%d, %d)\n", (int)i, (int)j);  // "%d" requires int arguments
+      });
+}
+#endif
diff --git a/test/unit/omp-target/CMakeLists.txt b/test/unit/omp-target/CMakeLists.txt
index 6e24ec3777..444d4df30a 100644
--- a/test/unit/omp-target/CMakeLists.txt
+++ b/test/unit/omp-target/CMakeLists.txt
@@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+# Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 #
 # Produced at the Lawrence Livermore National Laboratory
 #
@@ -10,42 +10,19 @@
 #
 # This file is part of RAJA.
 #
-# For additional details, please also read RAJA/LICENSE.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# For details about use and distribution, please read RAJA/LICENSE.
 #
 ###############################################################################
 
-if(RAJA_ENABLE_TARGET_OPENMP)
+if(ENABLE_TARGET_OPENMP)
+  add_gtest_sources(
+    test-nested-reduce.cpp
+    test-reductions.cpp)
+
   raja_add_test(
     NAME test-omp-target-nested-reduce
     SOURCES test-nested-reduce.cpp)
   raja_add_test(
     NAME test-omp-target-reductions
     SOURCES test-reductions.cpp)
-endif(RAJA_ENABLE_TARGET_OPENMP)
+endif(ENABLE_TARGET_OPENMP)
diff --git a/test/unit/omp-target/test-nested-reduce.cpp b/test/unit/omp-target/test-nested-reduce.cpp
index 8a67993f16..ce3d7cbc8e 100644
--- a/test/unit/omp-target/test-nested-reduce.cpp
+++ b/test/unit/omp-target/test-nested-reduce.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/omp-target/test-reductions.cpp b/test/unit/omp-target/test-reductions.cpp
index bd5b2c22c9..d67ad9e8eb 100644
--- a/test/unit/omp-target/test-reductions.cpp
+++ b/test/unit/omp-target/test-reductions.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/test-atomic.cpp b/test/unit/test-atomic.cpp
index 0152437fe9..6ad655023d 100644
--- a/test/unit/test-atomic.cpp
+++ b/test/unit/test-atomic.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/test-indexvalue.cpp b/test/unit/test-indexvalue.cpp
index 22933af4f9..90efcc0e46 100644
--- a/test/unit/test-indexvalue.cpp
+++ b/test/unit/test-indexvalue.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/test-integral-limits.cpp b/test/unit/test-integral-limits.cpp
index 9cdba24d06..f71c40827d 100644
--- a/test/unit/test-integral-limits.cpp
+++ b/test/unit/test-integral-limits.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/test-iterators.cpp b/test/unit/test-iterators.cpp
index 711866f919..0ca5d3b37a 100644
--- a/test/unit/test-iterators.cpp
+++ b/test/unit/test-iterators.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/test-layout.cpp b/test/unit/test-layout.cpp
index b3f77bc426..283410e388 100644
--- a/test/unit/test-layout.cpp
+++ b/test/unit/test-layout.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -62,7 +35,7 @@ TEST(OffsetLayoutTest, 1D)
    *
    * 10, 11, 12, 13, 14
    */
-  const layout l({10}, std::array<RAJA::Index_type, 1>{14});
+  const layout l({{10}}, {{14}});
 
   /*
    * First element, 10, should have index 0.
@@ -101,17 +74,20 @@ TEST(TypedLayoutTest, 1D)
 TEST(LayoutTest, OffsetVsRegular)
 {
   const auto layout =
-      RAJA::make_permuted_layout({6, 6}, RAJA::Perm<1, 0>::value);
+      RAJA::make_permuted_layout({{6, 6}},
+                                 RAJA::as_array<RAJA::Perm<1, 0>>::get());
   const auto offset =
-      RAJA::make_permuted_offset_layout({0, 0}, {5, 5}, RAJA::PERM_JI::value);
+      RAJA::make_permuted_offset_layout({{0, 0}},
+                                        {{5, 5}},
+                                        RAJA::as_array<RAJA::PERM_JI>::get());
 
   /*
    * OffsetLayout with 0 offset should function like the regular Layout.
    */
   for (int j = 0; j < 6; ++j) {
     for (int i = 0; i < 6; ++i) {
-      ASSERT_EQ(offset(i, j), layout(i, j)) << layout.strides[0]
-                                            << layout.strides[1];
+      ASSERT_EQ(offset(i, j), layout(i, j))
+          << layout.strides[0] << layout.strides[1];
     }
   }
 }
@@ -125,7 +101,7 @@ TEST(OffsetLayoutTest, 2D_IJ)
    * (-1, -1), (0, -1), (1, -1)
    * (-1, -2), (0, -2), (1, -2)
    */
-  const auto layout = RAJA::make_offset_layout<2>({-1, -2}, {1, 0});
+  const auto layout = RAJA::make_offset_layout<2>({{-1, -2}}, {{1, 0}});
 
   /*
    * First element, (-1, -2), should have index 0.
@@ -155,7 +131,9 @@ TEST(OffsetLayoutTest, 2D_JI)
    * (-1, -2), (0, -2), (1, -2)
    */
   const my_layout layout =
-      RAJA::make_permuted_offset_layout({-1, -2}, {1, 0}, RAJA::PERM_JI::value);
+      RAJA::make_permuted_offset_layout({{-1, -2}},
+                                        {{1, 0}},
+                                        RAJA::as_array<RAJA::PERM_JI>::get());
 
   /*
    * First element, (-1, -2), should have index 0.
@@ -180,7 +158,7 @@ TEST(OffsetLayoutTest, View)
   /*
    * View is constructed by passing in the layout.
    */
-  RAJA::View<int, layout> view(data, RAJA::make_offset_layout<1>({1}, {10}));
+  RAJA::View<int, layout> view(data, RAJA::make_offset_layout<1>({{1}}, {{10}}));
 
   for (int i = 0; i < 10; i++) {
     data[i] = i;
@@ -225,21 +203,21 @@ TEST(LayoutTest, 2D_IJ)
   ASSERT_EQ(5, layout(0, 5));
 
   // Check that we get the identity (mod 15)
-  for(int k = 0;k < 20;++ k){
+  for (int k = 0; k < 20; ++k) {
 
     // inverse map
     int i, j;
     layout.toIndices(k, i, j);
 
     // forward map
-    int k2 = layout(i,j);
+    int k2 = layout(i, j);
 
     // check ident
-    ASSERT_EQ(k%15, k2);
+    ASSERT_EQ(k % 15, k2);
 
     // check with a and b
-    ASSERT_EQ(k2, layout_a(i,j));
-    ASSERT_EQ(k2, layout_b(i,j));
+    ASSERT_EQ(k2, layout_a(i, j));
+    ASSERT_EQ(k2, layout_b(i, j));
   }
 }
 
@@ -257,7 +235,7 @@ TEST(LayoutTest, 2D_JI)
    *
    */
   const my_layout layout =
-      RAJA::make_permuted_layout({3, 5}, RAJA::PERM_JI::value);
+      RAJA::make_permuted_layout({{3, 5}}, RAJA::as_array<RAJA::PERM_JI>::get());
 
   ASSERT_EQ(0, layout(0, 0));
 
@@ -268,16 +246,16 @@ TEST(LayoutTest, 2D_JI)
   ASSERT_EQ(15, layout(0, 5));
 
   // Check that we get the identity (mod 15)
-  for(int k = 0;k < 20;++ k){
+  for (int k = 0; k < 20; ++k) {
 
     // inverse map
     int i, j;
     layout.toIndices(k, i, j);
 
     // forward map
-    int k2 = layout(i,j);
+    int k2 = layout(i, j);
 
-    ASSERT_EQ(k%15, k2);
+    ASSERT_EQ(k % 15, k2);
   }
 }
 
@@ -314,17 +292,17 @@ TEST(LayoutTest, 2D_IJ_ProjJ)
   ASSERT_EQ(0, layout(0, 5));
 
   // Check that we get the identity (mod 7)
-  for(int k = 0;k < 20;++ k){
+  for (int k = 0; k < 20; ++k) {
 
     // inverse map
     int i, j;
     layout.toIndices(k, i, j);
 
     // forward map
-    int k2 = layout(i,j);
+    int k2 = layout(i, j);
 
     // check ident
-    ASSERT_EQ(k%7, k2);
+    ASSERT_EQ(k % 7, k2);
 
     // check projection of j
     ASSERT_EQ(j, 0);
@@ -354,7 +332,8 @@ TEST(LayoutTest, 3D_KJI_ProjJ)
   // Construct using variadic "sizes" ctor
   // Zero for J size should correctly produce projective layout
   const my_layout layout =
-      RAJA::make_permuted_layout({3, 0, 7}, RAJA::PERM_KJI::value);
+      RAJA::make_permuted_layout({{3, 0, 7}},
+                                 RAJA::as_array<RAJA::PERM_KJI>::get());
 
   ASSERT_EQ(0, layout(0, 0, 0));
 
@@ -369,17 +348,17 @@ TEST(LayoutTest, 3D_KJI_ProjJ)
   ASSERT_EQ(12, layout(0, 0, 4));
 
   // Check that we get the identity (mod 21)
-  for(int x = 0;x < 40;++ x){
+  for (int x = 0; x < 40; ++x) {
 
     // inverse map
     int i, j, k;
     layout.toIndices(x, i, j, k);
 
     // forward map
-    int x2 = layout(i,j,k);
+    int x2 = layout(i, j, k);
 
     // check ident
-    ASSERT_EQ(x%21, x2);
+    ASSERT_EQ(x % 21, x2);
 
     // check projection of j
     ASSERT_EQ(j, 0);
diff --git a/test/unit/test-multipolicy.cpp b/test/unit/test-multipolicy.cpp
index d6a7afbbd8..78722898eb 100644
--- a/test/unit/test-multipolicy.cpp
+++ b/test/unit/test-multipolicy.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -48,23 +21,22 @@
 #include "gtest/gtest.h"
 
 // Tag type to dispatch to test bodies based on policy selected by multipolicy
-template <int i>
-struct mp_tag {
-};
 
 struct mp_test_body;
-namespace RAJA
-{
-namespace impl
+namespace test_policy
 {
-// fake RAJA::impl::forall overload to test multipolicy dispatch
+template <int i>
+struct mp_tag {
+};
+// fake forall_impl overload to test multipolicy dispatch
 template <int i, typename Iterable>
-void forall(const mp_tag<i> &p, Iterable &&iter, mp_test_body const &body)
+void forall_impl(const mp_tag<i> &p, Iterable &&iter, mp_test_body const &body)
 {
   body(p, iter.size());
 }
 }
-}
+
+using test_policy::mp_tag;
 
 // NOTE: this *must* be after the above to work
 #include "RAJA/RAJA.hpp"
diff --git a/test/unit/test-rajavec.cpp b/test/unit/test-rajavec.cpp
index a07e681b7c..6ef9933b86 100644
--- a/test/unit/test-rajavec.cpp
+++ b/test/unit/test-rajavec.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/test-span.cpp b/test/unit/test-span.cpp
index fc43060275..fd8161fbb7 100644
--- a/test/unit/test-span.cpp
+++ b/test/unit/test-span.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
diff --git a/test/unit/test-timer.cpp b/test/unit/test-timer.cpp
index 258963d077..dbf066130a 100644
--- a/test/unit/test-timer.cpp
+++ b/test/unit/test-timer.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+// Copyright (c) 2016-17, Lawrence Livermore National Security, LLC.
 //
 // Produced at the Lawrence Livermore National Laboratory
 //
@@ -9,34 +9,7 @@
 //
 // This file is part of RAJA.
 //
-// For additional details, please also read RAJA/README.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the disclaimer below.
-//
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the disclaimer (as noted below) in the
-//   documentation and/or other materials provided with the distribution.
-//
-// * Neither the name of the LLNS/LLNL nor the names of its contributors may
-//   be used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-// LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+// For details about use and distribution, please read RAJA/LICENSE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
@@ -58,7 +31,7 @@
 
 TEST(TimerTest, No1)
 {
-  RAJA::Timer timer;
+  auto timer = RAJA::Timer();
 
   timer.start("test_timer");