Skip to content

Commit

Permalink
Add load_unaligned and allow construction from unaligned scalar arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
wbthomason committed Feb 6, 2025
1 parent 064fb7e commit 508f5aa
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 5 deletions.
12 changes: 12 additions & 0 deletions src/impl/vamp/vector/avx.hh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ namespace vamp
return _mm256_load_ps(f);
}

template <unsigned int = 0>
// Unaligned 8 x float (256-bit) load. Unlike load(), `f` does not need to be
// 32-byte aligned; use this for scalar buffers of unknown alignment.
inline static constexpr auto load_unaligned(const ScalarT *const f) noexcept -> VectorT
{
    return _mm256_loadu_ps(f);
}

template <unsigned int = 0>
inline static constexpr auto store(ScalarT *f, VectorT v) noexcept -> void
{
Expand Down Expand Up @@ -408,6 +414,12 @@ namespace vamp

template <unsigned int = 0>
// Aligned 256-bit integer load: `i` must point to 32-byte-aligned storage
// (a misaligned pointer faults with _mm256_load_si256).
inline static constexpr auto load(const ScalarT *const i) noexcept -> VectorT
{
    // reinterpret_cast over a C-style cast: greppable and intent-revealing
    // for the required pointer reinterpretation to the intrinsic's type.
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(i));
}

template <unsigned int = 0>
// Unaligned 256-bit integer load: safe for any alignment of `i`, mirroring
// the float load_unaligned() above.
inline static constexpr auto load_unaligned(const ScalarT *const i) noexcept -> VectorT
{
    // reinterpret_cast over a C-style cast: greppable and intent-revealing
    // for the required pointer reinterpretation to the intrinsic's type.
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(i));
}
Expand Down
33 changes: 28 additions & 5 deletions src/impl/vamp/vector/interface.hh
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ namespace vamp
inline static constexpr std::size_t num_rows = Sig::num_rows;
using DataT = typename Sig::DataT;

inline constexpr auto
to_array() const noexcept -> std::array<typename S::ScalarT, num_scalars_rounded>
inline constexpr auto to_array() const noexcept
-> std::array<typename S::ScalarT, num_scalars_rounded>
{
alignas(S::Alignment) std::array<typename S::ScalarT, num_scalars_rounded> result = {};
to_array(result);
Expand Down Expand Up @@ -718,9 +718,10 @@ namespace vamp
return S::template constant<0>(s);
}

template <bool is_aligned = true>
// Fill this vector's SIMD lanes from a flat scalar buffer. When is_aligned is
// false, the unaligned load path is used so `scalar_data` may have any alignment.
inline constexpr void pack(const typename S::ScalarT *const scalar_data) noexcept
{
    // Expand one load per underlying SIMD register via an index pack.
    constexpr auto lane_indices = std::make_index_sequence<num_vectors>{};
    load_vector<is_aligned>(scalar_data, lane_indices);
}

template <auto fn, std::size_t stride = 1, std::size_t... I>
Expand All @@ -737,13 +738,21 @@ namespace vamp
(..., fn(base + I * stride, std::get<I>(data)));
}

template <std::size_t... I>
template <bool is_aligned, std::size_t... I>
inline constexpr void
load_vector(const typename S::ScalarT *const scalar_array, std::index_sequence<I...>) noexcept
{
// TODO: This might segfault if we had to over-allocate vectors and the scalar data isn't
// full for the over-allocated size
(..., (std::get<I>(d()->data) = S::template load<0>(scalar_array + I * S::VectorWidth)));
if constexpr (is_aligned)
{
(..., (std::get<I>(d()->data) = S::template load<0>(scalar_array + I * S::VectorWidth)));
}
else
{
(...,
(std::get<I>(d()->data) = S::template load_unaligned<0>(scalar_array + I * S::VectorWidth)));
}
}

template <std::size_t... I>
Expand Down Expand Up @@ -815,6 +824,20 @@ namespace vamp
{
}

// TODO: Enable unaligned load for other constructors too
// Construct from a raw scalar buffer, with a runtime flag selecting the
// aligned or unaligned SIMD load path.
constexpr Vector(const typename S::ScalarT *const scalar_data, bool is_aligned) noexcept
{
    // NOTE: assumes that scalar_data is a multiple of VectorWidth of valid data
    // Bridge the runtime flag to the two compile-time pack instantiations.
    is_aligned ? Interface::pack<true>(scalar_data) : Interface::pack<false>(scalar_data);
}

constexpr Vector(const typename S::ScalarT *const scalar_data) noexcept
{
// NOTE: assumes that scalar_data is a multiple of VectorWidth of valid data
Expand Down
14 changes: 14 additions & 0 deletions src/impl/vamp/vector/neon.hh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ namespace vamp
return vld1q_f32(f);
}

template <unsigned int = 0>
// Unaligned 4 x float load. NEON's vld1q_f32 imposes no alignment requirement
// (VLD1 without an alignment qualifier accepts element-aligned pointers), so
// the same intrinsic serves both the aligned and unaligned paths — TODO confirm
// against the ARM intrinsics reference for all supported targets.
inline static auto load_unaligned(const ScalarT *const f) noexcept -> VectorT
{
    return vld1q_f32(f);
}

template <unsigned int = 0>
inline static auto store(ScalarT *f, VectorT v) noexcept -> void
{
Expand Down Expand Up @@ -490,6 +497,13 @@ namespace vamp
return vld1q_s32((const int32_t *const)i);
}

template <unsigned int = 0>
// Unaligned 4 x int32 load. NEON's vld1q_s32 imposes no alignment requirement,
// so the same intrinsic serves both the aligned and unaligned paths — TODO
// confirm against the ARM intrinsics reference for all supported targets.
inline static auto load_unaligned(const ScalarT *const i) noexcept -> VectorT
{
    // reinterpret_cast over a C-style cast: greppable and intent-revealing
    // for the pointer reinterpretation to the intrinsic's element type.
    return vld1q_s32(reinterpret_cast<const int32_t *>(i));
}

template <unsigned int = 0>
inline static auto store(ScalarT *i, VectorT v) noexcept -> void
{
Expand Down

0 comments on commit 508f5aa

Please sign in to comment.