Skip to content

Commit

Permalink
Add load_unaligned and allow construction from unaligned scalar arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
wbthomason committed Feb 6, 2025
1 parent 064fb7e commit 508f5aa
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 5 deletions.
12 changes: 12 additions & 0 deletions src/impl/vamp/vector/avx.hh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ namespace vamp
return _mm256_load_ps(f);
}

template <unsigned int = 0>
// Unaligned 8 x float (256-bit) load. Unlike load(), `f` does not need to be
// 32-byte aligned; use this for scalar buffers of unknown alignment.
inline static constexpr auto load_unaligned(const ScalarT *const f) noexcept -> VectorT
{
    return _mm256_loadu_ps(f);
}

template <unsigned int = 0>
inline static constexpr auto store(ScalarT *f, VectorT v) noexcept -> void
{
Expand Down Expand Up @@ -408,6 +414,12 @@ namespace vamp

template <unsigned int = 0>
// Aligned 256-bit integer load: `i` must point to 32-byte-aligned storage
// (a misaligned pointer faults with _mm256_load_si256).
inline static constexpr auto load(const ScalarT *const i) noexcept -> VectorT
{
    // reinterpret_cast over a C-style cast: greppable and intent-revealing
    // for the required pointer reinterpretation to the intrinsic's type.
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(i));
}

template <unsigned int = 0>
// Unaligned 256-bit integer load: safe for any alignment of `i`, mirroring
// the float load_unaligned() above.
inline static constexpr auto load_unaligned(const ScalarT *const i) noexcept -> VectorT
{
    // reinterpret_cast over a C-style cast: greppable and intent-revealing
    // for the required pointer reinterpretation to the intrinsic's type.
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(i));
}
Expand Down
33 changes: 28 additions & 5 deletions src/impl/vamp/vector/interface.hh
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ namespace vamp
inline static constexpr std::size_t num_rows = Sig::num_rows;
using DataT = typename Sig::DataT;

inline constexpr auto
to_array() const noexcept -> std::array<typename S::ScalarT, num_scalars_rounded>
inline constexpr auto to_array() const noexcept
-> std::array<typename S::ScalarT, num_scalars_rounded>
{
alignas(S::Alignment) std::array<typename S::ScalarT, num_scalars_rounded> result = {};
to_array(result);
Expand Down Expand Up @@ -718,9 +718,10 @@ namespace vamp
return S::template constant<0>(s);
}

template <bool is_aligned = true>
// Fill this vector's SIMD lanes from a flat scalar buffer. When is_aligned is
// false, the unaligned load path is used so `scalar_data` may have any alignment.
inline constexpr void pack(const typename S::ScalarT *const scalar_data) noexcept
{
    // Expand one load per underlying SIMD register via an index pack.
    constexpr auto lane_indices = std::make_index_sequence<num_vectors>{};
    load_vector<is_aligned>(scalar_data, lane_indices);
}

template <auto fn, std::size_t stride = 1, std::size_t... I>
Expand All @@ -737,13 +738,21 @@ namespace vamp
(..., fn(base + I * stride, std::get<I>(data)));
}

template <std::size_t... I>
template <bool is_aligned, std::size_t... I>
inline constexpr void
load_vector(const typename S::ScalarT *const scalar_array, std::index_sequence<I...>) noexcept
{
// TODO: This might segfault if we had to over-allocate vectors and the scalar data isn't
// full for the over-allocated size
(..., (std::get<I>(d()->data) = S::template load<0>(scalar_array + I * S::VectorWidth)));
if constexpr (is_aligned)
{
(..., (std::get<I>(d()->data) = S::template load<0>(scalar_array + I * S::VectorWidth)));
}
else
{
(...,
(std::get<I>(d()->data) = S::template load_unaligned<0>(scalar_array + I * S::VectorWidth)));
}
}

template <std::size_t... I>
Expand Down Expand Up @@ -815,6 +824,20 @@ namespace vamp
{
}

// TODO: Enable unaligned load for other constructors too
// Construct from a raw scalar buffer, with a runtime flag selecting the
// aligned or unaligned SIMD load path.
constexpr Vector(const typename S::ScalarT *const scalar_data, bool is_aligned) noexcept
{
    // NOTE: assumes that scalar_data is a multiple of VectorWidth of valid data
    // Bridge the runtime flag to the two compile-time pack instantiations.
    is_aligned ? Interface::pack<true>(scalar_data) : Interface::pack<false>(scalar_data);
}

constexpr Vector(const typename S::ScalarT *const scalar_data) noexcept
{
// NOTE: assumes that scalar_data is a multiple of VectorWidth of valid data
Expand Down
14 changes: 14 additions & 0 deletions src/impl/vamp/vector/neon.hh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ namespace vamp
return vld1q_f32(f);
}

template <unsigned int = 0>
// Unaligned 4 x float load. NEON's vld1q_f32 imposes no alignment requirement
// (VLD1 without an alignment qualifier accepts element-aligned pointers), so
// the same intrinsic serves both the aligned and unaligned paths — TODO confirm
// against the ARM intrinsics reference for all supported targets.
inline static auto load_unaligned(const ScalarT *const f) noexcept -> VectorT
{
    return vld1q_f32(f);
}

template <unsigned int = 0>
inline static auto store(ScalarT *f, VectorT v) noexcept -> void
{
Expand Down Expand Up @@ -490,6 +497,13 @@ namespace vamp
return vld1q_s32((const int32_t *const)i);
}

template <unsigned int = 0>
// Unaligned 4 x int32 load. NEON's vld1q_s32 imposes no alignment requirement,
// so the same intrinsic serves both the aligned and unaligned paths — TODO
// confirm against the ARM intrinsics reference for all supported targets.
inline static auto load_unaligned(const ScalarT *const i) noexcept -> VectorT
{
    // reinterpret_cast over a C-style cast: greppable and intent-revealing
    // for the pointer reinterpretation to the intrinsic's element type.
    return vld1q_s32(reinterpret_cast<const int32_t *>(i));
}

template <unsigned int = 0>
inline static auto store(ScalarT *i, VectorT v) noexcept -> void
{
Expand Down

0 comments on commit 508f5aa

Please sign in to comment.