diff options
| author | Nikita Kostovsky <nikita@kostovsky.me> | 2025-11-09 08:53:54 +0100 |
|---|---|---|
| committer | Nikita Kostovsky <nikita@kostovsky.me> | 2025-11-09 08:55:23 +0100 |
| commit | c81e60b1f5d62b74a2ebc269348b36bf91df5015 (patch) | |
| tree | 8589c151556a8d4755ebfa5520d076ae0b902387 /src/mem_utils.h | |
| parent | 69c5e9c07941212ac77368effd1c60db3140d4a3 (diff) | |
fix memcpy_neon
Diffstat (limited to 'src/mem_utils.h')
| -rw-r--r-- | src/mem_utils.h | 33 |
1 files changed, 25 insertions, 8 deletions
```diff
diff --git a/src/mem_utils.h b/src/mem_utils.h
index 8601f78..cb5f179 100644
--- a/src/mem_utils.h
+++ b/src/mem_utils.h
@@ -1,25 +1,42 @@
 #pragma once
 
 #include <cstddef>
+#include <iostream>
 #include <arm_neon.h>
 
+// using ARRAY_TYPE = uint16_t;
+// using ARRAY_TYPE = uint32_t;
+using ARRAY_TYPE = uint64_t;
+// using ARRAY_TYPE = uint64_t;
+
 template<std::size_t S>
-void memcpy_1by1(std::byte *dst, const std::byte *src)
+void memcpy_1by1(ARRAY_TYPE *dst, const ARRAY_TYPE *src)
 {
     for (std::size_t i{0}; i < S; ++i) {
         dst[i] = src[i];
     }
 }
-
-using ARRAY_TYPE = uint32_t;
 
 template<std::size_t S>
-void memcpy_neon(ARRAY_TYPE *dst, const ARRAY_TYPE *src)
+inline void memcpy_neon(ARRAY_TYPE *dst, const ARRAY_TYPE *src)
 {
-    uint32x4_t tmp;
+    // for (std::size_t i{0}; i < S; i += 4) {
+    //     vst1q_u32(&dst[i], vld1q_u32(src + i));
+    // }
+    // if (uint64_t(src) % 16 != 0) {
+    //     std::cerr << "src misaligned" << std::endl;
+    // }
+    // if (uint64_t(dst) % 16 != 0) {
+    //     std::cerr << "dst misaligned" << std::endl;
+    // }
 
-    for (std::size_t i{0}; i < (S / 4); i += 4) {
-        tmp = vld1q_u32(src + i);
-        vst1q_u32(&dst[i], tmp);
+    for (std::size_t i{0}; i < S; i += 2) {
+        vst1q_u64(&dst[i], vld1q_u64(src + i));
     }
+    // for (std::size_t i{0}; i < S; i += 8) {
+    //     vst1q_u16(&dst[i], vld1q_u16(src + i));
+    // }
+    // for (std::size_t i{0}; i < S; i += 4) {
+    //     vst1q_u64(&dst[i], vld1q_u64(src + i));
+    // }
 }
```
