#pragma once

#include <cstddef>
#include <cstdint>

// The NEON intrinsics header only exists on Arm targets with NEON enabled.
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/// @brief Copies S bytes from src to dst, one byte per loop iteration.
///
/// Bytes are copied in ascending index order.
///
/// @tparam S   Number of bytes to copy.
/// @param dst  Destination buffer; must have room for at least S bytes.
/// @param src  Source buffer; must hold at least S readable bytes.
template <std::size_t S>
void memcpy_1by1(std::byte *dst, const std::byte *src) {
  for (std::size_t i{0}; i < S; ++i) {
    dst[i] = src[i];
  }
}

/// Element type handled by memcpy_neon.
using ARRAY_TYPE = std::uint32_t;

/// @brief Copies the first S / 4 elements from src to dst, four elements per
///        NEON load/store where possible.
///
/// NOTE(review): S is presumably a byte count (as in memcpy_1by1), with 4
/// being sizeof(ARRAY_TYPE) -- confirm against the callers. Any S % 4 trailing
/// bytes are not copied.
///
/// When the translation unit is built without NEON support, the vector loop is
/// compiled out and the scalar loop performs the whole copy.
///
/// @tparam S   Size argument; S / 4 elements are copied.
/// @param dst  Destination buffer; must have room for at least S / 4 elements.
/// @param src  Source buffer; must hold at least S / 4 readable elements.
template <std::size_t S>
void memcpy_neon(ARRAY_TYPE *dst, const ARRAY_TYPE *src) {
  constexpr std::size_t kElementCount = S / 4;
  std::size_t i{0};

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  // One uint32x4_t holds four 32-bit lanes.
  constexpr std::size_t kLanes = 4;

  // Only whole vectors are handled here: a partial trailing group is left to
  // the scalar loop below so that no access goes beyond element S / 4.
  for (; i + kLanes <= kElementCount; i += kLanes) {
    const uint32x4_t tmp = vld1q_u32(src + i);
    vst1q_u32(dst + i, tmp);
  }
#endif

  // Remaining 0-3 elements (or every element when NEON is unavailable).
  for (; i < kElementCount; ++i) {
    dst[i] = src[i];
  }
}