diff options
| author | Nikita Kostovsky <nikita@kostovsky.me> | 2025-11-09 08:53:54 +0100 |
|---|---|---|
| committer | Nikita Kostovsky <nikita@kostovsky.me> | 2025-11-09 08:55:23 +0100 |
| commit | c81e60b1f5d62b74a2ebc269348b36bf91df5015 (patch) | |
| tree | 8589c151556a8d4755ebfa5520d076ae0b902387 /src | |
| parent | 69c5e9c07941212ac77368effd1c60db3140d4a3 (diff) | |
fix memcpy_neon
Diffstat (limited to 'src')
| -rw-r--r-- | src/camera/veyeimx287m.cpp | 22 | ||||
| -rw-r--r-- | src/image.h | 6 | ||||
| -rw-r--r-- | src/mem_utils.h | 33 |
3 files changed, 45 insertions, 16 deletions
diff --git a/src/camera/veyeimx287m.cpp b/src/camera/veyeimx287m.cpp index d4c19fe..e89442c 100644 --- a/src/camera/veyeimx287m.cpp +++ b/src/camera/veyeimx287m.cpp @@ -81,8 +81,7 @@ std::vector<std::shared_ptr<ICamera> > VeyeIMX287m::search() if (!cam->init()) return {}; - // if (!cam->setExposureTimeUs(30)) - if (!cam->setExposureTimeUs(250)) + if (!cam->setExposureTimeUs(30)) return {}; if (!cam->setLaserLevel(1)) @@ -379,12 +378,25 @@ void VeyeIMX287m::calcFrameLoop(std::stop_token stopToken) t.start(); // std::lock_guard buffer_lock{m_bufferMutexes[bufferIdx]}; // get: 4100-4500 - // memcpy(&image.data, m_videoBuffers[bufferIdx], img_size); + memcpy(&image.data, m_videoBuffers[bufferIdx], img_size); // get: 5000-5100 + // memcpy_1by1<img_size / sizeof(ARRAY_TYPE)>((ARRAY_TYPE *) &image.data, + // (ARRAY_TYPE *) m_videoBuffers[bufferIdx]); // memcpy_1by1<img_size>((std::byte *) &image.data, // (std::byte *) m_videoBuffers[bufferIdx]); - memcpy_neon<img_size / sizeof(ARRAY_TYPE)>((ARRAY_TYPE *) &image.data, - (ARRAY_TYPE *) m_videoBuffers[bufferIdx]); + + // memcpy_neon<img_size / sizeof(ARRAY_TYPE)>((ARRAY_TYPE *) &image.data, + // (ARRAY_TYPE *) m_videoBuffers[bufferIdx]); + // std::cerr << "size: " << img_size / sizeof(ARRAY_TYPE) << std::endl; + // Image::data_t native; + // Image::data_t neon; + // memcpy(&native, m_videoBuffers[bufferIdx], img_size); + // memcpy_neon<img_size / sizeof(ARRAY_TYPE)>((ARRAY_TYPE *) &neon, + // (ARRAY_TYPE *) m_videoBuffers[bufferIdx]); + + // if (memcmp(&native, &neon, img_size) != 0) { + // std::cerr << "different: " << img_size / sizeof(ARRAY_TYPE) << std::endl; + // } get_elapsed_ns += t.nsecsElapsed(); } diff --git a/src/image.h b/src/image.h index a3c4476..e4cf9eb 100644 --- a/src/image.h +++ b/src/image.h @@ -17,8 +17,6 @@ struct Image Image(Image &other) = delete; Image(Image &&other) = delete; Image &operator=(Image &&other) = default; - int width{0}; - int height{0}; // uint8_t data[img_height][img_width] = {{0}}; using row_t = std::array<uint8_t, img_width>; @@ -27,7 +25,9 @@ struct Image using column_t = rotated_row_t; using rotated_data_t = std::array<column_t, img_width>; // data_t d; - data_t data; + alignas(128) data_t data; + int width{0}; + int height{0}; // data_t *data; // uint8_t *data = {nullptr}; // uint8_t &dataAt(size_t row, size_t col); diff --git a/src/mem_utils.h b/src/mem_utils.h index 8601f78..cb5f179 100644 --- a/src/mem_utils.h +++ b/src/mem_utils.h @@ -1,25 +1,42 @@ #pragma once #include <cstddef> +#include <iostream> #include <arm_neon.h> +// using ARRAY_TYPE = uint16_t; +// using ARRAY_TYPE = uint32_t; +using ARRAY_TYPE = uint64_t; +// using ARRAY_TYPE = uint64_t; + template<std::size_t S> -void memcpy_1by1(std::byte *dst, const std::byte *src) +void memcpy_1by1(ARRAY_TYPE *dst, const ARRAY_TYPE *src) { for (std::size_t i{0}; i < S; ++i) { dst[i] = src[i]; } } - -using ARRAY_TYPE = uint32_t; template<std::size_t S> -void memcpy_neon(ARRAY_TYPE *dst, const ARRAY_TYPE *src) +inline void memcpy_neon(ARRAY_TYPE *dst, const ARRAY_TYPE *src) { - uint32x4_t tmp; + // for (std::size_t i{0}; i < S; i += 4) { + // vst1q_u32(&dst[i], vld1q_u32(src + i)); + // } + // if (uint64_t(src) % 16 != 0) { + // std::cerr << "src misaligned" << std::endl; + // } + // if (uint64_t(dst) % 16 != 0) { + // std::cerr << "dst misaligned" << std::endl; + // } - for (std::size_t i{0}; i < (S / 4); i += 4) { - tmp = vld1q_u32(src + i); - vst1q_u32(&dst[i], tmp); + for (std::size_t i{0}; i < S; i += 2) { + vst1q_u64(&dst[i], vld1q_u64(src + i)); } + // for (std::size_t i{0}; i < S; i += 8) { + // vst1q_u16(&dst[i], vld1q_u16(src + i)); + // } + // for (std::size_t i{0}; i < S; i += 4) { + // vst1q_u64(&dst[i], vld1q_u64(src + i)); + // } } |
