Blob Blame History Raw
From c726be71736f06052401970d6b612a29be87b475 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 22 Dec 2022 11:08:10 -0800
Subject: [PATCH] ARM: Fix signed/unsigned simd mismatch in vbool4::load

Fixes 3721

Please read the comments in 3721. This is the "local" fix for the
build break caused by the type mismatch. It is a band-aid, but it may
be the best solution for the 2.4 branch if we don't want to risk
breaking ABIs by changing the definition of any public types.

Still pending is the question of whether it was a mistake to define
vbool4 storage for NEON as uint32x4_t, and whether we should change it
to int32x4_t to better match the non-simd reference implementation.
After debating that (and identifying somebody with access to an
ARM-based machine to test the solution for us), we may return to
tackle this more fundamental change.
---
 src/include/OpenImageIO/simd.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/include/OpenImageIO/simd.h b/src/include/OpenImageIO/simd.h
index 2a75f38f10..77d066c291 100644
--- a/src/include/OpenImageIO/simd.h
+++ b/src/include/OpenImageIO/simd.h
@@ -3261,7 +3261,9 @@ OIIO_FORCEINLINE void vbool4::load (bool a, bool b, bool c, bool d) {
     m_simd = _mm_castsi128_ps(_mm_set_epi32(-int(d), -int(c), -int(b), -int(a)));
 #elif OIIO_SIMD_NEON
     int values[4] = { -int(a), -int(b), -int(c), -int(d) };
-    m_simd = vld1q_s32 (values);
+    m_simd = vld1q_u32((const uint32_t*)values);
+    // this if we were using int:
+    // m_simd = vld1q_s32(values);
 #else
     m_val[0] = -int(a);
     m_val[1] = -int(b);
@@ -3501,7 +3503,9 @@ OIIO_FORCEINLINE bool extract (const vbool4& a) {
 #if OIIO_SIMD_SSE >= 4
     return _mm_extract_epi32(_mm_castps_si128(a.simd()), i);  // SSE4.1 only
 #elif OIIO_SIMD_NEON
-    return vgetq_lane_s32(a, i);
+    return vgetq_lane_u32(a, i);
+    // this if we were using int:
+    // return vgetq_lane_s32(a, i);
 #else
     return a[i];
 #endif
@@ -3514,8 +3518,11 @@ OIIO_FORCEINLINE vbool4 insert (const vbool4& a, bool val) {
     int ival = -int(val);
     return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
 #elif OIIO_SIMD_NEON
-    int ival = -int(val);
-    return vld1q_lane_s32(&ival, a, i);
+    uint32_t ival = uint32_t(val ? -1 : 0);
+    return vld1q_lane_u32(&ival, a, i);
+    // this if we were using int:
+    // int ival = -int(val);
+    // return vld1q_lane_s32(&ival, a, i);
 #else
     vbool4 tmp = a;
     tmp[i] = -int(val);