Blob Blame Raw
From cb88e5da8003afcdc443b787fdcb77285e5a8a02 Mon Sep 17 00:00:00 2001
From: mayeut <mayeut@users.noreply.github.com>
Date: Tue, 20 Sep 2016 21:06:24 +0200
Subject: [PATCH] ARM64 NEON: Fix another ABI conformance issue

Based on
https://github.com/mayeut/libjpeg-turbo/commit/98a5a9dc899aa9265858a3cbe0a96289a31a1322
with wordsmithing by DRC.

In the AArch64 ABI, as in many others, it's forbidden to read/store data
below the stack pointer.  Some SIMD functions were doing just that
(stack pointer misuse) when trying to preserve callee-saved registers,
and this resulted in those registers being restored with incorrect
contents under certain circumstances.

This patch fixes that behavior, and callee-saved registers are now
stored above the stack pointer throughout the function call.  The patch
also removes register saving in places where it is unnecessary for this
ABI, or it makes use of unused scratch regiters instead of callee-saved
registers.

Fixes #97.  Closes #101.

Refer also to https://bugzilla.redhat.com/show_bug.cgi?id=1368569
---
 simd/jsimd_arm64_neon.S | 108 +++++++++++++-----------------------------------
 2 files changed, 34 insertions(+), 80 deletions(-)

diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 6c1a959..3309858 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -217,8 +217,9 @@ asm_function jsimd_idct_islow_neon
 
     sub             sp, sp, #64
     adr             x15, Ljsimd_idct_islow_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
     ld1             {v0.8h, v1.8h}, [x15]
     ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
     ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
@@ -243,7 +244,6 @@ asm_function jsimd_idct_islow_neon
     shl             v10.8h, v2.8h, #(PASS1_BITS)
     sqxtn           v16.8b, v15.8h
     mov             TMP1, v16.d[0]
-    sub             sp, sp, #64
     mvn             TMP2, TMP1
 
     cbnz            TMP2, 2f
@@ -1117,18 +1117,12 @@ asm_function jsimd_idct_4x4_neon
     uxtw x3, w3
 
     /* Save all used NEON registers */
-    sub             sp, sp, 272
-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
     /* Load constants (v3.4h is just used for padding) */
     adr             TMP4, Ljsimd_idct_4x4_neon_consts
-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1237,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon
 #endif
 
     /* vpop            {v8.4h - v15.4h}    ;not available */
-    sub             sp, sp, #272
-    ldr             x15, [sp], 16
-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1320,18 +1306,13 @@ asm_function jsimd_idct_2x2_neon
     uxtw x3, w3
 
     /* vpush           {v8.4h - v15.4h}            ; not available */
-    sub             sp, sp, 208
-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
 
     /* Load constants */
     adr             TMP2, Ljsimd_idct_2x2_neon_consts
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v21.8b, v22.8b}, [sp], 16
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v30.8b, v31.8b}, [sp], 16
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v14.4h}, [TMP2]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1431,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon
     st1             {v26.b}[1], [TMP2], 1
     st1             {v27.b}[5], [TMP2], 1
 
-    sub             sp, sp, #208
-    ldr             x15, [sp], 16
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v21.8b, v22.8b}, [sp], 16
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v30.8b, v31.8b}, [sp], 16
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1719,13 +1693,13 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     INPUT_BUF2      .req x1
 
     RGB             .req x7
-    Y               .req x8
-    U               .req x9
-    V               .req x10
+    Y               .req x9
+    U               .req x10
+    V               .req x11
     N               .req w15
 
-    sub             sp, sp, 336
-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
 
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
     .if \fast_st3 == 1
@@ -1735,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     .endif
 
     /* Save NEON registers */
-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h}, [x15], 16
     ld1             {v2.8h}, [x15]
 
-    /* Save ARM registers and handle input arguments */
-    /* push            {x4, x5, x6, x7, x8, x9, x10, x30} */
-    stp             x4, x5, [sp], 16
-    stp             x6, x7, [sp], 16
-    stp             x8, x9, [sp], 16
-    stp             x10, x30, [sp], 16
     ldr             INPUT_BUF0, [INPUT_BUF]
     ldr             INPUT_BUF1, [INPUT_BUF, #8]
     ldr             INPUT_BUF2, [INPUT_BUF, #16]
@@ -1818,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     b.gt            0b
 9:
     /* Restore all registers and return */
-    sub             sp, sp, #336
-    ldr             x15, [sp], 16
-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
-    /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
-    ldp             x4, x5, [sp], 16
-    ldp             x6, x7, [sp], 16
-    ldp             x8, x9, [sp], 16
-    ldp             x10, x30, [sp], 16
     br              x30
     .unreq          OUTPUT_WIDTH
     .unreq          INPUT_ROW
@@ -2101,8 +2050,9 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
 
     /* Save NEON registers */
     sub             sp, sp, #64
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    mov             x9, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
 
     /* Outer loop over scanlines */
     cmp             NUM_ROWS, #1
@@ -2155,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     b.gt            0b
 9:
     /* Restore all registers and return */
-    sub             sp, sp, #64
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
     br              x30
@@ -2359,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon
 
     /* Save NEON registers */
     sub             sp, sp, #64
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
 
     /* Load all DATA into NEON registers with the following allocation:
      *       0 1 2 3 | 4 5 6 7
@@ -2590,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon
     st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
 
     /* Restore NEON registers */
-    sub             sp, sp, #64
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
 
@@ -3104,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     sub             sp, sp, 272
     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
     /* Save ARM registers */
-    stp             x19, x20, [sp], 16
+    stp             x19, x20, [sp]
 .if \fast_tbl == 1
     adr             x15, Ljsimd_huff_encode_one_block_neon_consts
 .else
@@ -3318,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     and             v18.16b, v18.16b, v23.16b
       add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
     and             v20.16b, v20.16b, v23.16b
-      add             x15, sp, #0x80           /* x15 = t2 */
+      add             x15, sp, #0x90           /* x15 = t2 */
     and             v22.16b, v22.16b, v23.16b
       ldr             w10, [x4, x12, lsl #2]
     addp            v16.16b, v16.16b, v18.16b
@@ -3341,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     rbit            x9, x9             /* x9 = index0 */
     ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
     cmp             w12, #(64-8)
-    mov             x11, sp
+    add             x11, sp, #16
     b.lt            4f
     cbz             x9, 6f
     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
@@ -3445,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     put_bits        x3, x11
     cbnz            x9, 1b
 6:
-    add             x13, sp, #0xfe
+    add             x13, sp, #0x10e
     cmp             x15, x13
     b.hs            1f
     ldr             w12, [x5]
@@ -3453,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     checkbuf47
     put_bits        x12, x14
 1:
-    sub             sp, sp, 16
     str             PUT_BUFFER, [x0, #0x10]
     str             PUT_BITSw, [x0, #0x18]
     ldp             x19, x20, [sp], 16