Blob Blame Raw
From f0fd8c553fa024c599f4aff65d7c603ceeaa6a58 Mon Sep 17 00:00:00 2001
From: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
Date: Mon, 9 Apr 2018 13:52:17 -0700
Subject: [PATCH 1/3] Neon-Optimized hash chain rebase

This should help with compression of data, using NEON instructions
(therefore useful for ARMv7/ARMv8).

Original patch by Jun He.
---
 CMakeLists.txt                | 18 ++++++++
 contrib/arm/neon_slide_hash.h | 84 +++++++++++++++++++++++++++++++++++
 deflate.c                     |  7 +++
 3 files changed, 109 insertions(+)
 create mode 100644 contrib/arm/neon_slide_hash.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0fe939d..e9a74e9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,6 +136,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
         set(ZLIB_ASMS contrib/amd64/amd64-match.S)
     endif ()
 
+    if(ARM_NEON)
+        list(REMOVE_ITEM ZLIB_SRCS inflate.c)
+        set(ZLIB_ARM_NEON_HDRS
+            contrib/arm/chunkcopy.h
+            contrib/arm/inffast_chunk.h
+            contrib/arm/neon_slide_hash.h)
+        set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
+        add_definitions(-DARM_NEON)
+        set(COMPILER ${CMAKE_C_COMPILER})
+        # NEON is mandatory in ARMv8.
+        if(${COMPILER} MATCHES "aarch64")
+          set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a)
+          # But it was optional for ARMv7.
+        elseif(${COMPILER} MATCHES "arm")
+          set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon)
+        endif()
+    endif()
+
 	if(ZLIB_ASMS)
 		add_definitions(-DASMV)
 		set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
diff --git a/contrib/arm/neon_slide_hash.h b/contrib/arm/neon_slide_hash.h
new file mode 100644
index 0000000..0daffa1
--- /dev/null
+++ b/contrib/arm/neon_slide_hash.h
@@ -0,0 +1,84 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *          Jun He <jun.he@arm.com>
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ *  claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+#ifndef __NEON_SLIDE_HASH__
+#define __NEON_SLIDE_HASH__
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include "deflate.h"
+#include <arm_neon.h>
+
+inline static void neon_slide_hash(deflate_state *s)
+{
+    /*
+     * This is ASIMD implementation for hash table rebase
+     * it assumes:
+     * 1. hash chain offset (Pos) is 2 bytes
+     * 2. hash table size is multiple*128 bytes
+     * #1 should be true as Pos is defined as "ush"
+     * #2 should be true as hash_bits are greater that 7
+     */
+    unsigned n, m;
+    unsigned short wsize = s->w_size;
+    uint16x8_t v, *p;
+    size_t size;
+
+    size = s->hash_size*sizeof(s->head[0]);
+    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+
+    /* slide s->head */
+    v = vdupq_n_u16(wsize);
+    p = (uint16x8_t *)(s->head);
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p[0] = vqsubq_u16(p[0], v);
+        p[1] = vqsubq_u16(p[1], v);
+        p[2] = vqsubq_u16(p[2], v);
+        p[3] = vqsubq_u16(p[3], v);
+        p[4] = vqsubq_u16(p[4], v);
+        p[5] = vqsubq_u16(p[5], v);
+        p[6] = vqsubq_u16(p[6], v);
+        p[7] = vqsubq_u16(p[7], v);
+        p += 8;
+    } while (--n);
+#ifndef FASTEST
+    /* slide s->prev */
+    size = wsize*sizeof(s->prev[0]);
+
+    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+    p = (uint16x8_t *)(s->prev);
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p[0] = vqsubq_u16(p[0], v);
+        p[1] = vqsubq_u16(p[1], v);
+        p[2] = vqsubq_u16(p[2], v);
+        p[3] = vqsubq_u16(p[3], v);
+        p[4] = vqsubq_u16(p[4], v);
+        p[5] = vqsubq_u16(p[5], v);
+        p[6] = vqsubq_u16(p[6], v);
+        p[7] = vqsubq_u16(p[7], v);
+        p += 8;
+    } while (--n);
+#endif
+}
+
+#endif
+#endif
diff --git a/deflate.c b/deflate.c
index 1ec7614..36f99ac 100644
--- a/deflate.c
+++ b/deflate.c
@@ -50,6 +50,9 @@
 /* @(#) $Id$ */
 
 #include "deflate.h"
+#if __ARM_NEON
+#include "contrib/arm/neon_slide_hash.h"
+#endif
 
 const char deflate_copyright[] =
    " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
@@ -201,6 +204,9 @@ local const config configuration_table[10] = {
 local void slide_hash(s)
     deflate_state *s;
 {
+#if ARM_NEON
+    return neon_slide_hash(s);
+#else
     unsigned n, m;
     Posf *p;
     uInt wsize = s->w_size;
@@ -222,6 +228,7 @@ local void slide_hash(s)
          */
     } while (--n);
 #endif
+#endif
 }
 
 /* ========================================================================= */
-- 
2.19.0