71a74f9
From f0fd8c553fa024c599f4aff65d7c603ceeaa6a58 Mon Sep 17 00:00:00 2001
25e9802
From: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
25e9802
Date: Mon, 9 Apr 2018 13:52:17 -0700
25e9802
Subject: [PATCH 1/3] Neon-Optimized hash chain rebase
25e9802
25e9802
This should help with compression of data, using NEON instructions
25e9802
(therefore useful for ARMv7/ARMv8).
25e9802
25e9802
Original patch by Jun He.
25e9802
---
71a74f9
 CMakeLists.txt                | 18 ++++++++
71a74f9
 contrib/arm/neon_slide_hash.h | 84 +++++++++++++++++++++++++++++++++++
71a74f9
 deflate.c                     |  7 +++
71a74f9
 3 files changed, 109 insertions(+)
25e9802
 create mode 100644 contrib/arm/neon_slide_hash.h
25e9802
25e9802
diff --git a/CMakeLists.txt b/CMakeLists.txt
71a74f9
index 0fe939d..e9a74e9 100644
25e9802
--- a/CMakeLists.txt
25e9802
+++ b/CMakeLists.txt
71a74f9
@@ -136,6 +136,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
71a74f9
         set(ZLIB_ASMS contrib/amd64/amd64-match.S)
71a74f9
     endif ()
25e9802
 
71a74f9
+    if(ARM_NEON)
71a74f9
+        list(REMOVE_ITEM ZLIB_SRCS inflate.c)
25e9802
+        set(ZLIB_ARM_NEON_HDRS
25e9802
+            contrib/arm/chunkcopy.h
25e9802
+            contrib/arm/inffast_chunk.h
25e9802
+            contrib/arm/neon_slide_hash.h)
71a74f9
+        set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
71a74f9
+        add_definitions(-DARM_NEON)
71a74f9
+        set(COMPILER ${CMAKE_C_COMPILER})
71a74f9
+        # NEON is mandatory in ARMv8.
71a74f9
+        if(${COMPILER} MATCHES "aarch64")
71a74f9
+          set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a)
71a74f9
+          # But it was optional for ARMv7.
71a74f9
+        elseif(${COMPILER} MATCHES "arm")
71a74f9
+          set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon)
71a74f9
+        endif()
71a74f9
+    endif()
71a74f9
+
71a74f9
 	if(ZLIB_ASMS)
71a74f9
 		add_definitions(-DASMV)
71a74f9
 		set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
25e9802
diff --git a/contrib/arm/neon_slide_hash.h b/contrib/arm/neon_slide_hash.h
25e9802
new file mode 100644
25e9802
index 0000000..0daffa1
25e9802
--- /dev/null
25e9802
+++ b/contrib/arm/neon_slide_hash.h
25e9802
@@ -0,0 +1,84 @@
25e9802
+/* Copyright (C) 1995-2011, 2016 Mark Adler
25e9802
+ * Copyright (C) 2017 ARM Holdings Inc.
25e9802
+ * Authors: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
25e9802
+ *          Jun He <jun.he@arm.com>
25e9802
+ * This software is provided 'as-is', without any express or implied
25e9802
+ * warranty.  In no event will the authors be held liable for any damages
25e9802
+ * arising from the use of this software.
25e9802
+ * Permission is granted to anyone to use this software for any purpose,
25e9802
+ * including commercial applications, and to alter it and redistribute it
25e9802
+ * freely, subject to the following restrictions:
25e9802
+ * 1. The origin of this software must not be misrepresented; you must not
25e9802
+ *  claim that you wrote the original software. If you use this software
25e9802
+ *    in a product, an acknowledgment in the product documentation would be
25e9802
+ *    appreciated but is not required.
25e9802
+ * 2. Altered source versions must be plainly marked as such, and must not be
25e9802
+ *    misrepresented as being the original software.
25e9802
+ * 3. This notice may not be removed or altered from any source distribution.
25e9802
+ */
25e9802
+#ifndef __NEON_SLIDE_HASH__
25e9802
+#define __NEON_SLIDE_HASH__
25e9802
+
25e9802
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
25e9802
+#include "deflate.h"
25e9802
+#include <arm_neon.h>
25e9802
+
25e9802
+inline static void neon_slide_hash(deflate_state *s)
25e9802
+{
25e9802
+    /*
25e9802
+     * This is ASIMD implementation for hash table rebase
25e9802
+     * it assumes:
25e9802
+     * 1. hash chain offset (Pos) is 2 bytes
25e9802
+     * 2. hash table size is multiple*128 bytes
25e9802
+     * #1 should be true as Pos is defined as "ush"
25e9802
+     * #2 should be true as hash_bits are greater that 7
25e9802
+     */
25e9802
+    unsigned n, m;
25e9802
+    unsigned short wsize = s->w_size;
25e9802
+    uint16x8_t v, *p;
25e9802
+    size_t size;
25e9802
+
25e9802
+    size = s->hash_size*sizeof(s->head[0]);
25e9802
+    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
25e9802
+
25e9802
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
25e9802
+
25e9802
+    /* slide s->head */
25e9802
+    v = vdupq_n_u16(wsize);
25e9802
+    p = (uint16x8_t *)(s->head);
25e9802
+    n = size / (sizeof(uint16x8_t) * 8);
25e9802
+    do {
25e9802
+        p[0] = vqsubq_u16(p[0], v);
25e9802
+        p[1] = vqsubq_u16(p[1], v);
25e9802
+        p[2] = vqsubq_u16(p[2], v);
25e9802
+        p[3] = vqsubq_u16(p[3], v);
25e9802
+        p[4] = vqsubq_u16(p[4], v);
25e9802
+        p[5] = vqsubq_u16(p[5], v);
25e9802
+        p[6] = vqsubq_u16(p[6], v);
25e9802
+        p[7] = vqsubq_u16(p[7], v);
25e9802
+        p += 8;
25e9802
+    } while (--n);
25e9802
+#ifndef FASTEST
25e9802
+    /* slide s->prev */
25e9802
+    size = wsize*sizeof(s->prev[0]);
25e9802
+
25e9802
+    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
25e9802
+
25e9802
+    p = (uint16x8_t *)(s->prev);
25e9802
+    n = size / (sizeof(uint16x8_t) * 8);
25e9802
+    do {
25e9802
+        p[0] = vqsubq_u16(p[0], v);
25e9802
+        p[1] = vqsubq_u16(p[1], v);
25e9802
+        p[2] = vqsubq_u16(p[2], v);
25e9802
+        p[3] = vqsubq_u16(p[3], v);
25e9802
+        p[4] = vqsubq_u16(p[4], v);
25e9802
+        p[5] = vqsubq_u16(p[5], v);
25e9802
+        p[6] = vqsubq_u16(p[6], v);
25e9802
+        p[7] = vqsubq_u16(p[7], v);
25e9802
+        p += 8;
25e9802
+    } while (--n);
25e9802
+#endif
25e9802
+}
25e9802
+
25e9802
+#endif
25e9802
+#endif
25e9802
diff --git a/deflate.c b/deflate.c
25e9802
index 1ec7614..36f99ac 100644
25e9802
--- a/deflate.c
25e9802
+++ b/deflate.c
25e9802
@@ -50,6 +50,9 @@
25e9802
 /* @(#) $Id$ */
25e9802
 
25e9802
 #include "deflate.h"
25e9802
+#if __ARM_NEON
25e9802
+#include "contrib/arm/neon_slide_hash.h"
25e9802
+#endif
25e9802
 
25e9802
 const char deflate_copyright[] =
25e9802
    " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
25e9802
@@ -201,6 +204,9 @@ local const config configuration_table[10] = {
25e9802
 local void slide_hash(s)
25e9802
     deflate_state *s;
25e9802
 {
25e9802
+#if ARM_NEON
25e9802
+    return neon_slide_hash(s);
25e9802
+#else
25e9802
     unsigned n, m;
25e9802
     Posf *p;
25e9802
     uInt wsize = s->w_size;
25e9802
@@ -222,6 +228,7 @@ local void slide_hash(s)
25e9802
          */
25e9802
     } while (--n);
25e9802
 #endif
25e9802
+#endif
25e9802
 }
25e9802
 
25e9802
 /* ========================================================================= */
25e9802
-- 
71a74f9
2.19.0
25e9802