71a74f
From 17a154db6774a4acf347cfc5189eaf2cd675e696 Mon Sep 17 00:00:00 2001
25e980
From: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
25e980
Date: Mon, 9 Apr 2018 15:14:19 -0700
25e980
Subject: [PATCH 2/3] Porting optimized longest_match
25e980
25e980
This patch was contributed to zlib-ng and features an improved longest_match
25e980
function using the most distant hash code to reduce the number of checks
25e980
(see: http://www.gildor.org/en/projects/zlib).
25e980
25e980
Original patch by Jun He.
25e980
---
25e980
 CMakeLists.txt                  |   3 +-
71a74f
 contrib/arm/arm_longest_match.h | 142 ++++++++++++++++++++++++++++++++
71a74f
 deflate.c                       |  11 ++-
25e980
 3 files changed, 152 insertions(+), 4 deletions(-)
25e980
 create mode 100644 contrib/arm/arm_longest_match.h
25e980
25e980
diff --git a/CMakeLists.txt b/CMakeLists.txt
71a74f
index e9a74e9..3826eba 100644
25e980
--- a/CMakeLists.txt
25e980
+++ b/CMakeLists.txt
71a74f
@@ -141,7 +141,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
25e980
         set(ZLIB_ARM_NEON_HDRS
25e980
             contrib/arm/chunkcopy.h
25e980
             contrib/arm/inffast_chunk.h
25e980
-            contrib/arm/neon_slide_hash.h)
25e980
+            contrib/arm/neon_slide_hash.h
25e980
+            contrib/arm/arm_longest_match.h)
25e980
         set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
25e980
         add_definitions(-DARM_NEON)
25e980
         set(COMPILER ${CMAKE_C_COMPILER})
25e980
diff --git a/contrib/arm/arm_longest_match.h b/contrib/arm/arm_longest_match.h
25e980
new file mode 100644
25e980
index 0000000..9e7083f
25e980
--- /dev/null
25e980
+++ b/contrib/arm/arm_longest_match.h
25e980
@@ -0,0 +1,142 @@
25e980
+/* Copyright (C) 1995-2011, 2016 Mark Adler
25e980
+ * Copyright (C) 2017 ARM Holdings Inc.
25e980
+ * Authors: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
25e980
+ *          Jun He <jun.he@arm.com>
25e980
+ * This software is provided 'as-is', without any express or implied
25e980
+ * warranty.  In no event will the authors be held liable for any damages
25e980
+ * arising from the use of this software.
25e980
+ * Permission is granted to anyone to use this software for any purpose,
25e980
+ * including commercial applications, and to alter it and redistribute it
25e980
+ * freely, subject to the following restrictions:
25e980
+ * 1. The origin of this software must not be misrepresented; you must not
25e980
+ *  claim that you wrote the original software. If you use this software
25e980
+ *    in a product, an acknowledgment in the product documentation would be
25e980
+ *    appreciated but is not required.
25e980
+ * 2. Altered source versions must be plainly marked as such, and must not be
25e980
+ *    misrepresented as being the original software.
25e980
+ * 3. This notice may not be removed or altered from any source distribution.
25e980
+ */
25e980
+#ifndef __ARM_LONGEST__MATCH__
25e980
+#define __ARM_LONGEST__MATCH__
25e980
+
25e980
+#if defined(ARM_NEON)
25e980
+#include "deflate.h"
25e980
+#include <stdint.h>
25e980
+static inline long get_match_len(const unsigned char *a, const unsigned char *b, long max)
25e980
+{
25e980
+    register int len = 0;
25e980
+    register unsigned long xor = 0;
25e980
+    register int check_loops = max/sizeof(unsigned long);
25e980
+    while(check_loops-- > 0) {
25e980
+        xor = (*(unsigned long *)(a+len)) ^ (*(unsigned long *)(b+len));
25e980
+        if (xor) break;
25e980
+        len += sizeof(unsigned long);
25e980
+    }
25e980
+    if (0 == xor) {
25e980
+        while (len < max) {
25e980
+            if (a[len] != b[len]) break;
25e980
+            len++;
25e980
+        }
25e980
+        return len;
25e980
+    }
25e980
+    xor = __builtin_ctzl(xor)>>3;
25e980
+    return len + xor;
25e980
+}
25e980
+
25e980
+/*
25e980
+ * This implementation is based on algorithm described at:
25e980
+ * http://www.gildor.org/en/projects/zlib
25e980
+ * It uses the hash chain indexed by the most distant hash code to
25e980
+ * reduce the number of checks.
25e980
+ * This also eliminates those unnecessary check loops in legacy
25e980
+ * longest_match's do..while loop if the "most distant code" is out
25e980
+ * of the search buffer.
25e980
+ *
25e980
+ */
25e980
+static inline unsigned arm_longest_match(deflate_state *const s, IPos cur_match) {
25e980
+    unsigned chain_length = s->max_chain_length;/* max hash chain length */
25e980
+    unsigned char *scan = s->window + s->strstart; /* current string */
25e980
+    unsigned char *match;                       /* matched string */
25e980
+    unsigned int len;                  /* length of current match */
25e980
+    unsigned int best_len = s->prev_length;     /* best match length so far */
25e980
+    unsigned int nice_match = s->nice_match;    /* stop if match long enough */
25e980
+    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
25e980
+        s->strstart - (IPos)MAX_DIST(s) : 0;
25e980
+    /* Stop when cur_match becomes <= limit. To simplify the code,
25e980
+     * we prevent matches with the string of window index 0.
25e980
+     */
25e980
+    int offset = 0;  /* offset of the head[most_distant_hash] from IN cur_match */
25e980
+    Pos *prev = s->prev;
25e980
+    unsigned int wmask = s->w_mask;
25e980
+    unsigned char *scan_buf_base = s->window;
25e980
+
25e980
+    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
25e980
+     * It is easy to get rid of this optimization if necessary.
25e980
+     */
25e980
+    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
25e980
+
25e980
+    /* Do not look for matches beyond the end of the input. This is necessary
25e980
+     * to make deflate deterministic.
25e980
+     */
25e980
+    if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead;
25e980
+
25e980
+    Assert((unsigned long)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
25e980
+
25e980
+    /* find most distant hash code for lazy_match */
25e980
+    if (best_len > MIN_MATCH) {
25e980
+        /* search for most distant hash code */
25e980
+        int i;
25e980
+        uint16_t hash = 0;
25e980
+        IPos pos;
25e980
+
25e980
+        UPDATE_HASH(s, hash, scan[1]);
25e980
+        UPDATE_HASH(s, hash, scan[2]);
25e980
+        for (i = 3; i <= best_len; i++) {
25e980
+            UPDATE_HASH(s, hash, scan[i]);
25e980
+            /* get head IPos of hash calculated from scan[i-2..i] */
25e980
+            pos = s->head[hash];
25e980
+            /* compare it to current "farthest hash" IPos */
25e980
+            if (pos <= cur_match) {
25e980
+                /* we have a new "farthest hash" now */
25e980
+                offset = i - 2;
25e980
+                cur_match = pos;
25e980
+            }
25e980
+        }
25e980
+
25e980
+        /* update variables to correspond to offset */
25e980
+        limit += offset;
25e980
+        /*
25e980
+         * check if the most distant code's offset is out of search buffer
25e980
+         * if it is true, then this means scan[offset..offset+2] are not
25e980
+         * present in the search buffer, so we just return the best_len
25e980
+         * we've found.
25e980
+         */
25e980
+        if (cur_match < limit) return best_len;
25e980
+
25e980
+        scan_buf_base -= offset;
25e980
+        /* reduce hash search depth based on best_len */
25e980
+        chain_length /= best_len - MIN_MATCH;
25e980
+    }
25e980
+
25e980
+    do {
25e980
+        Assert(cur_match < s->strstart, "no future");
25e980
+
25e980
+        /* Determine matched length at current pos */
25e980
+        match = scan_buf_base + cur_match;
25e980
+        len = get_match_len(match, scan, MAX_MATCH);
25e980
+
25e980
+        if (len > best_len) {
25e980
+            /* found longer string */
25e980
+            s->match_start = cur_match - offset;
25e980
+            best_len = len;
25e980
+            /* good enough? */
25e980
+            if (len >= nice_match) break;
25e980
+        }
25e980
+        /* move to prev pos in this hash chain */
25e980
+    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0);
25e980
+
25e980
+    return (best_len <= s->lookahead)? best_len : s->lookahead;
25e980
+}
25e980
+
25e980
+#endif
25e980
+#endif
25e980
diff --git a/deflate.c b/deflate.c
25e980
index 36f99ac..4c42259 100644
25e980
--- a/deflate.c
25e980
+++ b/deflate.c
25e980
@@ -50,9 +50,6 @@
25e980
 /* @(#) $Id$ */
25e980
 
25e980
 #include "deflate.h"
25e980
-#if __ARM_NEON
25e980
-#include "contrib/arm/neon_slide_hash.h"
25e980
-#endif
25e980
 
25e980
 const char deflate_copyright[] =
25e980
    " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
25e980
@@ -196,6 +193,11 @@ local const config configuration_table[10] = {
25e980
     s->head[s->hash_size-1] = NIL; \
25e980
     zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head));
25e980
 
25e980
+#if defined(ARM_NEON)
25e980
+#include "contrib/arm/arm_longest_match.h"
25e980
+#include "contrib/arm/neon_slide_hash.h"
25e980
+#endif
25e980
+
25e980
 /* ===========================================================================
25e980
  * Slide the hash table when sliding the window down (could be avoided with 32
25e980
  * bit values at the expense of memory usage). We slide even when level == 0 to
25e980
@@ -1244,6 +1246,9 @@ local uInt longest_match(s, cur_match)
25e980
     deflate_state *s;
25e980
     IPos cur_match;                             /* current match */
25e980
 {
25e980
+#if defined(ARM_NEON)
25e980
+    return arm_longest_match(s, cur_match);
25e980
+#endif
25e980
     unsigned chain_length = s->max_chain_length;/* max hash chain length */
25e980
     register Bytef *scan = s->window + s->strstart; /* current string */
25e980
     register Bytef *match;                      /* matched string */
25e980
-- 
71a74f
2.19.0
25e980