From 25e9802713484882c27c1f979a6610a42414ee13 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Apr 30 2018 16:15:44 +0000 Subject: aarch64 optimizations --- diff --git a/0001-Neon-Optimized-hash-chain-rebase.patch b/0001-Neon-Optimized-hash-chain-rebase.patch new file mode 100644 index 0000000..bebec86 --- /dev/null +++ b/0001-Neon-Optimized-hash-chain-rebase.patch @@ -0,0 +1,157 @@ +From f849a23e0afc8b8a670fda64eec8b573fe62daa7 Mon Sep 17 00:00:00 2001 +From: Adenilson Cavalcanti +Date: Mon, 9 Apr 2018 13:52:17 -0700 +Subject: [PATCH 1/3] Neon-Optimized hash chain rebase + +This should help with compression of data, using NEON instructions +(therefore useful for ARMv7/ARMv8). + +Original patch by Jun He. +--- + CMakeLists.txt | 5 ++- + contrib/arm/neon_slide_hash.h | 84 +++++++++++++++++++++++++++++++++++++++++++ + deflate.c | 7 ++++ + 3 files changed, 95 insertions(+), 1 deletion(-) + create mode 100644 contrib/arm/neon_slide_hash.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 98ee4dd..230ca6d 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -139,7 +139,10 @@ if(CMAKE_COMPILER_IS_GNUCC) + + if(ARM_NEON) + list(REMOVE_ITEM ZLIB_SRCS inflate.c) +- set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h contrib/arm/inffast_chunk.h) ++ set(ZLIB_ARM_NEON_HDRS ++ contrib/arm/chunkcopy.h ++ contrib/arm/inffast_chunk.h ++ contrib/arm/neon_slide_hash.h) + set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) + add_definitions(-DARM_NEON) + set(COMPILER ${CMAKE_C_COMPILER}) +diff --git a/contrib/arm/neon_slide_hash.h b/contrib/arm/neon_slide_hash.h +new file mode 100644 +index 0000000..0daffa1 +--- /dev/null ++++ b/contrib/arm/neon_slide_hash.h +@@ -0,0 +1,84 @@ ++/* Copyright (C) 1995-2011, 2016 Mark Adler ++ * Copyright (C) 2017 ARM Holdings Inc. ++ * Authors: Adenilson Cavalcanti ++ * Jun He ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++#ifndef __NEON_SLIDE_HASH__ ++#define __NEON_SLIDE_HASH__ ++ ++#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) ++#include "deflate.h" ++#include ++ ++inline static void neon_slide_hash(deflate_state *s) ++{ ++ /* ++ * This is ASIMD implementation for hash table rebase ++ * it assumes: ++ * 1. hash chain offset (Pos) is 2 bytes ++ * 2. 
hash table size is multiple*128 bytes ++ * #1 should be true as Pos is defined as "ush" ++ * #2 should be true as hash_bits are greater that 7 ++ */ ++ unsigned n, m; ++ unsigned short wsize = s->w_size; ++ uint16x8_t v, *p; ++ size_t size; ++ ++ size = s->hash_size*sizeof(s->head[0]); ++ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); ++ ++ Assert(sizeof(Pos) == 2, "Wrong Pos size"); ++ ++ /* slide s->head */ ++ v = vdupq_n_u16(wsize); ++ p = (uint16x8_t *)(s->head); ++ n = size / (sizeof(uint16x8_t) * 8); ++ do { ++ p[0] = vqsubq_u16(p[0], v); ++ p[1] = vqsubq_u16(p[1], v); ++ p[2] = vqsubq_u16(p[2], v); ++ p[3] = vqsubq_u16(p[3], v); ++ p[4] = vqsubq_u16(p[4], v); ++ p[5] = vqsubq_u16(p[5], v); ++ p[6] = vqsubq_u16(p[6], v); ++ p[7] = vqsubq_u16(p[7], v); ++ p += 8; ++ } while (--n); ++#ifndef FASTEST ++ /* slide s->prev */ ++ size = wsize*sizeof(s->prev[0]); ++ ++ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); ++ ++ p = (uint16x8_t *)(s->prev); ++ n = size / (sizeof(uint16x8_t) * 8); ++ do { ++ p[0] = vqsubq_u16(p[0], v); ++ p[1] = vqsubq_u16(p[1], v); ++ p[2] = vqsubq_u16(p[2], v); ++ p[3] = vqsubq_u16(p[3], v); ++ p[4] = vqsubq_u16(p[4], v); ++ p[5] = vqsubq_u16(p[5], v); ++ p[6] = vqsubq_u16(p[6], v); ++ p[7] = vqsubq_u16(p[7], v); ++ p += 8; ++ } while (--n); ++#endif ++} ++ ++#endif ++#endif +diff --git a/deflate.c b/deflate.c +index 1ec7614..36f99ac 100644 +--- a/deflate.c ++++ b/deflate.c +@@ -50,6 +50,9 @@ + /* @(#) $Id$ */ + + #include "deflate.h" ++#if __ARM_NEON ++#include "contrib/arm/neon_slide_hash.h" ++#endif + + const char deflate_copyright[] = + " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler "; +@@ -201,6 +204,9 @@ local const config configuration_table[10] = { + local void slide_hash(s) + deflate_state *s; + { ++#if ARM_NEON ++ return neon_slide_hash(s); ++#else + unsigned n, m; + Posf *p; + uInt wsize = s->w_size; +@@ -222,6 +228,7 @@ local void slide_hash(s) + */ + } while (--n); + #endif ++#endif + } + + /* ========================================================================= */ +-- +2.14.3 + diff --git a/0001-Porting-inflate-using-wider-loads-and-stores.patch b/0001-Porting-inflate-using-wider-loads-and-stores.patch new file mode 100644 index 0000000..17c3e7c --- /dev/null +++ b/0001-Porting-inflate-using-wider-loads-and-stores.patch @@ -0,0 +1,2268 @@ +From 390b2713be0c7f682861264a74b202439caf9460 Mon Sep 17 00:00:00 2001 +From: Adenilson Cavalcanti +Date: Wed, 4 Apr 2018 12:10:35 -0700 +Subject: [PATCH 1/2] Porting inflate using wider loads and stores + +In inflate_fast() the output pointer always has plenty of room to write. This +means that so long as the target is capable, wide un-aligned loads and stores +can be used to transfer several bytes at once. When the reference distance is +too short simply unroll the data a little to increase the distance. + +For reference, please see: +https://chromium.googlesource.com/chromium/src/+/78104f4d73e3bbb4155fa804d00ed66682180556 +ps: this is still missing the fix for inflate_back corner case. 
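The core idea, sketched informally below (this illustration is not taken from the patch: the helper names, the fixed 16-byte CHUNK, and the use of plain memcpy for the wide transfer are assumptions; the patch itself implements the technique with loadchunk()/storechunk() and NEON intrinsics in contrib/arm/chunkcopy.h):

    #include <string.h>

    #define CHUNK 16   /* one wide register's worth of bytes (illustrative) */

    /* Copy at least `len` bytes in CHUNK-sized unaligned transfers.
       May write up to CHUNK - 1 bytes past out + len, which is tolerable
       inside inflate_fast() because every iteration starts with at least
       258 bytes of output headroom. */
    static unsigned char *copy_wide(unsigned char *out,
                                    const unsigned char *from,
                                    unsigned len) {
        unsigned char *end = out + len;
        while (out < end) {
            memcpy(out, from, CHUNK);   /* compiles to one wide load/store */
            out += CHUNK;
            from += CHUNK;
        }
        return end;                     /* bytes written past `end` are scratch */
    }

    /* When the match distance is shorter than CHUNK, replicate the pattern
       first so the distance doubles each pass; once it reaches CHUNK, the
       wide copy above can take over. */
    static unsigned char *grow_distance(unsigned char *out,
                                        unsigned *dist, unsigned *len) {
        while (*dist < CHUNK && *dist < *len) {
            memcpy(out, out - *dist, *dist);  /* ranges do not overlap */
            out += *dist;
            *len -= *dist;
            *dist *= 2;
        }
        return out;
    }

The patch's chunkcopy_core() and chunkunroll_relaxed() below follow the same shape, but additionally handle the initial tail "bump" and issue the wide transfers through NEON chunk loads and stores.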
+ +Change-Id: I5216424ab584e069b77ddf04000a313d5ca99839 +--- + CMakeLists.txt | 21 +- + contrib/arm/chunkcopy.h | 297 +++++++++ + contrib/arm/inffast.c | 307 +++++++++ + contrib/arm/inflate.c | 1572 +++++++++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 2195 insertions(+), 2 deletions(-) + create mode 100644 contrib/arm/chunkcopy.h + create mode 100644 contrib/arm/inffast.c + create mode 100644 contrib/arm/inflate.c + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0fe939d..09bb3db 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -7,6 +7,7 @@ set(VERSION "1.2.11") + + option(ASM686 "Enable building i686 assembly implementation") + option(AMD64 "Enable building amd64 assembly implementation") ++option(ARM_NEON "Enable building ARM NEON intrinsics implementation") + + set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables") + set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries") +@@ -136,6 +137,22 @@ if(CMAKE_COMPILER_IS_GNUCC) + set(ZLIB_ASMS contrib/amd64/amd64-match.S) + endif () + ++ if(ARM_NEON) ++ list(REMOVE_ITEM ZLIB_SRCS inflate.c) ++ list(REMOVE_ITEM ZLIB_SRCS inffast.c) ++ set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h) ++ set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast.c) ++ add_definitions(-DARM_NEON) ++ set(COMPILER ${CMAKE_C_COMPILER}) ++ # NEON is mandatory in ARMv8. ++ if(${COMPILER} MATCHES "aarch64") ++ set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a) ++ # But it was optional for ARMv7. ++ elseif(${COMPILER} MATCHES "arm") ++ set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon) ++ endif() ++ endif() ++ + if(ZLIB_ASMS) + add_definitions(-DASMV) + set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) +@@ -183,8 +200,8 @@ if(MINGW) + set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) + endif(MINGW) + +-add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) +-add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) ++add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARM_NEON} ${ZLIB_ARM_NEON_HDRS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) ++add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARM_NEON} ${ZLIB_ARM_NEON_HDRS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) + set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) + set_target_properties(zlib PROPERTIES SOVERSION 1) + +diff --git a/contrib/arm/chunkcopy.h b/contrib/arm/chunkcopy.h +new file mode 100644 +index 0000000..d42995c +--- /dev/null ++++ b/contrib/arm/chunkcopy.h +@@ -0,0 +1,297 @@ ++/* chunkcopy.h -- fast copies and sets ++ * Copyright (C) 2017 ARM, Inc. ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#ifndef CHUNKCOPY_H ++#define CHUNKCOPY_H ++ ++#include ++#include "zutil.h" ++ ++#if __STDC_VERSION__ >= 199901L ++#define Z_RESTRICT restrict ++#else ++#define Z_RESTRICT ++#endif ++ ++typedef uint8x16_t chunkcopy_chunk_t; ++#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t) ++ ++/* ++ Ask the compiler to perform a wide, unaligned load with an machine ++ instruction appropriate for the chunkcopy_chunk_t type. 
++ */ ++static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR* s) { ++ chunkcopy_chunk_t c; ++ __builtin_memcpy(&c, s, sizeof(c)); ++ return c; ++} ++ ++/* ++ Ask the compiler to perform a wide, unaligned store with an machine ++ instruction appropriate for the chunkcopy_chunk_t type. ++ */ ++static inline void storechunk(unsigned char FAR* d, chunkcopy_chunk_t c) { ++ __builtin_memcpy(d, &c, sizeof(c)); ++} ++ ++/* ++ Perform a memcpy-like operation, but assume that length is non-zero and that ++ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if ++ the length is shorter than this. ++ ++ It also guarantees that it will properly unroll the data if the distance ++ between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on ++ in chunkcopy_relaxed(). ++ ++ Aside from better memory bus utilisation, this means that short copies ++ (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop ++ without iteration, which will hopefully make the branch prediction more ++ reliable. ++ */ ++static inline unsigned char FAR* chunkcopy_core(unsigned char FAR* out, ++ const unsigned char FAR* from, ++ unsigned len) { ++ int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1; ++ storechunk(out, loadchunk(from)); ++ out += bump; ++ from += bump; ++ len /= CHUNKCOPY_CHUNK_SIZE; ++ while (len-- > 0) { ++ storechunk(out, loadchunk(from)); ++ out += CHUNKCOPY_CHUNK_SIZE; ++ from += CHUNKCOPY_CHUNK_SIZE; ++ } ++ return out; ++} ++ ++/* ++ Like chunkcopy_core, but avoid writing beyond of legal output. ++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_core_safe( ++ unsigned char FAR* out, ++ const unsigned char FAR* from, ++ unsigned len, ++ unsigned char FAR* limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ if (limit - out < CHUNKCOPY_CHUNK_SIZE) { ++ const unsigned char FAR* Z_RESTRICT rfrom = from; ++ if (len & 8) { ++ __builtin_memcpy(out, rfrom, 8); ++ out += 8; ++ rfrom += 8; ++ } ++ if (len & 4) { ++ __builtin_memcpy(out, rfrom, 4); ++ out += 4; ++ rfrom += 4; ++ } ++ if (len & 2) { ++ __builtin_memcpy(out, rfrom, 2); ++ out += 2; ++ rfrom += 2; ++ } ++ if (len & 1) { ++ *out++ = *rfrom++; ++ } ++ return out; ++ } ++ return chunkcopy_core(out, from, len); ++} ++ ++/* ++ Perform short copies until distance can be rewritten as being at least ++ CHUNKCOPY_CHUNK_SIZE. ++ ++ This assumes that it's OK to overwrite at least the first ++ 2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than ++ this. This assumption holds within inflate_fast() which starts every ++ iteration with at least 258 bytes of output space available (258 being the ++ maximum length output from a single token; see inffast.c). 
++ */ ++static inline unsigned char FAR* chunkunroll_relaxed(unsigned char FAR* out, ++ unsigned FAR* dist, ++ unsigned FAR* len) { ++ const unsigned char FAR* from = out - *dist; ++ while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) { ++ storechunk(out, loadchunk(from)); ++ out += *dist; ++ *len -= *dist; ++ *dist += *dist; ++ } ++ return out; ++} ++ ++static inline uint8x16_t chunkset_vld1q_dup_u8x8( ++ const unsigned char FAR* Z_RESTRICT from) { ++#if defined(__clang__) || defined(__aarch64__) ++ return vreinterpretq_u8_u64(vld1q_dup_u64((void*)from)); ++#else ++ /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a ++ * void pointer, so here's an alternate implementation. ++ */ ++ uint8x8_t h = vld1_u8(from); ++ return vcombine_u8(h, h); ++#endif ++} ++ ++/* ++ Perform an overlapping copy which behaves as a memset() operation, but ++ supporting periods other than one, and assume that length is non-zero and ++ that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output ++ even if the length is shorter than this. ++ */ ++static inline unsigned char FAR* chunkset_core(unsigned char FAR* out, ++ unsigned period, ++ unsigned len) { ++ uint8x16_t f; ++ int bump = ((len - 1) % sizeof(f)) + 1; ++ ++ switch (period) { ++ case 1: ++ f = vld1q_dup_u8(out - 1); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ while (len > 0) { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } ++ return out; ++ case 2: ++ f = vreinterpretq_u8_u16(vld1q_dup_u16((void*)(out - 2))); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = vreinterpretq_u8_u16(vld1q_dup_u16((void*)(out - 2))); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ case 4: ++ f = vreinterpretq_u8_u32(vld1q_dup_u32((void*)(out - 4))); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = vreinterpretq_u8_u32(vld1q_dup_u32((void*)(out - 4))); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ case 8: ++ f = chunkset_vld1q_dup_u8x8(out - 8); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = chunkset_vld1q_dup_u8x8(out - 8); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ } ++ out = chunkunroll_relaxed(out, &period, &len); ++ return chunkcopy_core(out, out - period, len); ++} ++ ++/* ++ Perform a memcpy-like operation, but assume that length is non-zero and that ++ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if ++ the length is shorter than this. ++ ++ Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour ++ of overlapping buffers, regardless of the distance between the pointers. ++ This is reflected in the `restrict`-qualified pointers, allowing the ++ compiler to reorder loads and stores. ++ */ ++static inline unsigned char FAR* chunkcopy_relaxed( ++ unsigned char FAR* Z_RESTRICT out, ++ const unsigned char FAR* Z_RESTRICT from, ++ unsigned len) { ++ return chunkcopy_core(out, from, len); ++} ++ ++/* ++ Like chunkcopy_relaxed, but avoid writing beyond of legal output. ++ ++ Unlike chunkcopy_core_safe() above, no guarantee is made regarding the ++ behaviour of overlapping buffers, regardless of the distance between the ++ pointers. This is reflected in the `restrict`-qualified pointers, allowing ++ the compiler to reorder loads and stores. 
++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_safe( ++ unsigned char FAR* out, ++ const unsigned char FAR* Z_RESTRICT from, ++ unsigned len, ++ unsigned char FAR* limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ return chunkcopy_core_safe(out, from, len, limit); ++} ++ ++/* ++ Perform chunky copy within the same buffer, where the source and destination ++ may potentially overlap. ++ ++ Assumes that len > 0 on entry, and that it's safe to write at least ++ CHUNKCOPY_CHUNK_SIZE*3 bytes to the output. ++ */ ++static inline unsigned char FAR* ++chunkcopy_lapped_relaxed(unsigned char FAR* out, unsigned dist, unsigned len) { ++ if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) { ++ return chunkset_core(out, dist, len); ++ } ++ return chunkcopy_core(out, out - dist, len); ++} ++ ++/* ++ Behave like chunkcopy_lapped_relaxed, but avoid writing beyond of legal ++ output. ++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_lapped_safe( ++ unsigned char FAR* out, ++ unsigned dist, ++ unsigned len, ++ unsigned char FAR* limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ if (limit - out < CHUNKCOPY_CHUNK_SIZE * 3) { ++ /* TODO(cavalcantii): try harder to optimise this */ ++ while (len-- > 0) { ++ *out = *(out - dist); ++ out++; ++ } ++ return out; ++ } ++ return chunkcopy_lapped_relaxed(out, dist, len); ++} ++ ++#undef Z_RESTRICT ++ ++#endif /* CHUNKCOPY_H */ +diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast.c +new file mode 100644 +index 0000000..f7f5007 +--- /dev/null ++++ b/contrib/arm/inffast.c +@@ -0,0 +1,307 @@ ++/* inffast.c -- fast decoding ++ * Copyright (C) 1995-2017 Mark Adler ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#include "zutil.h" ++#include "inftrees.h" ++#include "inflate.h" ++#include "inffast.h" ++#include "chunkcopy.h" ++ ++#ifdef ASMINF ++# pragma message("Assembler code may have bugs -- use at your own risk") ++#else ++ ++/* ++ Decode literal, length, and distance codes and write out the resulting ++ literal and match bytes until either not enough input or output is ++ available, an end-of-block is encountered, or a data error is encountered. ++ When large enough input and output buffers are supplied to inflate(), for ++ example, a 16K input buffer and a 64K output buffer, more than 95% of the ++ inflate execution time is spent in this routine. ++ ++ Entry assumptions: ++ ++ state->mode == LEN ++ strm->avail_in >= 6 ++ strm->avail_out >= 258 ++ start >= strm->avail_out ++ state->bits < 8 ++ ++ On return, state->mode is one of: ++ ++ LEN -- ran out of enough output space or enough available input ++ TYPE -- reached end of block code, inflate() to interpret next block ++ BAD -- error in block data ++ ++ Notes: ++ ++ - The maximum input bits used by a length/distance pair is 15 bits for the ++ length code, 5 bits for the length extra, 15 bits for the distance code, ++ and 13 bits for the distance extra. This totals 48 bits, or six bytes. 
++ Therefore if strm->avail_in >= 6, then there is enough input to avoid ++ checking for available input while decoding. ++ ++ - The maximum bytes that a single length/distance pair can output is 258 ++ bytes, which is the maximum length that can be coded. inflate_fast() ++ requires strm->avail_out >= 258 for each loop to avoid checking for ++ output space. ++ */ ++void ZLIB_INTERNAL inflate_fast(strm, start) ++z_streamp strm; ++unsigned start; /* inflate()'s starting value for strm->avail_out */ ++{ ++ struct inflate_state FAR *state; ++ z_const unsigned char FAR *in; /* local strm->next_in */ ++ z_const unsigned char FAR *last; /* have enough input while in < last */ ++ unsigned char FAR *out; /* local strm->next_out */ ++ unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ ++ unsigned char FAR *end; /* while out < end, enough space available */ ++ unsigned char FAR *limit; /* safety limit for chunky copies */ ++#ifdef INFLATE_STRICT ++ unsigned dmax; /* maximum distance from zlib header */ ++#endif ++ unsigned wsize; /* window size or zero if not using window */ ++ unsigned whave; /* valid bytes in the window */ ++ unsigned wnext; /* window write index */ ++ unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */ ++ unsigned long hold; /* local strm->hold */ ++ unsigned bits; /* local strm->bits */ ++ code const FAR *lcode; /* local strm->lencode */ ++ code const FAR *dcode; /* local strm->distcode */ ++ unsigned lmask; /* mask for first level of length codes */ ++ unsigned dmask; /* mask for first level of distance codes */ ++ code here; /* retrieved table entry */ ++ unsigned op; /* code bits, operation, extra bits, or */ ++ /* window position, window bytes to copy */ ++ unsigned len; /* match length, unused bytes */ ++ unsigned dist; /* match distance */ ++ unsigned char FAR *from; /* where to copy match from */ ++ ++ /* copy state to local variables */ ++ state = (struct inflate_state FAR *)strm->state; ++ in = strm->next_in; ++ last = in + (strm->avail_in - 5); ++ out = strm->next_out; ++ beg = out - (start - strm->avail_out); ++ end = out + (strm->avail_out - 257); ++ limit = out + strm->avail_out; ++#ifdef INFLATE_STRICT ++ dmax = state->dmax; ++#endif ++ wsize = state->wsize; ++ whave = state->whave; ++ wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext; ++ window = state->window; ++ hold = state->hold; ++ bits = state->bits; ++ lcode = state->lencode; ++ dcode = state->distcode; ++ lmask = (1U << state->lenbits) - 1; ++ dmask = (1U << state->distbits) - 1; ++ ++ /* decode literals and length/distances until end-of-block or not enough ++ input data or output space */ ++ do { ++ if (bits < 15) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ here = lcode[hold & lmask]; ++ dolen: ++ op = (unsigned)(here.bits); ++ hold >>= op; ++ bits -= op; ++ op = (unsigned)(here.op); ++ if (op == 0) { /* literal */ ++ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? 
++ "inflate: literal '%c'\n" : ++ "inflate: literal 0x%02x\n", here.val)); ++ *out++ = (unsigned char)(here.val); ++ } ++ else if (op & 16) { /* length base */ ++ len = (unsigned)(here.val); ++ op &= 15; /* number of extra bits */ ++ if (op) { ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ len += (unsigned)hold & ((1U << op) - 1); ++ hold >>= op; ++ bits -= op; ++ } ++ Tracevv((stderr, "inflate: length %u\n", len)); ++ if (bits < 15) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ here = dcode[hold & dmask]; ++ dodist: ++ op = (unsigned)(here.bits); ++ hold >>= op; ++ bits -= op; ++ op = (unsigned)(here.op); ++ if (op & 16) { /* distance base */ ++ dist = (unsigned)(here.val); ++ op &= 15; /* number of extra bits */ ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ } ++ dist += (unsigned)hold & ((1U << op) - 1); ++#ifdef INFLATE_STRICT ++ if (dist > dmax) { ++ strm->msg = (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#endif ++ hold >>= op; ++ bits -= op; ++ Tracevv((stderr, "inflate: distance %u\n", dist)); ++ op = (unsigned)(out - beg); /* max distance in output */ ++ if (dist > op) { /* see if copy from window */ ++ op = dist - op; /* distance back in window */ ++ if (op > whave) { ++ if (state->sane) { ++ strm->msg = ++ (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR ++ if (len <= op - whave) { ++ do { ++ *out++ = 0; ++ } while (--len); ++ continue; ++ } ++ len -= op - whave; ++ do { ++ *out++ = 0; ++ } while (--op > whave); ++ if (op == 0) { ++ from = out - dist; ++ do { ++ *out++ = *from++; ++ } while (--len); ++ continue; ++ } ++#endif ++ } ++ from = window; ++ if (wnext >= op) { /* contiguous in window */ ++ from += wnext - op; ++ } ++ else { /* wrap around window */ ++ op -= wnext; ++ from += wsize - op; ++ if (op < len) { /* some from end of window */ ++ len -= op; ++ out = chunkcopy_safe(out, from, op, limit); ++ from = window; /* more from start of window */ ++ op = wnext; ++ /* This (rare) case can create a situation where ++ the first chunkcopy below must be checked. ++ */ ++ } ++ } ++ if (op < len) { /* still need some from output */ ++ out = chunkcopy_safe(out, from, op, limit); ++ len -= op; ++ /* When dist is small the amount of data that can be ++ copied from the window is also small, and progress ++ towards the dangerous end of the output buffer is ++ also small. This means that for trivial memsets and ++ for chunkunroll_relaxed() a safety check is ++ unnecessary. However, these conditions may not be ++ entered at all, and in that case it's possible that ++ the main copy is near the end. ++ */ ++ out = chunkunroll_relaxed(out, &dist, &len); ++ out = chunkcopy_safe(out, out - dist, len, limit); ++ } else { ++ /* from points to window, so there is no risk of ++ overlapping pointers requiring memset-like behaviour ++ */ ++ out = chunkcopy_safe(out, from, len, limit); ++ } ++ } ++ else { ++ /* Whole reference is in range of current output. No ++ range checks are necessary because we start with room ++ for at least 258 bytes of output, so unroll and roundoff ++ operations can write beyond `out+len` so long as they ++ stay within 258 bytes of `out`. 
++ */ ++ out = chunkcopy_lapped_relaxed(out, dist, len); ++ } ++ } ++ else if ((op & 64) == 0) { /* 2nd level distance code */ ++ here = dcode[here.val + (hold & ((1U << op) - 1))]; ++ goto dodist; ++ } ++ else { ++ strm->msg = (char *)"invalid distance code"; ++ state->mode = BAD; ++ break; ++ } ++ } ++ else if ((op & 64) == 0) { /* 2nd level length code */ ++ here = lcode[here.val + (hold & ((1U << op) - 1))]; ++ goto dolen; ++ } ++ else if (op & 32) { /* end-of-block */ ++ Tracevv((stderr, "inflate: end of block\n")); ++ state->mode = TYPE; ++ break; ++ } ++ else { ++ strm->msg = (char *)"invalid literal/length code"; ++ state->mode = BAD; ++ break; ++ } ++ } while (in < last && out < end); ++ ++ /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ ++ len = bits >> 3; ++ in -= len; ++ bits -= len << 3; ++ hold &= (1U << bits) - 1; ++ ++ /* update state and return */ ++ strm->next_in = in; ++ strm->next_out = out; ++ strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); ++ strm->avail_out = (unsigned)(out < end ? ++ 257 + (end - out) : 257 - (out - end)); ++ state->hold = hold; ++ state->bits = bits; ++ return; ++} ++ ++/* ++ inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): ++ - Using bit fields for code structure ++ - Different op definition to avoid & for extra bits (do & for table bits) ++ - Three separate decoding do-loops for direct, window, and wnext == 0 ++ - Special case for distance > 1 copies to do overlapped load and store copy ++ - Explicit branch predictions (based on measured branch probabilities) ++ - Deferring match copy and interspersed it with decoding subsequent codes ++ - Swapping literal/length else ++ - Swapping window/direct else ++ - Larger unrolled copy loops (three is about right) ++ - Moving len -= 3 statement into middle of loop ++ */ ++ ++#endif /* !ASMINF */ +diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c +new file mode 100644 +index 0000000..23e95f1 +--- /dev/null ++++ b/contrib/arm/inflate.c +@@ -0,0 +1,1572 @@ ++/* inflate.c -- zlib decompression ++ * Copyright (C) 1995-2016 Mark Adler ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* ++ * Change history: ++ * ++ * 1.2.beta0 24 Nov 2002 ++ * - First version -- complete rewrite of inflate to simplify code, avoid ++ * creation of window when not needed, minimize use of window when it is ++ * needed, make inffast.c even faster, implement gzip decoding, and to ++ * improve code readability and style over the previous zlib inflate code ++ * ++ * 1.2.beta1 25 Nov 2002 ++ * - Use pointers for available input and output checking in inffast.c ++ * - Remove input and output counters in inffast.c ++ * - Change inffast.c entry and loop from avail_in >= 7 to >= 6 ++ * - Remove unnecessary second byte pull from length extra in inffast.c ++ * - Unroll direct copy to three copies per loop in inffast.c ++ * ++ * 1.2.beta2 4 Dec 2002 ++ * - Change external routine names to reduce potential conflicts ++ * - Correct filename to inffixed.h for fixed tables in inflate.c ++ * - Make hbuf[] unsigned char to match parameter type in inflate.c ++ * - Change strm->next_out[-state->offset] to *(strm->next_out - state->offset) ++ * to avoid negation problem on Alphas (64 bit) in inflate.c ++ * ++ * 1.2.beta3 22 Dec 2002 ++ * - Add comments on state->bits assertion in inffast.c ++ * - Add comments on op field in inftrees.h ++ * - Fix bug in reuse of allocated window after inflateReset() ++ * - Remove bit fields--back 
to byte structure for speed ++ * - Remove distance extra == 0 check in inflate_fast()--only helps for lengths ++ * - Change post-increments to pre-increments in inflate_fast(), PPC biased? ++ * - Add compile time option, POSTINC, to use post-increments instead (Intel?) ++ * - Make MATCH copy in inflate() much faster for when inflate_fast() not used ++ * - Use local copies of stream next and avail values, as well as local bit ++ * buffer and bit count in inflate()--for speed when inflate_fast() not used ++ * ++ * 1.2.beta4 1 Jan 2003 ++ * - Split ptr - 257 statements in inflate_table() to avoid compiler warnings ++ * - Move a comment on output buffer sizes from inffast.c to inflate.c ++ * - Add comments in inffast.c to introduce the inflate_fast() routine ++ * - Rearrange window copies in inflate_fast() for speed and simplification ++ * - Unroll last copy for window match in inflate_fast() ++ * - Use local copies of window variables in inflate_fast() for speed ++ * - Pull out common wnext == 0 case for speed in inflate_fast() ++ * - Make op and len in inflate_fast() unsigned for consistency ++ * - Add FAR to lcode and dcode declarations in inflate_fast() ++ * - Simplified bad distance check in inflate_fast() ++ * - Added inflateBackInit(), inflateBack(), and inflateBackEnd() in new ++ * source file infback.c to provide a call-back interface to inflate for ++ * programs like gzip and unzip -- uses window as output buffer to avoid ++ * window copying ++ * ++ * 1.2.beta5 1 Jan 2003 ++ * - Improved inflateBack() interface to allow the caller to provide initial ++ * input in strm. ++ * - Fixed stored blocks bug in inflateBack() ++ * ++ * 1.2.beta6 4 Jan 2003 ++ * - Added comments in inffast.c on effectiveness of POSTINC ++ * - Typecasting all around to reduce compiler warnings ++ * - Changed loops from while (1) or do {} while (1) to for (;;), again to ++ * make compilers happy ++ * - Changed type of window in inflateBackInit() to unsigned char * ++ * ++ * 1.2.beta7 27 Jan 2003 ++ * - Changed many types to unsigned or unsigned short to avoid warnings ++ * - Added inflateCopy() function ++ * ++ * 1.2.0 9 Mar 2003 ++ * - Changed inflateBack() interface to provide separate opaque descriptors ++ * for the in() and out() functions ++ * - Changed inflateBack() argument and in_func typedef to swap the length ++ * and buffer address return values for the input function ++ * - Check next_in and next_out for Z_NULL on entry to inflate() ++ * ++ * The history for versions after 1.2.0 are in ChangeLog in zlib distribution. 
++ */ ++ ++#include "zutil.h" ++#include "inftrees.h" ++#include "inflate.h" ++#include "inffast.h" ++#include "contrib/arm/chunkcopy.h" ++ ++#ifdef MAKEFIXED ++# ifndef BUILDFIXED ++# define BUILDFIXED ++# endif ++#endif ++ ++/* function prototypes */ ++local int inflateStateCheck OF((z_streamp strm)); ++local void fixedtables OF((struct inflate_state FAR *state)); ++local int updatewindow OF((z_streamp strm, const unsigned char FAR *end, ++ unsigned copy)); ++#ifdef BUILDFIXED ++ void makefixed OF((void)); ++#endif ++local unsigned syncsearch OF((unsigned FAR *have, const unsigned char FAR *buf, ++ unsigned len)); ++ ++local int inflateStateCheck(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ if (strm == Z_NULL || ++ strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) ++ return 1; ++ state = (struct inflate_state FAR *)strm->state; ++ if (state == Z_NULL || state->strm != strm || ++ state->mode < HEAD || state->mode > SYNC) ++ return 1; ++ return 0; ++} ++ ++int ZEXPORT inflateResetKeep(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ strm->total_in = strm->total_out = state->total = 0; ++ strm->msg = Z_NULL; ++ if (state->wrap) /* to support ill-conceived Java test suite */ ++ strm->adler = state->wrap & 1; ++ state->mode = HEAD; ++ state->last = 0; ++ state->havedict = 0; ++ state->dmax = 32768U; ++ state->head = Z_NULL; ++ state->hold = 0; ++ state->bits = 0; ++ state->lencode = state->distcode = state->next = state->codes; ++ state->sane = 1; ++ state->back = -1; ++ Tracev((stderr, "inflate: reset\n")); ++ return Z_OK; ++} ++ ++int ZEXPORT inflateReset(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ state->wsize = 0; ++ state->whave = 0; ++ state->wnext = 0; ++ return inflateResetKeep(strm); ++} ++ ++int ZEXPORT inflateReset2(strm, windowBits) ++z_streamp strm; ++int windowBits; ++{ ++ int wrap; ++ struct inflate_state FAR *state; ++ ++ /* get the state */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ ++ /* extract wrap request from windowBits parameter */ ++ if (windowBits < 0) { ++ wrap = 0; ++ windowBits = -windowBits; ++ } ++ else { ++ wrap = (windowBits >> 4) + 5; ++#ifdef GUNZIP ++ if (windowBits < 48) ++ windowBits &= 15; ++#endif ++ } ++ ++ /* set number of window bits, free window if different */ ++ if (windowBits && (windowBits < 8 || windowBits > 15)) ++ return Z_STREAM_ERROR; ++ if (state->window != Z_NULL && state->wbits != (unsigned)windowBits) { ++ ZFREE(strm, state->window); ++ state->window = Z_NULL; ++ } ++ ++ /* update state and reset the rest of it */ ++ state->wrap = wrap; ++ state->wbits = (unsigned)windowBits; ++ return inflateReset(strm); ++} ++ ++int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size) ++z_streamp strm; ++int windowBits; ++const char *version; ++int stream_size; ++{ ++ int ret; ++ struct inflate_state FAR *state; ++ ++ if (version == Z_NULL || version[0] != ZLIB_VERSION[0] || ++ stream_size != (int)(sizeof(z_stream))) ++ return Z_VERSION_ERROR; ++ if (strm == Z_NULL) return Z_STREAM_ERROR; ++ strm->msg = Z_NULL; /* in case we return an error */ ++ if (strm->zalloc == (alloc_func)0) { ++#ifdef Z_SOLO ++ return Z_STREAM_ERROR; ++#else ++ strm->zalloc = zcalloc; ++ strm->opaque = (voidpf)0; 
++#endif ++ } ++ if (strm->zfree == (free_func)0) ++#ifdef Z_SOLO ++ return Z_STREAM_ERROR; ++#else ++ strm->zfree = zcfree; ++#endif ++ state = (struct inflate_state FAR *) ++ ZALLOC(strm, 1, sizeof(struct inflate_state)); ++ if (state == Z_NULL) return Z_MEM_ERROR; ++ Tracev((stderr, "inflate: allocated\n")); ++ strm->state = (struct internal_state FAR *)state; ++ state->strm = strm; ++ state->window = Z_NULL; ++ state->mode = HEAD; /* to pass state test in inflateReset2() */ ++ state->check = 1L; /* 1L is the result of adler32() zero length data */ ++ ret = inflateReset2(strm, windowBits); ++ if (ret != Z_OK) { ++ ZFREE(strm, state); ++ strm->state = Z_NULL; ++ } ++ return ret; ++} ++ ++int ZEXPORT inflateInit_(strm, version, stream_size) ++z_streamp strm; ++const char *version; ++int stream_size; ++{ ++ return inflateInit2_(strm, DEF_WBITS, version, stream_size); ++} ++ ++int ZEXPORT inflatePrime(strm, bits, value) ++z_streamp strm; ++int bits; ++int value; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (bits < 0) { ++ state->hold = 0; ++ state->bits = 0; ++ return Z_OK; ++ } ++ if (bits > 16 || state->bits + (uInt)bits > 32) return Z_STREAM_ERROR; ++ value &= (1L << bits) - 1; ++ state->hold += (unsigned)value << state->bits; ++ state->bits += (uInt)bits; ++ return Z_OK; ++} ++ ++/* ++ Return state with length and distance decoding tables and index sizes set to ++ fixed code decoding. Normally this returns fixed tables from inffixed.h. ++ If BUILDFIXED is defined, then instead this routine builds the tables the ++ first time it's called, and returns those tables the first time and ++ thereafter. This reduces the size of the code by about 2K bytes, in ++ exchange for a little execution time. However, BUILDFIXED should not be ++ used for threaded applications, since the rewriting of the tables and virgin ++ may not be thread-safe. ++ */ ++local void fixedtables(state) ++struct inflate_state FAR *state; ++{ ++#ifdef BUILDFIXED ++ static int virgin = 1; ++ static code *lenfix, *distfix; ++ static code fixed[544]; ++ ++ /* build fixed huffman tables if first call (may not be thread safe) */ ++ if (virgin) { ++ unsigned sym, bits; ++ static code *next; ++ ++ /* literal/length table */ ++ sym = 0; ++ while (sym < 144) state->lens[sym++] = 8; ++ while (sym < 256) state->lens[sym++] = 9; ++ while (sym < 280) state->lens[sym++] = 7; ++ while (sym < 288) state->lens[sym++] = 8; ++ next = fixed; ++ lenfix = next; ++ bits = 9; ++ inflate_table(LENS, state->lens, 288, &(next), &(bits), state->work); ++ ++ /* distance table */ ++ sym = 0; ++ while (sym < 32) state->lens[sym++] = 5; ++ distfix = next; ++ bits = 5; ++ inflate_table(DISTS, state->lens, 32, &(next), &(bits), state->work); ++ ++ /* do this just once */ ++ virgin = 0; ++ } ++#else /* !BUILDFIXED */ ++# include "inffixed.h" ++#endif /* BUILDFIXED */ ++ state->lencode = lenfix; ++ state->lenbits = 9; ++ state->distcode = distfix; ++ state->distbits = 5; ++} ++ ++#ifdef MAKEFIXED ++#include ++ ++/* ++ Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also ++ defines BUILDFIXED, so the tables are built on the fly. makefixed() writes ++ those tables to stdout, which would be piped to inffixed.h. 
A small program ++ can simply call makefixed to do this: ++ ++ void makefixed(void); ++ ++ int main(void) ++ { ++ makefixed(); ++ return 0; ++ } ++ ++ Then that can be linked with zlib built with MAKEFIXED defined and run: ++ ++ a.out > inffixed.h ++ */ ++void makefixed() ++{ ++ unsigned low, size; ++ struct inflate_state state; ++ ++ fixedtables(&state); ++ puts(" /* inffixed.h -- table for decoding fixed codes"); ++ puts(" * Generated automatically by makefixed()."); ++ puts(" */"); ++ puts(""); ++ puts(" /* WARNING: this file should *not* be used by applications."); ++ puts(" It is part of the implementation of this library and is"); ++ puts(" subject to change. Applications should only use zlib.h."); ++ puts(" */"); ++ puts(""); ++ size = 1U << 9; ++ printf(" static const code lenfix[%u] = {", size); ++ low = 0; ++ for (;;) { ++ if ((low % 7) == 0) printf("\n "); ++ printf("{%u,%u,%d}", (low & 127) == 99 ? 64 : state.lencode[low].op, ++ state.lencode[low].bits, state.lencode[low].val); ++ if (++low == size) break; ++ putchar(','); ++ } ++ puts("\n };"); ++ size = 1U << 5; ++ printf("\n static const code distfix[%u] = {", size); ++ low = 0; ++ for (;;) { ++ if ((low % 6) == 0) printf("\n "); ++ printf("{%u,%u,%d}", state.distcode[low].op, state.distcode[low].bits, ++ state.distcode[low].val); ++ if (++low == size) break; ++ putchar(','); ++ } ++ puts("\n };"); ++} ++#endif /* MAKEFIXED */ ++ ++/* ++ Update the window with the last wsize (normally 32K) bytes written before ++ returning. If window does not exist yet, create it. This is only called ++ when a window is already in use, or when output has been written during this ++ inflate call, but the end of the deflate stream has not been reached yet. ++ It is also called to create a window for dictionary data when a dictionary ++ is loaded. ++ ++ Providing output buffers larger than 32K to inflate() should provide a speed ++ advantage, since only the last 32K of output is copied to the sliding window ++ upon return from inflate(), and since all distances after the first 32K of ++ output will fall in the output data, making match copies simpler and faster. ++ The advantage may be dependent on the size of the processor's data caches. ++ */ ++local int updatewindow(strm, end, copy) ++z_streamp strm; ++const Bytef *end; ++unsigned copy; ++{ ++ struct inflate_state FAR *state; ++ unsigned dist; ++ ++ state = (struct inflate_state FAR *)strm->state; ++ ++ /* if it hasn't been done already, allocate space for the window */ ++ if (state->window == Z_NULL) { ++ unsigned wsize = 1U << state->wbits; ++ state->window = (unsigned char FAR *) ++ ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE, ++ sizeof(unsigned char)); ++ if (state->window == Z_NULL) return 1; ++#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED ++ /* Copies from the overflow portion of this buffer are undefined and ++ may cause analysis tools to raise a warning if we don't initialize ++ it. However, this undefined data overwrites other undefined data ++ and is subsequently either overwritten or left deliberately ++ undefined at the end of decode; so there's really no point. 
++ */ ++ memset(state->window + wsize, 0, CHUNKCOPY_CHUNK_SIZE); ++#endif ++ } ++ ++ /* if window not in use yet, initialize */ ++ if (state->wsize == 0) { ++ state->wsize = 1U << state->wbits; ++ state->wnext = 0; ++ state->whave = 0; ++ } ++ ++ /* copy state->wsize or less output bytes into the circular window */ ++ if (copy >= state->wsize) { ++ zmemcpy(state->window, end - state->wsize, state->wsize); ++ state->wnext = 0; ++ state->whave = state->wsize; ++ } ++ else { ++ dist = state->wsize - state->wnext; ++ if (dist > copy) dist = copy; ++ zmemcpy(state->window + state->wnext, end - copy, dist); ++ copy -= dist; ++ if (copy) { ++ zmemcpy(state->window, end - copy, copy); ++ state->wnext = copy; ++ state->whave = state->wsize; ++ } ++ else { ++ state->wnext += dist; ++ if (state->wnext == state->wsize) state->wnext = 0; ++ if (state->whave < state->wsize) state->whave += dist; ++ } ++ } ++ return 0; ++} ++ ++/* Macros for inflate(): */ ++ ++/* check function to use adler32() for zlib or crc32() for gzip */ ++#ifdef GUNZIP ++# define UPDATE(check, buf, len) \ ++ (state->flags ? crc32(check, buf, len) : adler32(check, buf, len)) ++#else ++# define UPDATE(check, buf, len) adler32(check, buf, len) ++#endif ++ ++/* check macros for header crc */ ++#ifdef GUNZIP ++# define CRC2(check, word) \ ++ do { \ ++ hbuf[0] = (unsigned char)(word); \ ++ hbuf[1] = (unsigned char)((word) >> 8); \ ++ check = crc32(check, hbuf, 2); \ ++ } while (0) ++ ++# define CRC4(check, word) \ ++ do { \ ++ hbuf[0] = (unsigned char)(word); \ ++ hbuf[1] = (unsigned char)((word) >> 8); \ ++ hbuf[2] = (unsigned char)((word) >> 16); \ ++ hbuf[3] = (unsigned char)((word) >> 24); \ ++ check = crc32(check, hbuf, 4); \ ++ } while (0) ++#endif ++ ++/* Load registers with state in inflate() for speed */ ++#define LOAD() \ ++ do { \ ++ put = strm->next_out; \ ++ left = strm->avail_out; \ ++ next = strm->next_in; \ ++ have = strm->avail_in; \ ++ hold = state->hold; \ ++ bits = state->bits; \ ++ } while (0) ++ ++/* Restore state from registers in inflate() */ ++#define RESTORE() \ ++ do { \ ++ strm->next_out = put; \ ++ strm->avail_out = left; \ ++ strm->next_in = next; \ ++ strm->avail_in = have; \ ++ state->hold = hold; \ ++ state->bits = bits; \ ++ } while (0) ++ ++/* Clear the input bit accumulator */ ++#define INITBITS() \ ++ do { \ ++ hold = 0; \ ++ bits = 0; \ ++ } while (0) ++ ++/* Get a byte of input into the bit accumulator, or return from inflate() ++ if there is no input available. */ ++#define PULLBYTE() \ ++ do { \ ++ if (have == 0) goto inf_leave; \ ++ have--; \ ++ hold += (unsigned long)(*next++) << bits; \ ++ bits += 8; \ ++ } while (0) ++ ++/* Assure that there are at least n bits in the bit accumulator. If there is ++ not enough available input to do that, then return from inflate(). */ ++#define NEEDBITS(n) \ ++ do { \ ++ while (bits < (unsigned)(n)) \ ++ PULLBYTE(); \ ++ } while (0) ++ ++/* Return the low n bits of the bit accumulator (n < 16) */ ++#define BITS(n) \ ++ ((unsigned)hold & ((1U << (n)) - 1)) ++ ++/* Remove n bits from the bit accumulator */ ++#define DROPBITS(n) \ ++ do { \ ++ hold >>= (n); \ ++ bits -= (unsigned)(n); \ ++ } while (0) ++ ++/* Remove zero to seven bits as needed to go to a byte boundary */ ++#define BYTEBITS() \ ++ do { \ ++ hold >>= bits & 7; \ ++ bits -= bits & 7; \ ++ } while (0) ++ ++/* ++ inflate() uses a state machine to process as much input data and generate as ++ much output data as possible before returning. 
The state machine is ++ structured roughly as follows: ++ ++ for (;;) switch (state) { ++ ... ++ case STATEn: ++ if (not enough input data or output space to make progress) ++ return; ++ ... make progress ... ++ state = STATEm; ++ break; ++ ... ++ } ++ ++ so when inflate() is called again, the same case is attempted again, and ++ if the appropriate resources are provided, the machine proceeds to the ++ next state. The NEEDBITS() macro is usually the way the state evaluates ++ whether it can proceed or should return. NEEDBITS() does the return if ++ the requested bits are not available. The typical use of the BITS macros ++ is: ++ ++ NEEDBITS(n); ++ ... do something with BITS(n) ... ++ DROPBITS(n); ++ ++ where NEEDBITS(n) either returns from inflate() if there isn't enough ++ input left to load n bits into the accumulator, or it continues. BITS(n) ++ gives the low n bits in the accumulator. When done, DROPBITS(n) drops ++ the low n bits off the accumulator. INITBITS() clears the accumulator ++ and sets the number of available bits to zero. BYTEBITS() discards just ++ enough bits to put the accumulator on a byte boundary. After BYTEBITS() ++ and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. ++ ++ NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return ++ if there is no input available. The decoding of variable length codes uses ++ PULLBYTE() directly in order to pull just enough bytes to decode the next ++ code, and no more. ++ ++ Some states loop until they get enough input, making sure that enough ++ state information is maintained to continue the loop where it left off ++ if NEEDBITS() returns in the loop. For example, want, need, and keep ++ would all have to actually be part of the saved state in case NEEDBITS() ++ returns: ++ ++ case STATEw: ++ while (want < need) { ++ NEEDBITS(n); ++ keep[want++] = BITS(n); ++ DROPBITS(n); ++ } ++ state = STATEx; ++ case STATEx: ++ ++ As shown above, if the next state is also the next case, then the break ++ is omitted. ++ ++ A state may also return if there is not enough output space available to ++ complete that state. Those states are copying stored data, writing a ++ literal byte, and copying a matching string. ++ ++ When returning, a "goto inf_leave" is used to update the total counters, ++ update the check value, and determine whether any progress has been made ++ during that inflate() call in order to return the proper return code. ++ Progress is defined as a change in either strm->avail_in or strm->avail_out. ++ When there is a window, goto inf_leave will update the window with the last ++ output written. If a goto inf_leave occurs in the middle of decompression ++ and there is no window currently, goto inf_leave will create one and copy ++ output to the window for the next call of inflate(). ++ ++ In this implementation, the flush parameter of inflate() only affects the ++ return code (per zlib.h). inflate() always writes as much as possible to ++ strm->next_out, given the space available and the provided input--the effect ++ documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers ++ the allocation of and copying into a sliding window until necessary, which ++ provides the effect documented in zlib.h for Z_FINISH when the entire input ++ stream available. So the only thing the flush parameter actually does is: ++ when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it ++ will return Z_BUF_ERROR if it has not reached the end of the stream. 
++ */ ++ ++int ZEXPORT inflate(strm, flush) ++z_streamp strm; ++int flush; ++{ ++ struct inflate_state FAR *state; ++ z_const unsigned char FAR *next; /* next input */ ++ unsigned char FAR *put; /* next output */ ++ unsigned have, left; /* available input and output */ ++ unsigned long hold; /* bit buffer */ ++ unsigned bits; /* bits in bit buffer */ ++ unsigned in, out; /* save starting available input and output */ ++ unsigned copy; /* number of stored or match bytes to copy */ ++ unsigned char FAR *from; /* where to copy match bytes from */ ++ code here; /* current decoding table entry */ ++ code last; /* parent table entry */ ++ unsigned len; /* length to copy for repeats, bits to drop */ ++ int ret; /* return code */ ++#ifdef GUNZIP ++ unsigned char hbuf[4]; /* buffer for gzip header crc calculation */ ++#endif ++ static const unsigned short order[19] = /* permutation of code lengths */ ++ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; ++ ++ if (inflateStateCheck(strm) || strm->next_out == Z_NULL || ++ (strm->next_in == Z_NULL && strm->avail_in != 0)) ++ return Z_STREAM_ERROR; ++ ++ state = (struct inflate_state FAR *)strm->state; ++ if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ ++ LOAD(); ++ in = have; ++ out = left; ++ ret = Z_OK; ++ for (;;) ++ switch (state->mode) { ++ case HEAD: ++ if (state->wrap == 0) { ++ state->mode = TYPEDO; ++ break; ++ } ++ NEEDBITS(16); ++#ifdef GUNZIP ++ if ((state->wrap & 2) && hold == 0x8b1f) { /* gzip header */ ++ if (state->wbits == 0) ++ state->wbits = 15; ++ state->check = crc32(0L, Z_NULL, 0); ++ CRC2(state->check, hold); ++ INITBITS(); ++ state->mode = FLAGS; ++ break; ++ } ++ state->flags = 0; /* expect zlib header */ ++ if (state->head != Z_NULL) ++ state->head->done = -1; ++ if (!(state->wrap & 1) || /* check if zlib header allowed */ ++#else ++ if ( ++#endif ++ ((BITS(8) << 8) + (hold >> 8)) % 31) { ++ strm->msg = (char *)"incorrect header check"; ++ state->mode = BAD; ++ break; ++ } ++ if (BITS(4) != Z_DEFLATED) { ++ strm->msg = (char *)"unknown compression method"; ++ state->mode = BAD; ++ break; ++ } ++ DROPBITS(4); ++ len = BITS(4) + 8; ++ if (state->wbits == 0) ++ state->wbits = len; ++ if (len > 15 || len > state->wbits) { ++ strm->msg = (char *)"invalid window size"; ++ state->mode = BAD; ++ break; ++ } ++ state->dmax = 1U << len; ++ Tracev((stderr, "inflate: zlib header ok\n")); ++ strm->adler = state->check = adler32(0L, Z_NULL, 0); ++ state->mode = hold & 0x200 ? 
DICTID : TYPE; ++ INITBITS(); ++ break; ++#ifdef GUNZIP ++ case FLAGS: ++ NEEDBITS(16); ++ state->flags = (int)(hold); ++ if ((state->flags & 0xff) != Z_DEFLATED) { ++ strm->msg = (char *)"unknown compression method"; ++ state->mode = BAD; ++ break; ++ } ++ if (state->flags & 0xe000) { ++ strm->msg = (char *)"unknown header flags set"; ++ state->mode = BAD; ++ break; ++ } ++ if (state->head != Z_NULL) ++ state->head->text = (int)((hold >> 8) & 1); ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ CRC2(state->check, hold); ++ INITBITS(); ++ state->mode = TIME; ++ case TIME: ++ NEEDBITS(32); ++ if (state->head != Z_NULL) ++ state->head->time = hold; ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ CRC4(state->check, hold); ++ INITBITS(); ++ state->mode = OS; ++ case OS: ++ NEEDBITS(16); ++ if (state->head != Z_NULL) { ++ state->head->xflags = (int)(hold & 0xff); ++ state->head->os = (int)(hold >> 8); ++ } ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ CRC2(state->check, hold); ++ INITBITS(); ++ state->mode = EXLEN; ++ case EXLEN: ++ if (state->flags & 0x0400) { ++ NEEDBITS(16); ++ state->length = (unsigned)(hold); ++ if (state->head != Z_NULL) ++ state->head->extra_len = (unsigned)hold; ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ CRC2(state->check, hold); ++ INITBITS(); ++ } ++ else if (state->head != Z_NULL) ++ state->head->extra = Z_NULL; ++ state->mode = EXTRA; ++ case EXTRA: ++ if (state->flags & 0x0400) { ++ copy = state->length; ++ if (copy > have) copy = have; ++ if (copy) { ++ if (state->head != Z_NULL && ++ state->head->extra != Z_NULL) { ++ len = state->head->extra_len - state->length; ++ zmemcpy(state->head->extra + len, next, ++ len + copy > state->head->extra_max ? ++ state->head->extra_max - len : copy); ++ } ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ state->check = crc32(state->check, next, copy); ++ have -= copy; ++ next += copy; ++ state->length -= copy; ++ } ++ if (state->length) goto inf_leave; ++ } ++ state->length = 0; ++ state->mode = NAME; ++ case NAME: ++ if (state->flags & 0x0800) { ++ if (have == 0) goto inf_leave; ++ copy = 0; ++ do { ++ len = (unsigned)(next[copy++]); ++ if (state->head != Z_NULL && ++ state->head->name != Z_NULL && ++ state->length < state->head->name_max) ++ state->head->name[state->length++] = (Bytef)len; ++ } while (len && copy < have); ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ state->check = crc32(state->check, next, copy); ++ have -= copy; ++ next += copy; ++ if (len) goto inf_leave; ++ } ++ else if (state->head != Z_NULL) ++ state->head->name = Z_NULL; ++ state->length = 0; ++ state->mode = COMMENT; ++ case COMMENT: ++ if (state->flags & 0x1000) { ++ if (have == 0) goto inf_leave; ++ copy = 0; ++ do { ++ len = (unsigned)(next[copy++]); ++ if (state->head != Z_NULL && ++ state->head->comment != Z_NULL && ++ state->length < state->head->comm_max) ++ state->head->comment[state->length++] = (Bytef)len; ++ } while (len && copy < have); ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ state->check = crc32(state->check, next, copy); ++ have -= copy; ++ next += copy; ++ if (len) goto inf_leave; ++ } ++ else if (state->head != Z_NULL) ++ state->head->comment = Z_NULL; ++ state->mode = HCRC; ++ case HCRC: ++ if (state->flags & 0x0200) { ++ NEEDBITS(16); ++ if ((state->wrap & 4) && hold != (state->check & 0xffff)) { ++ strm->msg = (char *)"header crc mismatch"; ++ state->mode = BAD; ++ break; ++ } ++ INITBITS(); ++ } ++ if (state->head != Z_NULL) { ++ state->head->hcrc = 
(int)((state->flags >> 9) & 1); ++ state->head->done = 1; ++ } ++ strm->adler = state->check = crc32(0L, Z_NULL, 0); ++ state->mode = TYPE; ++ break; ++#endif ++ case DICTID: ++ NEEDBITS(32); ++ strm->adler = state->check = ZSWAP32(hold); ++ INITBITS(); ++ state->mode = DICT; ++ case DICT: ++ if (state->havedict == 0) { ++ RESTORE(); ++ return Z_NEED_DICT; ++ } ++ strm->adler = state->check = adler32(0L, Z_NULL, 0); ++ state->mode = TYPE; ++ case TYPE: ++ if (flush == Z_BLOCK || flush == Z_TREES) goto inf_leave; ++ case TYPEDO: ++ if (state->last) { ++ BYTEBITS(); ++ state->mode = CHECK; ++ break; ++ } ++ NEEDBITS(3); ++ state->last = BITS(1); ++ DROPBITS(1); ++ switch (BITS(2)) { ++ case 0: /* stored block */ ++ Tracev((stderr, "inflate: stored block%s\n", ++ state->last ? " (last)" : "")); ++ state->mode = STORED; ++ break; ++ case 1: /* fixed block */ ++ fixedtables(state); ++ Tracev((stderr, "inflate: fixed codes block%s\n", ++ state->last ? " (last)" : "")); ++ state->mode = LEN_; /* decode codes */ ++ if (flush == Z_TREES) { ++ DROPBITS(2); ++ goto inf_leave; ++ } ++ break; ++ case 2: /* dynamic block */ ++ Tracev((stderr, "inflate: dynamic codes block%s\n", ++ state->last ? " (last)" : "")); ++ state->mode = TABLE; ++ break; ++ case 3: ++ strm->msg = (char *)"invalid block type"; ++ state->mode = BAD; ++ } ++ DROPBITS(2); ++ break; ++ case STORED: ++ BYTEBITS(); /* go to byte boundary */ ++ NEEDBITS(32); ++ if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { ++ strm->msg = (char *)"invalid stored block lengths"; ++ state->mode = BAD; ++ break; ++ } ++ state->length = (unsigned)hold & 0xffff; ++ Tracev((stderr, "inflate: stored length %u\n", ++ state->length)); ++ INITBITS(); ++ state->mode = COPY_; ++ if (flush == Z_TREES) goto inf_leave; ++ case COPY_: ++ state->mode = COPY; ++ case COPY: ++ copy = state->length; ++ if (copy) { ++ if (copy > have) copy = have; ++ if (copy > left) copy = left; ++ if (copy == 0) goto inf_leave; ++ zmemcpy(put, next, copy); ++ have -= copy; ++ next += copy; ++ left -= copy; ++ put += copy; ++ state->length -= copy; ++ break; ++ } ++ Tracev((stderr, "inflate: stored end\n")); ++ state->mode = TYPE; ++ break; ++ case TABLE: ++ NEEDBITS(14); ++ state->nlen = BITS(5) + 257; ++ DROPBITS(5); ++ state->ndist = BITS(5) + 1; ++ DROPBITS(5); ++ state->ncode = BITS(4) + 4; ++ DROPBITS(4); ++#ifndef PKZIP_BUG_WORKAROUND ++ if (state->nlen > 286 || state->ndist > 30) { ++ strm->msg = (char *)"too many length or distance symbols"; ++ state->mode = BAD; ++ break; ++ } ++#endif ++ Tracev((stderr, "inflate: table sizes ok\n")); ++ state->have = 0; ++ state->mode = LENLENS; ++ case LENLENS: ++ while (state->have < state->ncode) { ++ NEEDBITS(3); ++ state->lens[order[state->have++]] = (unsigned short)BITS(3); ++ DROPBITS(3); ++ } ++ while (state->have < 19) ++ state->lens[order[state->have++]] = 0; ++ state->next = state->codes; ++ state->lencode = (const code FAR *)(state->next); ++ state->lenbits = 7; ++ ret = inflate_table(CODES, state->lens, 19, &(state->next), ++ &(state->lenbits), state->work); ++ if (ret) { ++ strm->msg = (char *)"invalid code lengths set"; ++ state->mode = BAD; ++ break; ++ } ++ Tracev((stderr, "inflate: code lengths ok\n")); ++ state->have = 0; ++ state->mode = CODELENS; ++ case CODELENS: ++ while (state->have < state->nlen + state->ndist) { ++ for (;;) { ++ here = state->lencode[BITS(state->lenbits)]; ++ if ((unsigned)(here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ if (here.val < 16) { ++ DROPBITS(here.bits); ++ state->lens[state->have++] = 
here.val; ++ } ++ else { ++ if (here.val == 16) { ++ NEEDBITS(here.bits + 2); ++ DROPBITS(here.bits); ++ if (state->have == 0) { ++ strm->msg = (char *)"invalid bit length repeat"; ++ state->mode = BAD; ++ break; ++ } ++ len = state->lens[state->have - 1]; ++ copy = 3 + BITS(2); ++ DROPBITS(2); ++ } ++ else if (here.val == 17) { ++ NEEDBITS(here.bits + 3); ++ DROPBITS(here.bits); ++ len = 0; ++ copy = 3 + BITS(3); ++ DROPBITS(3); ++ } ++ else { ++ NEEDBITS(here.bits + 7); ++ DROPBITS(here.bits); ++ len = 0; ++ copy = 11 + BITS(7); ++ DROPBITS(7); ++ } ++ if (state->have + copy > state->nlen + state->ndist) { ++ strm->msg = (char *)"invalid bit length repeat"; ++ state->mode = BAD; ++ break; ++ } ++ while (copy--) ++ state->lens[state->have++] = (unsigned short)len; ++ } ++ } ++ ++ /* handle error breaks in while */ ++ if (state->mode == BAD) break; ++ ++ /* check for end-of-block code (better have one) */ ++ if (state->lens[256] == 0) { ++ strm->msg = (char *)"invalid code -- missing end-of-block"; ++ state->mode = BAD; ++ break; ++ } ++ ++ /* build code tables -- note: do not change the lenbits or distbits ++ values here (9 and 6) without reading the comments in inftrees.h ++ concerning the ENOUGH constants, which depend on those values */ ++ state->next = state->codes; ++ state->lencode = (const code FAR *)(state->next); ++ state->lenbits = 9; ++ ret = inflate_table(LENS, state->lens, state->nlen, &(state->next), ++ &(state->lenbits), state->work); ++ if (ret) { ++ strm->msg = (char *)"invalid literal/lengths set"; ++ state->mode = BAD; ++ break; ++ } ++ state->distcode = (const code FAR *)(state->next); ++ state->distbits = 6; ++ ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist, ++ &(state->next), &(state->distbits), state->work); ++ if (ret) { ++ strm->msg = (char *)"invalid distances set"; ++ state->mode = BAD; ++ break; ++ } ++ Tracev((stderr, "inflate: codes ok\n")); ++ state->mode = LEN_; ++ if (flush == Z_TREES) goto inf_leave; ++ case LEN_: ++ state->mode = LEN; ++ case LEN: ++ if (have >= 6 && left >= 258) { ++ RESTORE(); ++ inflate_fast(strm, out); ++ LOAD(); ++ if (state->mode == TYPE) ++ state->back = -1; ++ break; ++ } ++ state->back = 0; ++ for (;;) { ++ here = state->lencode[BITS(state->lenbits)]; ++ if ((unsigned)(here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ if (here.op && (here.op & 0xf0) == 0) { ++ last = here; ++ for (;;) { ++ here = state->lencode[last.val + ++ (BITS(last.bits + last.op) >> last.bits)]; ++ if ((unsigned)(last.bits + here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ DROPBITS(last.bits); ++ state->back += last.bits; ++ } ++ DROPBITS(here.bits); ++ state->back += here.bits; ++ state->length = (unsigned)here.val; ++ if ((int)(here.op) == 0) { ++ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? 
++ "inflate: literal '%c'\n" : ++ "inflate: literal 0x%02x\n", here.val)); ++ state->mode = LIT; ++ break; ++ } ++ if (here.op & 32) { ++ Tracevv((stderr, "inflate: end of block\n")); ++ state->back = -1; ++ state->mode = TYPE; ++ break; ++ } ++ if (here.op & 64) { ++ strm->msg = (char *)"invalid literal/length code"; ++ state->mode = BAD; ++ break; ++ } ++ state->extra = (unsigned)(here.op) & 15; ++ state->mode = LENEXT; ++ case LENEXT: ++ if (state->extra) { ++ NEEDBITS(state->extra); ++ state->length += BITS(state->extra); ++ DROPBITS(state->extra); ++ state->back += state->extra; ++ } ++ Tracevv((stderr, "inflate: length %u\n", state->length)); ++ state->was = state->length; ++ state->mode = DIST; ++ case DIST: ++ for (;;) { ++ here = state->distcode[BITS(state->distbits)]; ++ if ((unsigned)(here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ if ((here.op & 0xf0) == 0) { ++ last = here; ++ for (;;) { ++ here = state->distcode[last.val + ++ (BITS(last.bits + last.op) >> last.bits)]; ++ if ((unsigned)(last.bits + here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ DROPBITS(last.bits); ++ state->back += last.bits; ++ } ++ DROPBITS(here.bits); ++ state->back += here.bits; ++ if (here.op & 64) { ++ strm->msg = (char *)"invalid distance code"; ++ state->mode = BAD; ++ break; ++ } ++ state->offset = (unsigned)here.val; ++ state->extra = (unsigned)(here.op) & 15; ++ state->mode = DISTEXT; ++ case DISTEXT: ++ if (state->extra) { ++ NEEDBITS(state->extra); ++ state->offset += BITS(state->extra); ++ DROPBITS(state->extra); ++ state->back += state->extra; ++ } ++#ifdef INFLATE_STRICT ++ if (state->offset > state->dmax) { ++ strm->msg = (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#endif ++ Tracevv((stderr, "inflate: distance %u\n", state->offset)); ++ state->mode = MATCH; ++ case MATCH: ++ if (left == 0) goto inf_leave; ++ copy = out - left; ++ if (state->offset > copy) { /* copy from window */ ++ copy = state->offset - copy; ++ if (copy > state->whave) { ++ if (state->sane) { ++ strm->msg = (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR ++ Trace((stderr, "inflate.c too far\n")); ++ copy -= state->whave; ++ if (copy > state->length) copy = state->length; ++ if (copy > left) copy = left; ++ left -= copy; ++ state->length -= copy; ++ do { ++ *put++ = 0; ++ } while (--copy); ++ if (state->length == 0) state->mode = LEN; ++ break; ++#endif ++ } ++ if (copy > state->wnext) { ++ copy -= state->wnext; ++ from = state->window + (state->wsize - copy); ++ } ++ else ++ from = state->window + (state->wnext - copy); ++ if (copy > state->length) copy = state->length; ++ if (copy > left) copy = left; ++ put = chunkcopy_safe(put, from, copy, put + left); ++ } ++ else { /* copy from output */ ++ copy = state->length; ++ if (copy > left) copy = left; ++ put = chunkcopy_lapped_safe(put, state->offset, copy, put + left); ++ } ++ left -= copy; ++ state->length -= copy; ++ if (state->length == 0) state->mode = LEN; ++ break; ++ case LIT: ++ if (left == 0) goto inf_leave; ++ *put++ = (unsigned char)(state->length); ++ left--; ++ state->mode = LEN; ++ break; ++ case CHECK: ++ if (state->wrap) { ++ NEEDBITS(32); ++ out -= left; ++ strm->total_out += out; ++ state->total += out; ++ if ((state->wrap & 4) && out) ++ strm->adler = state->check = ++ UPDATE(state->check, put - out, out); ++ out = left; ++ if ((state->wrap & 4) && ( ++#ifdef GUNZIP ++ state->flags ? 
hold : ++#endif ++ ZSWAP32(hold)) != state->check) { ++ strm->msg = (char *)"incorrect data check"; ++ state->mode = BAD; ++ break; ++ } ++ INITBITS(); ++ Tracev((stderr, "inflate: check matches trailer\n")); ++ } ++#ifdef GUNZIP ++ state->mode = LENGTH; ++ case LENGTH: ++ if (state->wrap && state->flags) { ++ NEEDBITS(32); ++ if (hold != (state->total & 0xffffffffUL)) { ++ strm->msg = (char *)"incorrect length check"; ++ state->mode = BAD; ++ break; ++ } ++ INITBITS(); ++ Tracev((stderr, "inflate: length matches trailer\n")); ++ } ++#endif ++ state->mode = DONE; ++ case DONE: ++ ret = Z_STREAM_END; ++ goto inf_leave; ++ case BAD: ++ ret = Z_DATA_ERROR; ++ goto inf_leave; ++ case MEM: ++ return Z_MEM_ERROR; ++ case SYNC: ++ default: ++ return Z_STREAM_ERROR; ++ } ++ ++ /* ++ Return from inflate(), updating the total counts and the check value. ++ If there was no progress during the inflate() call, return a buffer ++ error. Call updatewindow() to create and/or update the window state. ++ Note: a memory error from inflate() is non-recoverable. ++ */ ++ inf_leave: ++ RESTORE(); ++ if (state->wsize || (out != strm->avail_out && state->mode < BAD && ++ (state->mode < CHECK || flush != Z_FINISH))) ++ if (updatewindow(strm, strm->next_out, out - strm->avail_out)) { ++ state->mode = MEM; ++ return Z_MEM_ERROR; ++ } ++ in -= strm->avail_in; ++ out -= strm->avail_out; ++ strm->total_in += in; ++ strm->total_out += out; ++ state->total += out; ++ if ((state->wrap & 4) && out) ++ strm->adler = state->check = ++ UPDATE(state->check, strm->next_out - out, out); ++ strm->data_type = (int)state->bits + (state->last ? 64 : 0) + ++ (state->mode == TYPE ? 128 : 0) + ++ (state->mode == LEN_ || state->mode == COPY_ ? 256 : 0); ++ if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ++ ret = Z_BUF_ERROR; ++ return ret; ++} ++ ++int ZEXPORT inflateEnd(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ if (inflateStateCheck(strm)) ++ return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (state->window != Z_NULL) ZFREE(strm, state->window); ++ ZFREE(strm, strm->state); ++ strm->state = Z_NULL; ++ Tracev((stderr, "inflate: end\n")); ++ return Z_OK; ++} ++ ++int ZEXPORT inflateGetDictionary(strm, dictionary, dictLength) ++z_streamp strm; ++Bytef *dictionary; ++uInt *dictLength; ++{ ++ struct inflate_state FAR *state; ++ ++ /* check state */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ ++ /* copy dictionary */ ++ if (state->whave && dictionary != Z_NULL) { ++ zmemcpy(dictionary, state->window + state->wnext, ++ state->whave - state->wnext); ++ zmemcpy(dictionary + state->whave - state->wnext, ++ state->window, state->wnext); ++ } ++ if (dictLength != Z_NULL) ++ *dictLength = state->whave; ++ return Z_OK; ++} ++ ++int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength) ++z_streamp strm; ++const Bytef *dictionary; ++uInt dictLength; ++{ ++ struct inflate_state FAR *state; ++ unsigned long dictid; ++ int ret; ++ ++ /* check state */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (state->wrap != 0 && state->mode != DICT) ++ return Z_STREAM_ERROR; ++ ++ /* check for correct dictionary identifier */ ++ if (state->mode == DICT) { ++ dictid = adler32(0L, Z_NULL, 0); ++ dictid = adler32(dictid, dictionary, dictLength); ++ if (dictid != state->check) ++ return Z_DATA_ERROR; ++ } ++ ++ /* copy dictionary to window using updatewindow(), 
which will amend the ++ existing dictionary if appropriate */ ++ ret = updatewindow(strm, dictionary + dictLength, dictLength); ++ if (ret) { ++ state->mode = MEM; ++ return Z_MEM_ERROR; ++ } ++ state->havedict = 1; ++ Tracev((stderr, "inflate: dictionary set\n")); ++ return Z_OK; ++} ++ ++int ZEXPORT inflateGetHeader(strm, head) ++z_streamp strm; ++gz_headerp head; ++{ ++ struct inflate_state FAR *state; ++ ++ /* check state */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if ((state->wrap & 2) == 0) return Z_STREAM_ERROR; ++ ++ /* save header structure */ ++ state->head = head; ++ head->done = 0; ++ return Z_OK; ++} ++ ++/* ++ Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found ++ or when out of input. When called, *have is the number of pattern bytes ++ found in order so far, in 0..3. On return *have is updated to the new ++ state. If on return *have equals four, then the pattern was found and the ++ return value is how many bytes were read including the last byte of the ++ pattern. If *have is less than four, then the pattern has not been found ++ yet and the return value is len. In the latter case, syncsearch() can be ++ called again with more data and the *have state. *have is initialized to ++ zero for the first call. ++ */ ++local unsigned syncsearch(have, buf, len) ++unsigned FAR *have; ++const unsigned char FAR *buf; ++unsigned len; ++{ ++ unsigned got; ++ unsigned next; ++ ++ got = *have; ++ next = 0; ++ while (next < len && got < 4) { ++ if ((int)(buf[next]) == (got < 2 ? 0 : 0xff)) ++ got++; ++ else if (buf[next]) ++ got = 0; ++ else ++ got = 4 - got; ++ next++; ++ } ++ *have = got; ++ return next; ++} ++ ++int ZEXPORT inflateSync(strm) ++z_streamp strm; ++{ ++ unsigned len; /* number of bytes to look at or looked at */ ++ unsigned long in, out; /* temporary to save total_in and total_out */ ++ unsigned char buf[4]; /* to restore bit buffer to byte string */ ++ struct inflate_state FAR *state; ++ ++ /* check parameters */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; ++ ++ /* if first time, start search in bit buffer */ ++ if (state->mode != SYNC) { ++ state->mode = SYNC; ++ state->hold <<= state->bits & 7; ++ state->bits -= state->bits & 7; ++ len = 0; ++ while (state->bits >= 8) { ++ buf[len++] = (unsigned char)(state->hold); ++ state->hold >>= 8; ++ state->bits -= 8; ++ } ++ state->have = 0; ++ syncsearch(&(state->have), buf, len); ++ } ++ ++ /* search available input */ ++ len = syncsearch(&(state->have), strm->next_in, strm->avail_in); ++ strm->avail_in -= len; ++ strm->next_in += len; ++ strm->total_in += len; ++ ++ /* return no joy or set up to restart inflate() on a new block */ ++ if (state->have != 4) return Z_DATA_ERROR; ++ in = strm->total_in; out = strm->total_out; ++ inflateReset(strm); ++ strm->total_in = in; strm->total_out = out; ++ state->mode = TYPE; ++ return Z_OK; ++} ++ ++/* ++ Returns true if inflate is currently at the end of a block generated by ++ Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP ++ implementation to provide an additional safety check. PPP uses ++ Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored ++ block. When decompressing, PPP checks that at the end of input packet, ++ inflate is waiting for these length bytes. 
++ */ ++int ZEXPORT inflateSyncPoint(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ return state->mode == STORED && state->bits == 0; ++} ++ ++int ZEXPORT inflateCopy(dest, source) ++z_streamp dest; ++z_streamp source; ++{ ++ struct inflate_state FAR *state; ++ struct inflate_state FAR *copy; ++ unsigned char FAR *window; ++ unsigned wsize; ++ ++ /* check input */ ++ if (inflateStateCheck(source) || dest == Z_NULL) ++ return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)source->state; ++ ++ /* allocate space */ ++ copy = (struct inflate_state FAR *) ++ ZALLOC(source, 1, sizeof(struct inflate_state)); ++ if (copy == Z_NULL) return Z_MEM_ERROR; ++ window = Z_NULL; ++ if (state->window != Z_NULL) { ++ window = (unsigned char FAR *) ++ ZALLOC(source, 1U << state->wbits, sizeof(unsigned char)); ++ if (window == Z_NULL) { ++ ZFREE(source, copy); ++ return Z_MEM_ERROR; ++ } ++ } ++ ++ /* copy state */ ++ zmemcpy((voidpf)dest, (voidpf)source, sizeof(z_stream)); ++ zmemcpy((voidpf)copy, (voidpf)state, sizeof(struct inflate_state)); ++ copy->strm = dest; ++ if (state->lencode >= state->codes && ++ state->lencode <= state->codes + ENOUGH - 1) { ++ copy->lencode = copy->codes + (state->lencode - state->codes); ++ copy->distcode = copy->codes + (state->distcode - state->codes); ++ } ++ copy->next = copy->codes + (state->next - state->codes); ++ if (window != Z_NULL) { ++ wsize = 1U << state->wbits; ++ zmemcpy(window, state->window, wsize); ++ } ++ copy->window = window; ++ dest->state = (struct internal_state FAR *)copy; ++ return Z_OK; ++} ++ ++int ZEXPORT inflateUndermine(strm, subvert) ++z_streamp strm; ++int subvert; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR ++ state->sane = !subvert; ++ return Z_OK; ++#else ++ (void)subvert; ++ state->sane = 1; ++ return Z_DATA_ERROR; ++#endif ++} ++ ++int ZEXPORT inflateValidate(strm, check) ++z_streamp strm; ++int check; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (check) ++ state->wrap |= 4; ++ else ++ state->wrap &= ~4; ++ return Z_OK; ++} ++ ++long ZEXPORT inflateMark(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) ++ return -(1L << 16); ++ state = (struct inflate_state FAR *)strm->state; ++ return (long)(((unsigned long)((long)state->back)) << 16) + ++ (state->mode == COPY ? state->length : ++ (state->mode == MATCH ? 
state->was - state->length : 0)); ++} ++ ++unsigned long ZEXPORT inflateCodesUsed(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ if (inflateStateCheck(strm)) return (unsigned long)-1; ++ state = (struct inflate_state FAR *)strm->state; ++ return (unsigned long)(state->next - state->codes); ++} +-- +2.14.3 + diff --git a/0002-Port-Fix-InflateBack-corner-case.patch b/0002-Port-Fix-InflateBack-corner-case.patch new file mode 100644 index 0000000..6673d9d --- /dev/null +++ b/0002-Port-Fix-InflateBack-corner-case.patch @@ -0,0 +1,147 @@ +From 267e6f20170edb9a00b11fc3a2ca7649ea1c4464 Mon Sep 17 00:00:00 2001 +From: Adenilson Cavalcanti +Date: Wed, 4 Apr 2018 15:14:57 -0700 +Subject: [PATCH 2/2] Port Fix InflateBack corner case + +This handles the case where a zlib user could rely on InflateBack +API to decompress content. + +The NEON optimization assumes that it can perform wide stores, sometimes +overwriting data on the output pointer (but never overflowing the buffer +end as it has enough room for the write). + +For infback there is no such guarantees (i.e. no extra wiggle room), +which can result in illegal operations. This patch fixes the potential +issue by falling back to the non-optimized code for such cases. + +Also it adds some comments about the entry assumptions in inflate and +writes out a defined value at the write buffer to identify where +the real data has ended (helpful while debugging). + +For reference, please see: +https://chromium.googlesource.com/chromium/src/+/0bb11040792edc5b28fcb710fc4c01fedd98c97c + +Change-Id: Iffbda9eb5e08a661aa15c6e3d1c59b678cc23b2c +--- + CMakeLists.txt | 5 ++--- + contrib/arm/{inffast.c => inffast_chunk.c} | 10 +++++++--- + contrib/arm/inffast_chunk.h | 12 ++++++++++++ + contrib/arm/inflate.c | 14 ++++++++++++-- + 4 files changed, 33 insertions(+), 8 deletions(-) + rename contrib/arm/{inffast.c => inffast_chunk.c} (97%) + create mode 100644 contrib/arm/inffast_chunk.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 09bb3db..98ee4dd 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -139,9 +139,8 @@ if(CMAKE_COMPILER_IS_GNUCC) + + if(ARM_NEON) + list(REMOVE_ITEM ZLIB_SRCS inflate.c) +- list(REMOVE_ITEM ZLIB_SRCS inffast.c) +- set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h) +- set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast.c) ++ set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h contrib/arm/inffast_chunk.h) ++ set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) + add_definitions(-DARM_NEON) + set(COMPILER ${CMAKE_C_COMPILER}) + # NEON is mandatory in ARMv8. 
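
For context on the fallback described in this patch: the chunk-based copy used by the NEON inflate path always writes whole vector-width chunks, so it can write up to CHUNKCOPY_CHUNK_SIZE-1 bytes past the requested length into padding that inflate() guarantees but inflateBack() does not, which is why the infback path keeps the byte-wise code. The following is only an illustrative sketch of that idea, assuming a 16-byte chunk; it is not the actual contrib/arm/chunkcopy.h implementation and the helper name is made up.

#include <arm_neon.h>

/* Sketch: copy "len" bytes in full 16-byte NEON chunks.  Every store writes
 * 16 bytes, so up to 15 bytes beyond out+len are clobbered.  inflate()
 * tolerates this because its buffers carry CHUNKCOPY_CHUNK_SIZE-1 bytes of
 * padding; inflateBack() gives no such guarantee. */
static unsigned char *chunkcopy_sketch(unsigned char *out,
                                       const unsigned char *from,
                                       unsigned len)
{
    unsigned step;
    while (len > 0) {
        vst1q_u8(out, vld1q_u8(from));   /* unaligned 16-byte load/store */
        step = len < 16 ? len : 16;
        out  += step;
        from += step;
        len  -= step;
    }
    return out;
}
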
+diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast_chunk.c +similarity index 97% +rename from contrib/arm/inffast.c +rename to contrib/arm/inffast_chunk.c +index f7f5007..0c5c583 100644 +--- a/contrib/arm/inffast.c ++++ b/contrib/arm/inffast_chunk.c +@@ -6,8 +6,8 @@ + #include "zutil.h" + #include "inftrees.h" + #include "inflate.h" +-#include "inffast.h" +-#include "chunkcopy.h" ++#include "contrib/arm/inffast_chunk.h" ++#include "contrib/arm/chunkcopy.h" + + #ifdef ASMINF + # pragma message("Assembler code may have bugs -- use at your own risk") +@@ -28,6 +28,10 @@ + strm->avail_out >= 258 + start >= strm->avail_out + state->bits < 8 ++ strm->next_out[0..strm->avail_out] does not overlap with ++ strm->next_in[0..strm->avail_in] ++ strm->state->window is allocated with an additional ++ CHUNKCOPY_CHUNK_SIZE-1 bytes of padding beyond strm->state->wsize + + On return, state->mode is one of: + +@@ -48,7 +52,7 @@ + requires strm->avail_out >= 258 for each loop to avoid checking for + output space. + */ +-void ZLIB_INTERNAL inflate_fast(strm, start) ++void ZLIB_INTERNAL inflate_fast_chunk(strm, start) + z_streamp strm; + unsigned start; /* inflate()'s starting value for strm->avail_out */ + { +diff --git a/contrib/arm/inffast_chunk.h b/contrib/arm/inffast_chunk.h +new file mode 100644 +index 0000000..7839e1d +--- /dev/null ++++ b/contrib/arm/inffast_chunk.h +@@ -0,0 +1,12 @@ ++/* inffast.h -- header to use inffast.c ++ * Copyright (C) 1995-2003, 2010 Mark Adler ++ * Copyright (C) 2017 ARM, Inc. ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* WARNING: this file should *not* be used by applications. It is ++ part of the implementation of the compression library and is ++ subject to change. Applications should only use zlib.h. ++ */ ++ ++void ZLIB_INTERNAL inflate_fast_chunk OF((z_streamp strm, unsigned start)); +diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c +index 23e95f1..d860542 100644 +--- a/contrib/arm/inflate.c ++++ b/contrib/arm/inflate.c +@@ -83,7 +83,7 @@ + #include "zutil.h" + #include "inftrees.h" + #include "inflate.h" +-#include "inffast.h" ++#include "contrib/arm/inffast_chunk.h" + #include "contrib/arm/chunkcopy.h" + + #ifdef MAKEFIXED +@@ -1056,7 +1056,7 @@ int flush; + case LEN: + if (have >= 6 && left >= 258) { + RESTORE(); +- inflate_fast(strm, out); ++ inflate_fast_chunk(strm, out); + LOAD(); + if (state->mode == TYPE) + state->back = -1; +@@ -1262,6 +1262,16 @@ int flush; + Note: a memory error from inflate() is non-recoverable. + */ + inf_leave: ++ /* We write a defined value in the unused space to help mark ++ * where the stream has ended. We don't use zeros as that can ++ * mislead clients relying on undefined behavior (i.e. assuming ++ * that the data is over when the buffer has a zero/null value). 
++ */ ++ if (left >= CHUNKCOPY_CHUNK_SIZE) ++ memset(put, 0x55, CHUNKCOPY_CHUNK_SIZE); ++ else ++ memset(put, 0x55, left); ++ + RESTORE(); + if (state->wsize || (out != strm->avail_out && state->mode < BAD && + (state->mode < CHECK || flush != Z_FINISH))) +-- +2.14.3 + diff --git a/0002-Porting-optimized-longest_match.patch b/0002-Porting-optimized-longest_match.patch new file mode 100644 index 0000000..790ae2a --- /dev/null +++ b/0002-Porting-optimized-longest_match.patch @@ -0,0 +1,218 @@ +From 0ad56061ade1afe2896af1acffa5e15fbe5c98ed Mon Sep 17 00:00:00 2001 +From: Adenilson Cavalcanti +Date: Mon, 9 Apr 2018 15:14:19 -0700 +Subject: [PATCH 2/3] Porting optimized longest_match + +This patch was contributed to zlib-ng and features an improved longest_match +function using the most distant hash code to reduce number of checks +(see: http://www.gildor.org/en/projects/zlib). + +Original patch by Jun He. +--- + CMakeLists.txt | 3 +- + contrib/arm/arm_longest_match.h | 142 ++++++++++++++++++++++++++++++++++++++++ + deflate.c | 11 +++- + 3 files changed, 152 insertions(+), 4 deletions(-) + create mode 100644 contrib/arm/arm_longest_match.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 230ca6d..c330093 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -142,7 +142,8 @@ if(CMAKE_COMPILER_IS_GNUCC) + set(ZLIB_ARM_NEON_HDRS + contrib/arm/chunkcopy.h + contrib/arm/inffast_chunk.h +- contrib/arm/neon_slide_hash.h) ++ contrib/arm/neon_slide_hash.h ++ contrib/arm/arm_longest_match.h) + set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) + add_definitions(-DARM_NEON) + set(COMPILER ${CMAKE_C_COMPILER}) +diff --git a/contrib/arm/arm_longest_match.h b/contrib/arm/arm_longest_match.h +new file mode 100644 +index 0000000..9e7083f +--- /dev/null ++++ b/contrib/arm/arm_longest_match.h +@@ -0,0 +1,142 @@ ++/* Copyright (C) 1995-2011, 2016 Mark Adler ++ * Copyright (C) 2017 ARM Holdings Inc. ++ * Authors: Adenilson Cavalcanti ++ * Jun He ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */ ++#ifndef __ARM_LONGEST__MATCH__ ++#define __ARM_LONGEST__MATCH__ ++ ++#if defined(ARM_NEON) ++#include "deflate.h" ++#include ++static inline long get_match_len(const unsigned char *a, const unsigned char *b, long max) ++{ ++ register int len = 0; ++ register unsigned long xor = 0; ++ register int check_loops = max/sizeof(unsigned long); ++ while(check_loops-- > 0) { ++ xor = (*(unsigned long *)(a+len)) ^ (*(unsigned long *)(b+len)); ++ if (xor) break; ++ len += sizeof(unsigned long); ++ } ++ if (0 == xor) { ++ while (len < max) { ++ if (a[len] != b[len]) break; ++ len++; ++ } ++ return len; ++ } ++ xor = __builtin_ctzl(xor)>>3; ++ return len + xor; ++} ++ ++/* ++ * This implementation is based on algorithm described at: ++ * http://www.gildor.org/en/projects/zlib ++ * It uses the hash chain indexed by the most distant hash code to ++ * reduce number of checks. ++ * This also eliminates the those unnecessary check loops in legacy ++ * longest_match's do..while loop if the "most distant code" is out ++ * of search buffer ++ * ++ */ ++static inline unsigned arm_longest_match(deflate_state *const s, IPos cur_match) { ++ unsigned chain_length = s->max_chain_length;/* max hash chain length */ ++ unsigned char *scan = s->window + s->strstart; /* current string */ ++ unsigned char *match; /* matched string */ ++ unsigned int len; /* length of current match */ ++ unsigned int best_len = s->prev_length; /* best match length so far */ ++ unsigned int nice_match = s->nice_match; /* stop if match long enough */ ++ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? ++ s->strstart - (IPos)MAX_DIST(s) : 0; ++ /* Stop when cur_match becomes <= limit. To simplify the code, ++ * we prevent matches with the string of window index 0. ++ */ ++ int offset = 0; /* offset of the head[most_distant_hash] from IN cur_match */ ++ Pos *prev = s->prev; ++ unsigned int wmask = s->w_mask; ++ unsigned char *scan_buf_base = s->window; ++ ++ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. ++ * It is easy to get rid of this optimization if necessary. ++ */ ++ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); ++ ++ /* Do not look for matches beyond the end of the input. This is necessary ++ * to make deflate deterministic. ++ */ ++ if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead; ++ ++ Assert((unsigned long)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); ++ ++ /* find most distant hash code for lazy_match */ ++ if (best_len > MIN_MATCH) { ++ /* search for most distant hash code */ ++ int i; ++ uint16_t hash = 0; ++ IPos pos; ++ ++ UPDATE_HASH(s, hash, scan[1]); ++ UPDATE_HASH(s, hash, scan[2]); ++ for (i = 3; i <= best_len; i++) { ++ UPDATE_HASH(s, hash, scan[i]); ++ /* get head IPos of hash calced by scan[i-2..i] */ ++ pos = s->head[hash]; ++ /* compare it to current "farthest hash" IPos */ ++ if (pos <= cur_match) { ++ /* we have a new "farthest hash" now */ ++ offset = i - 2; ++ cur_match = pos; ++ } ++ } ++ ++ /* update variables to correspond offset */ ++ limit += offset; ++ /* ++ * check if the most distant code's offset is out of search buffer ++ * if it is true, then this means scan[offset..offset+2] are not ++ * presented in the search buffer. So we just return best_len ++ * we've found. 
++ */ ++ if (cur_match < limit) return best_len; ++ ++ scan_buf_base -= offset; ++ /* reduce hash search depth based on best_len */ ++ chain_length /= best_len - MIN_MATCH; ++ } ++ ++ do { ++ Assert(cur_match < s->strstart, "no future"); ++ ++ /* Determine matched length at current pos */ ++ match = scan_buf_base + cur_match; ++ len = get_match_len(match, scan, MAX_MATCH); ++ ++ if (len > best_len) { ++ /* found longer string */ ++ s->match_start = cur_match - offset; ++ best_len = len; ++ /* good enough? */ ++ if (len >= nice_match) break; ++ } ++ /* move to prev pos in this hash chain */ ++ } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0); ++ ++ return (best_len <= s->lookahead)? best_len : s->lookahead; ++} ++ ++#endif ++#endif +diff --git a/deflate.c b/deflate.c +index 36f99ac..4c42259 100644 +--- a/deflate.c ++++ b/deflate.c +@@ -50,9 +50,6 @@ + /* @(#) $Id$ */ + + #include "deflate.h" +-#if __ARM_NEON +-#include "contrib/arm/neon_slide_hash.h" +-#endif + + const char deflate_copyright[] = + " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler "; +@@ -196,6 +193,11 @@ local const config configuration_table[10] = { + s->head[s->hash_size-1] = NIL; \ + zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head)); + ++#if defined(ARM_NEON) ++#include "contrib/arm/arm_longest_match.h" ++#include "contrib/arm/neon_slide_hash.h" ++#endif ++ + /* =========================================================================== + * Slide the hash table when sliding the window down (could be avoided with 32 + * bit values at the expense of memory usage). We slide even when level == 0 to +@@ -1244,6 +1246,9 @@ local uInt longest_match(s, cur_match) + deflate_state *s; + IPos cur_match; /* current match */ + { ++#if defined(ARM_NEON) ++ return arm_longest_match(s, cur_match); ++#endif + unsigned chain_length = s->max_chain_length;/* max hash chain length */ + register Bytef *scan = s->window + s->strstart; /* current string */ + register Bytef *match; /* matched string */ +-- +2.14.3 + diff --git a/0003-arm64-specific-build-patch.patch b/0003-arm64-specific-build-patch.patch new file mode 100644 index 0000000..74502d9 --- /dev/null +++ b/0003-arm64-specific-build-patch.patch @@ -0,0 +1,136 @@ +From bd30e5ff76aab2668ebfd46e5dbadc44322960c1 Mon Sep 17 00:00:00 2001 +From: Jeremy Linton +Date: Fri, 6 Apr 2018 11:46:42 -0500 +Subject: [PATCH 3/3] arm64 specific build patch + +--- + Makefile.in | 37 +++++++++++++++++++++++++++---------- + configure | 2 +- + contrib/minizip/zip.c | 6 ++++-- + 3 files changed, 32 insertions(+), 13 deletions(-) + +diff --git a/Makefile.in b/Makefile.in +index 5a77949..1a1e452 100644 +--- a/Makefile.in ++++ b/Makefile.in +@@ -57,11 +57,11 @@ SRCDIR= + ZINC= + ZINCOUT=-I. 
+ +-OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o ++OBJZ = adler32.o crc32.o deflate.o infback.o arminffast.o inffast.o inflate.o inflate_chunk.o inftrees.o trees.o zutil.o + OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o + OBJC = $(OBJZ) $(OBJG) + +-PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo ++PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inflate_chunk.lo inftrees.lo trees.lo zutil.lo + PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo + PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG) + +@@ -163,16 +163,22 @@ crc32.o: $(SRCDIR)crc32.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c + + deflate.o: $(SRCDIR)deflate.c +- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)deflate.c + + infback.o: $(SRCDIR)infback.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c + + inffast.o: $(SRCDIR)inffast.c +- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inffast.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)inffast.c + +-inflate.o: $(SRCDIR)inflate.c +- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inflate.c ++arminffast.o: $(SRCDIR)contrib/arm/inffast_chunk.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inffast_chunk.c ++ ++inflate.o: $(SRCDIR)contrib/arm/inflate.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inflate.c ++ ++inflate_chunk.o: $(SRCDIR)contrib/arm/inffast_chunk.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inffast_chunk.c + + inftrees.o: $(SRCDIR)inftrees.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inftrees.c +@@ -214,7 +220,7 @@ crc32.lo: $(SRCDIR)crc32.c + + deflate.lo: $(SRCDIR)deflate.c + -@mkdir objs 2>/dev/null || test -d objs +- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c + -@mv objs/deflate.o $@ + + infback.lo: $(SRCDIR)infback.c +@@ -222,16 +228,27 @@ infback.lo: $(SRCDIR)infback.c + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c + -@mv objs/infback.o $@ + ++arminffast.lo: $(SRCDIR)contrib/arm/inffast_chunk.c $(SRCDIR)inffast.c ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/arminffast.o $(SRCDIR)contrib/arm/inffast_chunk.c ++ -@mv objs/arminffast.o $@ ++ + inffast.lo: $(SRCDIR)inffast.c + -@mkdir objs 2>/dev/null || test -d objs +- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inffast.o $(SRCDIR)inffast.c ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inffast.o $(SRCDIR)inffast.c + -@mv objs/inffast.o $@ + +-inflate.lo: $(SRCDIR)inflate.c ++inflate.lo: $(SRCDIR)contrib/arm/inflate.c + -@mkdir objs 2>/dev/null || test -d objs +- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inflate.o $(SRCDIR)inflate.c ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inflate.o $(SRCDIR)contrib/arm/inflate.c + -@mv objs/inflate.o $@ + ++inflate_chunk.lo: $(SRCDIR)contrib/arm/inffast_chunk.c ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inflate_chunk.o $(SRCDIR)contrib/arm/inffast_chunk.c ++ -@mv objs/inflate_chunk.o $@ ++ 
++ + inftrees.lo: $(SRCDIR)inftrees.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inftrees.o $(SRCDIR)inftrees.c +diff --git a/configure b/configure +index e974d1f..0c5f837 100755 +--- a/configure ++++ b/configure +@@ -23,7 +23,7 @@ SRCDIR=`dirname $0` + if test $SRCDIR = "."; then + ZINC="" + ZINCOUT="-I." +- SRCDIR="" ++ SRCDIR="./" + else + ZINC='-include zconf.h' + ZINCOUT='-I. -I$(SRCDIR)' +diff --git a/contrib/minizip/zip.c b/contrib/minizip/zip.c +index 44e88a9..0517930 100644 +--- a/contrib/minizip/zip.c ++++ b/contrib/minizip/zip.c +@@ -519,15 +519,17 @@ local ZPOS64_T zip64local_SearchCentralDir(const zlib_filefunc64_32_def* pzlib_f + break; + + for (i=(int)uReadSize-3; (i--)>0;) ++ { + if (((*(buf+i))==0x50) && ((*(buf+i+1))==0x4b) && + ((*(buf+i+2))==0x05) && ((*(buf+i+3))==0x06)) + { + uPosFound = uReadPos+i; + break; + } ++ } + +- if (uPosFound!=0) +- break; ++ if (uPosFound!=0) ++ break; + } + TRYFREE(buf); + return uPosFound; +-- +2.14.3 + diff --git a/zlib.spec b/zlib.spec index 0e67f2b..9087e98 100644 --- a/zlib.spec +++ b/zlib.spec @@ -12,6 +12,12 @@ Source: http://www.zlib.net/zlib-%{version}.tar.xz Patch0: zlib-1.2.5-minizip-fixuncrypt.patch # resolves: #805113 Patch1: zlib-1.2.11-optimized-s390.patch +# general aarch64 optimizations +Patch2: 0001-Porting-inflate-using-wider-loads-and-stores.patch +Patch3: 0002-Port-Fix-InflateBack-corner-case.patch +Patch4: 0001-Neon-Optimized-hash-chain-rebase.patch +Patch5: 0002-Porting-optimized-longest_match.patch +Patch6: 0003-arm64-specific-build-patch.patch BuildRequires: automake, autoconf, libtool @@ -64,6 +70,14 @@ developing applications which use minizip. %ifarch s390 s390x %patch1 -p1 -b .optimized-deflate %endif +%ifarch aarch64 +%patch2 -p1 -b .optimize-aarch64 +%patch3 -p1 -b .optimize-aarch64 +%patch4 -p1 -b .optimize-aarch64 +%patch5 -p1 -b .optimize-aarch64 +%patch6 -p1 -b .optimize-aarch64 +%endif + iconv -f iso-8859-2 -t utf-8 < ChangeLog > ChangeLog.tmp mv ChangeLog.tmp ChangeLog @@ -73,6 +87,10 @@ export CFLAGS="$RPM_OPT_FLAGS" %ifarch ppc64 CFLAGS+=" -O3" %endif +%ifarch aarch64 +CFLAGS+=" -DARM_NEON -O3" +%endif + export LDFLAGS="$LDFLAGS -Wl,-z,relro -Wl,-z,now" ./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix} make %{?_smp_mflags}
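
With the aarch64 conditionals above, the contrib/arm sources are compiled with -DARM_NEON -O3, so the rebuilt libz picks up the NEON slide_hash, longest_match and chunked inflate_fast paths added by the patches. A quick way to sanity-check the result is a round-trip through the public zlib API; the program below is a hypothetical smoke test, not part of this spec, built against the fresh library with something like "gcc test.c -o test -lz".

/* Hypothetical round-trip smoke test: compress and then decompress a buffer
 * with the public zlib API and verify the output matches the input.  On an
 * aarch64 build this exercises the NEON-accelerated deflate/inflate paths. */
#include <assert.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
    static unsigned char src[65536], comp[70000], back[65536];
    uLongf clen = sizeof(comp);
    uLongf blen = sizeof(back);
    unsigned i;

    for (i = 0; i < sizeof(src); i++)        /* mildly repetitive input */
        src[i] = (unsigned char)(i & 0x3f);

    assert(compress2(comp, &clen, src, sizeof(src), Z_BEST_COMPRESSION) == Z_OK);
    assert(uncompress(back, &blen, comp, clen) == Z_OK);
    assert(blen == sizeof(src) && memcmp(src, back, blen) == 0);
    return 0;
}
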