diff --git a/.gitignore b/.gitignore
index 8c81fe6..ad7f91f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /pyfastnoisesimd-0.3.2.tar.gz
+/pyfastnoisesimd-0.4.1.tar.gz
diff --git a/0001-Add-cstdlib-for-posix_memalign.patch b/0001-Add-cstdlib-for-posix_memalign.patch
deleted file mode 100644
index 773ffa6..0000000
--- a/0001-Add-cstdlib-for-posix_memalign.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From cc83e9986ad96f93f3737128a34d9e1e9fae4cae Mon Sep 17 00:00:00 2001
-From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
-Date: Tue, 16 Jan 2018 02:32:57 -0500
-Subject: [PATCH 1/6] Add cstdlib for posix_memalign.
-
-Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
----
- pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
-index 975d6cb..0af1bf8 100644
---- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
-+++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
-@@ -112,6 +112,7 @@ typedef int SIMDi;
- #ifdef _WIN32
- #define SIMD_ALLOCATE_SET(floatP, floatCount) floatP = (float*)_aligned_malloc((floatCount)* sizeof(float), MEMORY_ALIGNMENT)
- #else
-+#include <cstdlib>
- #define SIMD_ALLOCATE_SET(floatP, floatCount) posix_memalign((void**)&floatP, MEMORY_ALIGNMENT, (floatCount)* sizeof(float))
- #endif
- #else
--- 
-2.17.1
-
diff --git a/0001-Update-casts-for-NEON.patch b/0001-Update-casts-for-NEON.patch
new file mode 100644
index 0000000..d9f2cc5
--- /dev/null
+++ b/0001-Update-casts-for-NEON.patch
@@ -0,0 +1,96 @@
+From 4346f6cdb02192edad480c323e7ab34d0f00a6e5 Mon Sep 17 00:00:00 2001
+From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+Date: Fri, 20 Jul 2018 16:46:43 -0400
+Subject: [PATCH 1/5] Update casts for NEON.
+
+A merge of various upstream commits.
+
+Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+---
+ pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h    |  7 +++++--
+ .../fastnoisesimd/FastNoiseSIMD_internal.cpp     | 16 +++++++++-------
+ 2 files changed, 14 insertions(+), 9 deletions(-)
+
+diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
+index d0b5ed6..9554384 100644
+--- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
++++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
+@@ -77,6 +77,9 @@ AMD Carrizo - Q2 2015
+ FMA3
+ Intel Haswell - Q2 2013
+ AMD Piledriver - 2012
++
++AVX-512F
++Intel Skylake-X - Q2 2017
+ */
+ 
+ struct FastNoiseVectorSet;
+@@ -97,7 +100,7 @@ public:
+ 
+ 	// Returns highest detected level of CPU support
+ 	// 5: ARM NEON
+-	// 4: AVX512
++	// 4: AVX-512F
+ 	// 3: AVX2 & FMA3
+ 	// 2: SSE4.1
+ 	// 1: SSE2
+@@ -106,7 +109,7 @@ public:
+ 
+ 	// Sets the SIMD level for newly created FastNoiseSIMD objects
+ 	// 5: ARM NEON
+-	// 4: AVX512
++	// 4: AVX-512F
+ 	// 3: AVX2 & FMA3
+ 	// 2: SSE4.1
+ 	// 1: SSE2
+diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
+index d7f5b42..ba5c83e 100644
+--- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
++++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
+@@ -178,10 +178,10 @@ static SIMDf VECTORCALL FUNC(DIV)(SIMDf a, SIMDf b)
+ #define SIMDf_MAX(a,b) vmaxq_f32(a,b)
+ #define SIMDf_INV_SQRT(a) vrsqrteq_f32(a)
+ 
+-#define SIMDf_LESS_THAN(a,b) vreinterpretq_f32_u32(vcltq_f32(a,b))
+-#define SIMDf_GREATER_THAN(a,b) vreinterpretq_f32_u32(vcgtq_f32(a,b))
+-#define SIMDf_LESS_EQUAL(a,b) vreinterpretq_f32_u32(vcleq_f32(a,b))
+-#define SIMDf_GREATER_EQUAL(a,b) vreinterpretq_f32_u32(vcgeq_f32(a,b))
++#define SIMDf_LESS_THAN(a,b) vreinterpretq_s32_u32(vcltq_f32(a,b))
++#define SIMDf_GREATER_THAN(a,b) vreinterpretq_s32_u32(vcgtq_f32(a,b))
++#define SIMDf_LESS_EQUAL(a,b) vreinterpretq_s32_u32(vcleq_f32(a,b))
++#define SIMDf_GREATER_EQUAL(a,b) vreinterpretq_s32_u32(vcgeq_f32(a,b))
+ 
+ #define SIMDf_AND(a,b) SIMDf_CAST_TO_FLOAT(vandq_s32(vreinterpretq_s32_f32(a),vreinterpretq_s32_f32(b)))
+ #define SIMDf_AND_NOT(a,b) SIMDf_CAST_TO_FLOAT(vandq_s32(vmvnq_s32(vreinterpretq_s32_f32(a)),vreinterpretq_s32_f32(b)))
+@@ -192,7 +192,9 @@ static SIMDf VECTORCALL FUNC(FLOOR)(SIMDf a)
+ {
+ 	SIMDf fval = SIMDf_CONVERT_TO_FLOAT(SIMDi_CONVERT_TO_INT(a));
+ 
+-	return vsubq_f32(fval, SIMDf_AND(SIMDf_LESS_THAN(a, fval), SIMDf_NUM(1)));
++	return vsubq_f32(fval,
++	                 SIMDf_CAST_TO_FLOAT(vandq_s32(SIMDf_LESS_THAN(a, fval),
++	                                               SIMDi_CAST_TO_INT(SIMDf_NUM(1)))));
+ }
+ #define SIMDf_FLOOR(a) FUNC(FLOOR)(a)
+ #else
+@@ -201,7 +203,7 @@ static SIMDf VECTORCALL FUNC(FLOOR)(SIMDf a)
+ #endif
+ 
+ #define SIMDf_ABS(a) vabsq_f32(a)
+-#define SIMDf_BLENDV(a,b,mask) vbslq_f32(mask,b,a)
++#define SIMDf_BLENDV(a,b,mask) vbslq_f32(vreinterpretq_u32_s32(mask),b,a)
+ 
+ #define SIMDi_ADD(a,b) vaddq_s32(a,b)
+ #define SIMDi_SUB(a,b) vsubq_s32(a,b)
+@@ -1834,7 +1836,7 @@ static SIMDf VECTORCALL FUNC(CellularDistance##distanceFunc##Single)(SIMDi seed,
+ #define CELLULAR_DISTANCE2_SINGLE(distanceFunc, returnFunc)\
+ static SIMDf VECTORCALL FUNC(Cellular##returnFunc##distanceFunc##Single)(SIMDi seed, SIMDf x, SIMDf y, SIMDf z, SIMDf cellJitter, int index0, int index1)\
+ {\
+-	SIMDf distance[4] = {SIMDf_NUM(999999),SIMDf_NUM(999999),SIMDf_NUM(999999),SIMDf_NUM(999999)};\
++	SIMDf distance[FN_CELLULAR_INDEX_MAX+1] = {SIMDf_NUM(999999),SIMDf_NUM(999999),SIMDf_NUM(999999),SIMDf_NUM(999999)};\
+ 	\
+ 	SIMDi xc     = SIMDi_SUB(SIMDi_CONVERT_TO_INT(x), SIMDi_NUM(1));\
+ 	SIMDi ycBase = SIMDi_SUB(SIMDi_CONVERT_TO_INT(y), SIMDi_NUM(1));\
+-- 
+2.20.1
+
diff --git a/0002-Update-casts-for-NEON.patch b/0002-Update-casts-for-NEON.patch
deleted file mode 100644
index 82f249b..0000000
--- a/0002-Update-casts-for-NEON.patch
+++ /dev/null
@@ -1,96 +0,0 @@
-From 2e3868c40a667ec1c3fe02b997b66d5f8393e22f Mon Sep 17 00:00:00 2001
-From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
-Date: Fri, 20 Jul 2018 16:46:43 -0400
-Subject: [PATCH 2/6] Update casts for NEON.
-
-A merge of various upstream commits.
-
-Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
----
- pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h    |  7 +++++--
- .../fastnoisesimd/FastNoiseSIMD_internal.cpp     | 16 +++++++++-------
- 2 files changed, 14 insertions(+), 9 deletions(-)
-
-diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
-index d0b5ed6..9554384 100644
---- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
-+++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
-@@ -77,6 +77,9 @@ AMD Carrizo - Q2 2015
- FMA3
- Intel Haswell - Q2 2013
- AMD Piledriver - 2012
-+
-+AVX-512F
-+Intel Skylake-X - Q2 2017
- */
- 
- struct FastNoiseVectorSet;
-@@ -97,7 +100,7 @@ public:
- 
- 	// Returns highest detected level of CPU support
- 	// 5: ARM NEON
--	// 4: AVX512
-+	// 4: AVX-512F
- 	// 3: AVX2 & FMA3
- 	// 2: SSE4.1
- 	// 1: SSE2
-@@ -106,7 +109,7 @@ public:
- 
- 	// Sets the SIMD level for newly created FastNoiseSIMD objects
- 	// 5: ARM NEON
--	// 4: AVX512
-+	// 4: AVX-512F
- 	// 3: AVX2 & FMA3
- 	// 2: SSE4.1
- 	// 1: SSE2
-diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
-index 0af1bf8..d63143a 100644
---- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
-+++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp
-@@ -176,10 +176,10 @@ static SIMDf VECTORCALL FUNC(DIV)(SIMDf a, SIMDf b)
- #define SIMDf_MAX(a,b) vmaxq_f32(a,b)
- #define SIMDf_INV_SQRT(a) vrsqrteq_f32(a)
- 
--#define SIMDf_LESS_THAN(a,b) vreinterpretq_f32_u32(vcltq_f32(a,b))
--#define SIMDf_GREATER_THAN(a,b) vreinterpretq_f32_u32(vcgtq_f32(a,b))
--#define SIMDf_LESS_EQUAL(a,b) vreinterpretq_f32_u32(vcleq_f32(a,b))
--#define SIMDf_GREATER_EQUAL(a,b) vreinterpretq_f32_u32(vcgeq_f32(a,b))
-+#define SIMDf_LESS_THAN(a,b) vreinterpretq_s32_u32(vcltq_f32(a,b))
-+#define SIMDf_GREATER_THAN(a,b) vreinterpretq_s32_u32(vcgtq_f32(a,b))
-+#define SIMDf_LESS_EQUAL(a,b) vreinterpretq_s32_u32(vcleq_f32(a,b))
-+#define SIMDf_GREATER_EQUAL(a,b) vreinterpretq_s32_u32(vcgeq_f32(a,b))
- 
- #define SIMDf_AND(a,b) SIMDf_CAST_TO_FLOAT(vandq_s32(vreinterpretq_s32_f32(a),vreinterpretq_s32_f32(b)))
- #define SIMDf_AND_NOT(a,b) SIMDf_CAST_TO_FLOAT(vandq_s32(vmvnq_s32(vreinterpretq_s32_f32(a)),vreinterpretq_s32_f32(b)))
-@@ -190,7 +190,9 @@ static SIMDf VECTORCALL FUNC(FLOOR)(SIMDf a)
- {
- 	SIMDf fval = SIMDf_CONVERT_TO_FLOAT(SIMDi_CONVERT_TO_INT(a));
- 
--	return vsubq_f32(fval, SIMDf_AND(SIMDf_LESS_THAN(a, fval), SIMDf_NUM(1)));
-+	return vsubq_f32(fval,
-+	                 SIMDf_CAST_TO_FLOAT(vandq_s32(SIMDf_LESS_THAN(a, fval),
-+	                                               SIMDi_CAST_TO_INT(SIMDf_NUM(1)))));
- }
- #define SIMDf_FLOOR(a) FUNC(FLOOR)(a)
- #else
-@@ -199,7 +201,7 @@ static SIMDf VECTORCALL FUNC(FLOOR)(SIMDf a)
- #endif
- 
- #define SIMDf_ABS(a) vabsq_f32(a)
--#define SIMDf_BLENDV(a,b,mask) vbslq_f32(mask,b,a)
-+#define SIMDf_BLENDV(a,b,mask) vbslq_f32(vreinterpretq_u32_s32(mask),b,a)
- 
- #define SIMDi_ADD(a,b) vaddq_s32(a,b)
- #define SIMDi_SUB(a,b) vsubq_s32(a,b)
-@@ -1832,7 +1834,7 @@ static SIMDf VECTORCALL FUNC(CellularDistance##distanceFunc##Single)(SIMDi seed,
- #define CELLULAR_DISTANCE2_SINGLE(distanceFunc, returnFunc)\
- static SIMDf VECTORCALL FUNC(Cellular##returnFunc##distanceFunc##Single)(SIMDi seed, SIMDf x, SIMDf y, SIMDf z, SIMDf cellJitter, int index0, int index1)\
- {\
--	SIMDf distance[4] = {SIMDf_NUM(999999),SIMDf_NUM(999999),SIMDf_NUM(999999),SIMDf_NUM(999999)};\
-+	SIMDf distance[FN_CELLULAR_INDEX_MAX+1] = {SIMDf_NUM(999999),SIMDf_NUM(999999),SIMDf_NUM(999999),SIMDf_NUM(999999)};\
- 	\
- 	SIMDi xc     = SIMDi_SUB(SIMDi_CONVERT_TO_INT(x), SIMDi_NUM(1));\
- 	SIMDi ycBase = SIMDi_SUB(SIMDi_CONVERT_TO_INT(y), SIMDi_NUM(1));\
--- 
-2.17.1
-
diff --git a/0002-Use-fallback-for-PPC64-and-S390x.patch b/0002-Use-fallback-for-PPC64-and-S390x.patch
new file mode 100644
index 0000000..d7bc795
--- /dev/null
+++ b/0002-Use-fallback-for-PPC64-and-S390x.patch
@@ -0,0 +1,68 @@
+From cde6cd59345e6a950d10f77a911895840e680488 Mon Sep 17 00:00:00 2001
+From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+Date: Fri, 20 Jul 2018 05:50:13 -0400
+Subject: [PATCH 2/5] Use fallback for PPC64 and S390x.
+
+There is no SSE2 on PPC64 or S390x, and no other optimized version is
+implemented yet.
+
+Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+---
+ pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp | 12 ++++++++++++
+ pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h   |  2 +-
+ 2 files changed, 13 insertions(+), 1 deletion(-)
+
+diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
+index fac36b6..6da16c4 100644
+--- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
++++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
+@@ -75,7 +75,9 @@
+ #include "ARM/cpu-features.h"
+ #endif
+ #else // 'nix
++#if !(defined(__ppc64__) || defined(__PPC64__) || defined(__s390x__))
+ #include <cpuid.h>
++#endif
+ #include "inttypes.h"
+ #endif
+ int FastNoiseSIMD::s_currentSIMDLevel = -1;
+@@ -111,6 +113,15 @@ uint64_t xgetbv(unsigned int x) {
+ 	return _xgetbv(x);
+ }
+ #else
++#if defined(__ppc64__) || defined(__PPC64__) || defined(__s390x__)
++void cpuid(int32_t out[4], int32_t x) {
++	/* Just disable it as anything better is unimplemented. */
++	out[0] = 0;
++}
++uint64_t xgetbv(unsigned int index) {
++	return 0;
++}
++#else
+ void cpuid(int32_t out[4], int32_t x) {
+ 	__cpuid_count(x, 0, out[0], out[1], out[2], out[3]);
+ }
+@@ -119,6 +130,7 @@ uint64_t xgetbv(unsigned int index) {
+ 	__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+ 	return ((uint64_t)edx << 32) | eax;
+ }
++#endif
+ #define _XCR_XFEATURE_ENABLED_MASK  0
+ #endif
+ 
+diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
+index 9554384..6fff8bf 100644
+--- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
++++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
+@@ -45,7 +45,7 @@
+ #define FN_ALIGNED_SETS
+ 
+ // SSE2/NEON support is guaranteed on 64bit CPUs so no fallback is needed
+-#if !(defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__) || defined(__aarch64__) || defined(FN_IOS)) || defined(_DEBUG)
++#if !(defined(_WIN64) || defined(__x86_64__) || defined(__aarch64__) || defined(FN_IOS)) || defined(_DEBUG)
+ #define FN_COMPILE_NO_SIMD_FALLBACK
+ #endif
+ 
+-- 
+2.20.1
+
diff --git a/0003-Add-platform-specific-flags-for-NEON.patch b/0003-Add-platform-specific-flags-for-NEON.patch
new file mode 100644
index 0000000..bd227cf
--- /dev/null
+++ b/0003-Add-platform-specific-flags-for-NEON.patch
@@ -0,0 +1,93 @@
+From 329df8d668477934f9fd5d1b587a8f7ac66414d1 Mon Sep 17 00:00:00 2001
+From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+Date: Fri, 20 Jul 2018 21:14:19 -0400
+Subject: [PATCH 3/5] Add platform-specific flags for NEON.
+
+Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+---
+ setup.py | 27 ++++++++++++++++++++++++++-
+ 1 file changed, 26 insertions(+), 1 deletion(-)
+
+diff --git a/setup.py b/setup.py
+index 552fdfb..9c7bd4e 100644
+--- a/setup.py
++++ b/setup.py
+@@ -44,7 +44,6 @@ open('pyfastnoisesimd/version.py', 'w').write('__version__ = "%s"\n' % VERSION)
+ sources = [
+     'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp',
+     'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp',
+-    'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_neon.cpp',
+     'pyfastnoisesimd/wrapper.cpp',
+ ]
+ 
+@@ -85,6 +84,14 @@ if os.name == 'nt':
+             '/arch:AVX2',
+         ]
+     }
++    neon = {
++        'sources': [
++            'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_neon.cpp',
++        ],
++        'cflags': [
++            '/Oi',
++        ],
++    }
+ 
+     if platform.machine() == 'AMD64': # 64-bit windows
+         #`/arch:SSE2` doesn't exist on Windows x64 builds, and generates a needless warnings
+@@ -158,6 +165,19 @@ else:  # Linux
+             '-msse2',
+         ],
+     }
++    neon = {
++        'sources': [
++            'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_neon.cpp',
++        ],
++        'cflags': [
++            '-std=c++11',
++            '-mfpu=neon',
++        ],
++    }
++    if platform.machine() == 'aarch64':
++        # Flag is not supported, but NEON is always available.
++        neon['cflags'].remove('-mfpu=neon')
++
+     fma_flags = ['-mfma']
+ 
+ clibs = [
+@@ -165,6 +185,7 @@ clibs = [
+     ('avx2', avx2),
+     ('sse41', sse41),
+     ('sse2', sse2),
++    ('neon', neon),
+ ]
+ 
+ 
+@@ -174,6 +195,7 @@ class build(_build):
+         ('with-avx2=', None, 'Use AVX2 instructions: auto|yes|no'),
+         ('with-sse41=', None, 'Use SSE4.1 instructions: auto|yes|no'),
+         ('with-sse2=', None, 'Use SSE2 instructions: auto|yes|no'),
++        ('with-neon=', None, 'Use NEON instructions: auto|yes|no'),
+         ('with-fma=', None, 'Use FMA instructions: auto|yes|no'),
+     ]
+ 
+@@ -183,6 +205,7 @@ class build(_build):
+         self.with_avx2 = 'auto'
+         self.with_sse41 = 'auto'
+         self.with_sse2 = 'auto'
++        self.with_neon = 'auto'
+         self.with_fma = 'auto'
+ 
+     def finalize_options(self):
+@@ -221,6 +244,8 @@ class build(_build):
+                 disabled_libraries.append('avx512')
+             if msc_version < 1900:
+                 disabled_libraries.append('avx2')
++            if not platform.machine().startswith('arm'):
++                disabled_libraries.append('neon')
+         # End of SIMD limits
+ 
+         for name, lib in self.distribution.libraries:
+-- 
+2.20.1
+
diff --git a/0003-Use-fallback-for-PPC64-and-S390x.patch b/0003-Use-fallback-for-PPC64-and-S390x.patch
deleted file mode 100644
index 4c715bc..0000000
--- a/0003-Use-fallback-for-PPC64-and-S390x.patch
+++ /dev/null
@@ -1,68 +0,0 @@
-From adf5365d8105c8fdc9890e6abbc5c603fa5d1586 Mon Sep 17 00:00:00 2001
-From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
-Date: Fri, 20 Jul 2018 05:50:13 -0400
-Subject: [PATCH 3/6] Use fallback for PPC64 and S390x.
-
-There is no SSE2 on PPC64, and no other optimized version is implemented
-yet.
-
-Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
----
- pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp | 12 ++++++++++++
- pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h   |  2 +-
- 2 files changed, 13 insertions(+), 1 deletion(-)
-
-diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
-index be8e183..0d09b18 100644
---- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
-+++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
-@@ -74,7 +74,9 @@
- #include "ARM/cpu-features.h"
- #endif
- #else // 'nix
-+#if !(defined(__ppc64__) || defined(__PPC64__) || defined(__s390x__))
- #include <cpuid.h>
-+#endif
- #include "inttypes.h"
- #endif
- int FastNoiseSIMD::s_currentSIMDLevel = -1;
-@@ -110,6 +112,15 @@ uint64_t xgetbv(unsigned int x) {
- 	return _xgetbv(x);
- }
- #else
-+#if defined(__ppc64__) || defined(__PPC64__) || defined(__s390x__)
-+void cpuid(int32_t out[4], int32_t x) {
-+	/* Just disable it as anything better is unimplemented. */
-+	out[0] = 0;
-+}
-+uint64_t xgetbv(unsigned int index) {
-+	return 0;
-+}
-+#else
- void cpuid(int32_t out[4], int32_t x) {
- 	__cpuid_count(x, 0, out[0], out[1], out[2], out[3]);
- }
-@@ -118,6 +129,7 @@ uint64_t xgetbv(unsigned int index) {
- 	__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
- 	return ((uint64_t)edx << 32) | eax;
- }
-+#endif
- #define _XCR_XFEATURE_ENABLED_MASK  0
- #endif
- 
-diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
-index 9554384..6fff8bf 100644
---- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
-+++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.h
-@@ -45,7 +45,7 @@
- #define FN_ALIGNED_SETS
- 
- // SSE2/NEON support is guaranteed on 64bit CPUs so no fallback is needed
--#if !(defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__) || defined(__aarch64__) || defined(FN_IOS)) || defined(_DEBUG)
-+#if !(defined(_WIN64) || defined(__x86_64__) || defined(__aarch64__) || defined(FN_IOS)) || defined(_DEBUG)
- #define FN_COMPILE_NO_SIMD_FALLBACK
- #endif
- 
--- 
-2.17.1
-
diff --git a/0004-Use-correct-types-to-parse-NumPy-array-dimensions.patch b/0004-Use-correct-types-to-parse-NumPy-array-dimensions.patch
deleted file mode 100644
index 409d003..0000000
--- a/0004-Use-correct-types-to-parse-NumPy-array-dimensions.patch
+++ /dev/null
@@ -1,38 +0,0 @@
-From 1d128f884eaf397738fd36497d09b349ff842ebc Mon Sep 17 00:00:00 2001
-From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
-Date: Fri, 20 Jul 2018 19:09:13 -0400
-Subject: [PATCH 4/6] Use correct types to parse NumPy array dimensions.
-
-The wrong type can cause errors on big-endian systems or others with
-differing int and pointer sizes.
-
-Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
----
- pyfastnoisesimd/wrapper.cpp | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/pyfastnoisesimd/wrapper.cpp b/pyfastnoisesimd/wrapper.cpp
-index 019ef55..4ebcee8 100644
---- a/pyfastnoisesimd/wrapper.cpp
-+++ b/pyfastnoisesimd/wrapper.cpp
-@@ -93,7 +93,7 @@ PyFNS_GetEmptySet(PyObject *self, PyObject *args)
- {
-     // Make a NumPy array and return it. Note the array is empty, not zeroed.
-     npy_intp dims[3] = {0, 0, 0};
--    const char *format = "i|ii";
-+    const char *format = "n|nn";
-     float *data;
- 
-     if (!PyArg_ParseTuple(args, format, &dims[0], &dims[1], &dims[2])) {
-@@ -538,7 +538,7 @@ PyFNS_GetNoiseSet(FNSObject *self, PyObject *args)
-     int xStart, yStart, zStart;
-     npy_intp dims[3] = {0, 0, 0};
-     float scaleMod = 1.0;
--    const char *format = "iiiiii|f";
-+    const char *format = "iiinnn|f";
-     float *data = NULL;
- 
-     if (!PyArg_ParseTuple(args, format, &zStart, &yStart, &xStart, &dims[0], &dims[1], &dims[2], &scaleMod))
--- 
-2.17.1
-
diff --git a/0004-Use-getauxval-to-check-for-NEON-on-Linux.patch b/0004-Use-getauxval-to-check-for-NEON-on-Linux.patch
new file mode 100644
index 0000000..1bfdcf8
--- /dev/null
+++ b/0004-Use-getauxval-to-check-for-NEON-on-Linux.patch
@@ -0,0 +1,51 @@
+From 4e731ae8a88e5275033c3561926ac53032e41f05 Mon Sep 17 00:00:00 2001
+From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+Date: Fri, 20 Jul 2018 23:17:13 -0400
+Subject: [PATCH 4/5] Use getauxval to check for NEON on Linux.
+
+Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+---
+ pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
+index 6da16c4..389b3b3 100644
+--- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
++++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
+@@ -71,7 +71,10 @@
+ #ifdef _WIN32
+ #include <intrin.h>
+ #elif defined(FN_ARM)
+-#if !defined(__aarch64__) && !defined(FN_IOS)
++#if defined(__linux__) || defined(linux) || defined(__LINUX__) || defined(__linux)
++#include <sys/auxv.h>
++#include <asm/hwcap.h>
++#elif !defined(__aarch64__) && !defined(FN_IOS)
+ #include "ARM/cpu-features.h"
+ #endif
+ #else // 'nix
+@@ -88,6 +91,10 @@ int GetFastestSIMD()
+ {
+ #if defined(__aarch64__) || defined(FN_IOS)
+ 	return FN_NEON;
++#elif defined(__linux__) || defined(linux) || defined(__LINUX__) || defined(__linux)
++	if ((getauxval(AT_HWCAP) & HWCAP_NEON) != 0) {
++		return FN_NEON;
++	}
+ #else
+ 	if (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM)
+ 	{
+@@ -99,9 +106,9 @@ int GetFastestSIMD()
+ #endif
+ 				return FN_NEON;
+ 	}
++#endif
+ 
+ 	return FN_NO_SIMD_FALLBACK;
+-#endif
+ }
+ #else
+ 
+-- 
+2.20.1
+
diff --git a/0005-Add-platform-specific-flags-for-NEON.patch b/0005-Add-platform-specific-flags-for-NEON.patch
deleted file mode 100644
index 7ada2f9..0000000
--- a/0005-Add-platform-specific-flags-for-NEON.patch
+++ /dev/null
@@ -1,93 +0,0 @@
-From 078dc6b42caa3a93fc8cfa6456bd224dcd928ec1 Mon Sep 17 00:00:00 2001
-From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
-Date: Fri, 20 Jul 2018 21:14:19 -0400
-Subject: [PATCH 5/6] Add platform-specific flags for NEON.
-
-Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
----
- setup.py | 27 ++++++++++++++++++++++++++-
- 1 file changed, 26 insertions(+), 1 deletion(-)
-
-diff --git a/setup.py b/setup.py
-index 5aec105..f65cc8d 100644
---- a/setup.py
-+++ b/setup.py
-@@ -44,7 +44,6 @@ open('pyfastnoisesimd/version.py', 'w').write('__version__ = "%s"\n' % VERSION)
- sources = [
-     'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp',
-     'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_internal.cpp',
--    'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_neon.cpp',
-     'pyfastnoisesimd/wrapper.cpp'
- ]
- 
-@@ -85,6 +84,14 @@ if os.name == 'nt':
-             '/arch:AVX2',
-         ]
-     }
-+    neon = {
-+        'sources': [
-+            'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_neon.cpp',
-+        ],
-+        'cflags': [
-+            '/Oi',
-+        ],
-+    }
- 
-     if platform.machine() == 'AMD64': # 64-bit windows
-         #`/arch:SSE2` doesn't exist on Windows x64 builds, and generates a needless warnings
-@@ -158,6 +165,19 @@ else:  # Linux
-             '-msse2',
-         ],
-     }
-+    neon = {
-+        'sources': [
-+            'pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD_neon.cpp',
-+        ],
-+        'cflags': [
-+            '-std=c++11',
-+            '-mfpu=neon',
-+        ],
-+    }
-+    if platform.machine() == 'aarch64':
-+        # Flag is not supported, but NEON is always available.
-+        neon['cflags'].remove('-mfpu=neon')
-+
-     fma_flags = ['-mfma']
- 
- clibs = [
-@@ -165,6 +185,7 @@ clibs = [
-     ('avx2', avx2),
-     ('sse41', sse41),
-     ('sse2', sse2),
-+    ('neon', neon),
- ]
- 
- 
-@@ -174,6 +195,7 @@ class build(_build):
-         ('with-avx2=', None, 'Use AVX2 instructions: auto|yes|no'),
-         ('with-sse41=', None, 'Use SSE4.1 instructions: auto|yes|no'),
-         ('with-sse2=', None, 'Use SSE2 instructions: auto|yes|no'),
-+        ('with-neon=', None, 'Use NEON instructions: auto|yes|no'),
-         ('with-fma=', None, 'Use FMA instructions: auto|yes|no'),
-     ]
- 
-@@ -183,6 +205,7 @@ class build(_build):
-         self.with_avx2 = 'auto'
-         self.with_sse41 = 'auto'
-         self.with_sse2 = 'auto'
-+        self.with_neon = 'auto'
-         self.with_fma = 'auto'
- 
-     def finalize_options(self):
-@@ -219,6 +242,8 @@ class build(_build):
-                 disabled_libraries.append('avx512')
-             if msc_version < 1900:
-                 disabled_libraries.append('avx2')
-+            if not platform.machine().startswith('arm'):
-+                disabled_libraries.append('neon')
-         # End of SIMD limits
- 
-         for name, lib in self.distribution.libraries:
--- 
-2.17.1
-
diff --git a/0005-Fix-alignment-on-non-optimized-systems.patch b/0005-Fix-alignment-on-non-optimized-systems.patch
new file mode 100644
index 0000000..a9d04b4
--- /dev/null
+++ b/0005-Fix-alignment-on-non-optimized-systems.patch
@@ -0,0 +1,48 @@
+From d1ac49b993d625651e61dd115115b609a0c8b22a Mon Sep 17 00:00:00 2001
+From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+Date: Sun, 13 Jan 2019 01:17:40 -0500
+Subject: [PATCH 5/5] Fix alignment on non-optimized systems.
+
+PPC64LE, for example, does not have any optimized code, so alignment is
+2; dividing that by the item size (which is greater than 2) gives 0.
+This produces a divide-by-zero later, so the vector length should be
+limited to 1 at minimum.
+
+Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
+---
+ pyfastnoisesimd/helpers.py | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/pyfastnoisesimd/helpers.py b/pyfastnoisesimd/helpers.py
+index b62aeba..a9e8c58 100644
+--- a/pyfastnoisesimd/helpers.py
++++ b/pyfastnoisesimd/helpers.py
+@@ -59,7 +59,7 @@ def empty_coords(length, dtype=np.float32, n_byte=ext.SIMD_ALIGNMENT):
+     itemsize = dtype.itemsize
+ 
+     # We need to expand length to be a multiple of the vector size
+-    vect_len = ext.SIMD_ALIGNMENT // itemsize
++    vect_len = max(ext.SIMD_ALIGNMENT // itemsize, 1)
+     aligned_len = int(vect_len*np.ceil(length/vect_len))
+     shape = (3, aligned_len)
+ 
+@@ -125,7 +125,7 @@ def aligned_chunks(array, n_chunks, axis=0):
+         block_size = np.product(array.shape[axis:])
+     # print(f'Got blocksize of {block_size}')
+ 
+-    vect_len = ext.SIMD_ALIGNMENT // array.dtype.itemsize
++    vect_len = max(ext.SIMD_ALIGNMENT // array.dtype.itemsize, 1)
+ 
+     if block_size % vect_len == 0:
+         # Iterate at-will, the underlying blocks have the correct shape
+@@ -880,7 +880,6 @@ class Noise(object):
+         # for I, ((result_chunk, r_offset), (coord_chunk, offset)) in enumerate(zip(
+         #             aligned_chunks(result, self._num_workers, axis=0),
+         #             aligned_chunks(coords, self._num_workers, axis=1))):
+-        vect_len = ext.SIMD_ALIGNMENT // itemsize
+         for I, (result_chunk, offset) in enumerate(
+                     aligned_chunks(result, self._num_workers, axis=0)):
+ 
+-- 
+2.20.1
+
diff --git a/0006-Use-getauxval-to-check-for-NEON-on-Linux.patch b/0006-Use-getauxval-to-check-for-NEON-on-Linux.patch
deleted file mode 100644
index a804bcc..0000000
--- a/0006-Use-getauxval-to-check-for-NEON-on-Linux.patch
+++ /dev/null
@@ -1,51 +0,0 @@
-From 26c18ebe8438d28bfc047a1145131c17b63cad60 Mon Sep 17 00:00:00 2001
-From: Elliott Sales de Andrade <quantum.analyst@gmail.com>
-Date: Fri, 20 Jul 2018 23:17:13 -0400
-Subject: [PATCH 6/6] Use getauxval to check for NEON on Linux.
-
-Signed-off-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
----
- pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp | 11 +++++++++--
- 1 file changed, 9 insertions(+), 2 deletions(-)
-
-diff --git a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
-index 0d09b18..42137c9 100644
---- a/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
-+++ b/pyfastnoisesimd/fastnoisesimd/FastNoiseSIMD.cpp
-@@ -70,7 +70,10 @@
- #ifdef _WIN32
- #include <intrin.h>
- #elif defined(FN_ARM)
--#if !defined(__aarch64__) && !defined(FN_IOS)
-+#if defined(__linux__) || defined(linux) || defined(__LINUX__) || defined(__linux)
-+#include <sys/auxv.h>
-+#include <asm/hwcap.h>
-+#elif !defined(__aarch64__) && !defined(FN_IOS)
- #include "ARM/cpu-features.h"
- #endif
- #else // 'nix
-@@ -87,6 +90,10 @@ int GetFastestSIMD()
- {
- #if defined(__aarch64__) || defined(FN_IOS)
- 	return FN_NEON;
-+#elif defined(__linux__) || defined(linux) || defined(__LINUX__) || defined(__linux)
-+	if ((getauxval(AT_HWCAP) & HWCAP_NEON) != 0) {
-+		return FN_NEON;
-+	}
- #else
- 	if (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM)
- 	{
-@@ -98,9 +105,9 @@ int GetFastestSIMD()
- #endif
- 				return FN_NEON;
- 	}
-+#endif
- 
- 	return FN_NO_SIMD_FALLBACK;
--#endif
- }
- #else
- 
--- 
-2.17.1
-
diff --git a/python-pyfastnoisesimd.spec b/python-pyfastnoisesimd.spec
index a702e41..9f2e2e9 100644
--- a/python-pyfastnoisesimd.spec
+++ b/python-pyfastnoisesimd.spec
@@ -1,27 +1,25 @@
 %global srcname pyfastnoisesimd
 
 Name:           python-%{srcname}
-Version:        0.3.2
+Version:        0.4.1
 Release:        1%{?dist}
 Summary:        Python Fast Noise with SIMD
 
 License:        BSD
 URL:            http://github.com/robbmcleod/pyfastnoisesimd
 Source0:        https://files.pythonhosted.org/packages/source/p/%{srcname}/%{srcname}-%{version}.tar.gz
-# https://github.com/Auburns/FastNoiseSIMD/pull/26
-Patch0001:      0001-Add-cstdlib-for-posix_memalign.patch
 # https://github.com/Auburns/FastNoiseSIMD/commit/32873404111701397781fe9ef21931fed4f7f766
 # https://github.com/Auburns/FastNoiseSIMD/commit/575c0047bbfd2bac841359daa9db220a9f97a638
 # https://github.com/Auburns/FastNoiseSIMD/pull/27
-Patch0002:      0002-Update-casts-for-NEON.patch
+Patch0001:      0001-Update-casts-for-NEON.patch
 # https://github.com/Auburns/FastNoiseSIMD/pull/31
-Patch0003:      0003-Use-fallback-for-PPC64-and-S390x.patch
-# https://github.com/robbmcleod/pyfastnoisesimd/pull/14
-Patch0004:      0004-Use-correct-types-to-parse-NumPy-array-dimensions.patch
+Patch0002:      0002-Use-fallback-for-PPC64-and-S390x.patch
 # https://github.com/robbmcleod/pyfastnoisesimd/pull/15
-Patch0005:      0005-Add-platform-specific-flags-for-NEON.patch
+Patch0003:      0003-Add-platform-specific-flags-for-NEON.patch
 # https://github.com/Auburns/FastNoiseSIMD/pull/32
-Patch0006:      0006-Use-getauxval-to-check-for-NEON-on-Linux.patch
+Patch0004:      0004-Use-getauxval-to-check-for-NEON-on-Linux.patch
+# https://github.com/robbmcleod/pyfastnoisesimd/pull/20
+Patch0005:      0005-Fix-alignment-on-non-optimized-systems.patch
 
 %global _description \
 PyFastNoiseSIMD is a wrapper around Jordan Peck's synthetic noise library which \
@@ -55,8 +53,6 @@ Requires:       python3dist(numpy) > 1.7
 
 # Remove bundled egg-info
 rm -rf %{srcname}.egg-info
-# Remove no longer necessary file
-rm %{srcname}/cpuinfo.py
 # Fix line endings
 for file in README.rst; do
     sed "s|\r||g" ${file} > ${file}.new
@@ -89,5 +85,8 @@ popd
 
 
 %changelog
+* Sun Jan 13 2019 Elliott Sales de Andrade <quantum.analyst@gmail.com> - 0.4.1-1
+- Update to 0.4.1
+
 * Fri Jul 20 2018 Elliott Sales de Andrade <quantum.analyst@gmail.com> - 0.3.2-1
 - Initial package.
diff --git a/sources b/sources
index b77911a..e702da5 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-SHA512 (pyfastnoisesimd-0.3.2.tar.gz) = 90468a86587a7ff9773fca7fa11d0e0249561c92faef5a4a827238112a83c0dd1f88119f7355b344241361eac39b288606247ab94f92fe1221d8b354a5887336
+SHA512 (pyfastnoisesimd-0.4.1.tar.gz) = 6f8f0957b3e71e7221221af3bbfff0617ffd550461803c936baeb0dba217b42f223d3b0121b5072d5f61a3a45f0718bcca31e78196a53b30a513b4cf34c3f301