diff -Nrup ffmpeg.orig/libavcodec/ppc/fft_altivec.S ffmpeg/libavcodec/ppc/fft_altivec.S
--- ffmpeg.orig/libavcodec/ppc/fft_altivec.S 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/fft_altivec.S 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,458 @@
+/*
+ * FFT transform with Altivec optimizations
+ * Copyright (c) 2009 Loren Merritt
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are not individually interchangeable with the C versions.
+ * While C takes arrays of FFTComplex, Altivec leaves intermediate results
+ * in blocks grouped to fit the vector width,
+ * i.e. {4x real, 4x imaginary, 4x real, ...}
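+ * (e.g. an 8-point transform keeps z[0..7] as {r0,r1,r2,r3}, {i0,i1,i2,i3},
+ *  {r4,r5,r6,r7}, {i4,i5,i6,i7}, one 16-byte vector per group)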
+ *
+ * I ignore the standard calling convention.
+ * Instead, the following registers are treated as global constants:
+ * v14: zero
+ * v15..v18: cosines
+ * v19..v29: permutations
+ * r9: 16
+ * r12: ff_cos_tabs
+ * and the rest are free for local use.
+ */
+
+#include "config.h"
+
+#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
+
+#include "asm.S"
+
+.text
+
+.macro addi2 ra, imm // add 32-bit immediate
+.if \imm & 0xffff
+ addi \ra, \ra, \imm@l
+.endif
+.if (\imm+0x8000)>>16
+ addis \ra, \ra, \imm@ha
+.endif
+.endm
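+// addi2: addi sign-extends its 16-bit operand, so \imm@ha is the high half
+// pre-adjusted for that sign extension; each instruction is emitted only when
+// its half of the immediate is nonzero (the +0x8000 test checks the @ha half).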
+
+.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
+ vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
+ vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
+ vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
+ vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
+ vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
+ vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
+ vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
+ vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
+ vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
+ vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
+.endm
+
+.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
+ vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
+ vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
+ vperm \b2,\b0,\b1,v20
+ vperm \b3,\b0,\b1,v21
+ vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
+ vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
+ vaddfp \b0,\b2,\b3
+ vsubfp \b1,\b2,\b3
+ vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
+ vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
+ vmrghw \b2,\b0,\b1
+ vperm \b3,\b0,\b1,v22
+ vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
+ vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
+ vaddfp \b0,\b2,\b3
+ vsubfp \b1,\b2,\b3
+ vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
+ vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
+ vperm \b2,\b0,\b1,v23
+ vperm \b3,\b0,\b1,v24
+.endm
+
+.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
+ vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
+ vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
+ vperm \a2,\a0,\a1,v20 // FFT4 ...
+ vperm \a3,\a0,\a1,v21
+ vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4}
+ vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7}
+ vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7}
+ vaddfp \a0,\a2,\a3
+ vsubfp \a1,\a2,\a3
+ vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
+ vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
+ vmrghw \a2,\a0,\a1
+ vperm \a3,\a0,\a1,v22
+ vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
+ vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
+ vaddfp \a0,\a2,\a3
+ vsubfp \a1,\a2,\a3
+ vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta}
+ vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb}
+ vperm \a2,\a0,\a1,v23
+ vperm \a3,\a0,\a1,v24
+ vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
+ vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
+ vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7}
+ vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7}
+ vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3}
+ vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3}
+.endm
+
+.macro BF d0,d1,s0,s1
+ vsubfp \d1,\s0,\s1
+ vaddfp \d0,\s0,\s1
+.endm
+
+.macro zip d0,d1,s0,s1
+ vmrghw \d0,\s0,\s1
+ vmrglw \d1,\s0,\s1
+.endm
+
+.macro def_fft4 interleave
+fft4\interleave\()_altivec:
+ lvx v0, 0,r3
+ lvx v1,r9,r3
+ FFT4 v0,v1,v2,v3
+.ifnb \interleave
+ zip v0,v1,v2,v3
+ stvx v0, 0,r3
+ stvx v1,r9,r3
+.else
+ stvx v2, 0,r3
+ stvx v3,r9,r3
+.endif
+ blr
+.endm
+
+.macro def_fft8 interleave
+fft8\interleave\()_altivec:
+ addi r4,r3,32
+ lvx v0, 0,r3
+ lvx v1,r9,r3
+ lvx v2, 0,r4
+ lvx v3,r9,r4
+ FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
+.ifnb \interleave
+ zip v4,v5,v0,v1
+ zip v6,v7,v2,v3
+ stvx v4, 0,r3
+ stvx v5,r9,r3
+ stvx v6, 0,r4
+ stvx v7,r9,r4
+.else
+ stvx v0, 0,r3
+ stvx v1,r9,r3
+ stvx v2, 0,r4
+ stvx v3,r9,r4
+.endif
+ blr
+.endm
+
+.macro def_fft16 interleave
+fft16\interleave\()_altivec:
+ addi r5,r3,64
+ addi r6,r3,96
+ addi r4,r3,32
+ lvx v0, 0,r5
+ lvx v1,r9,r5
+ lvx v2, 0,r6
+ lvx v3,r9,r6
+ FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
+ lvx v0, 0,r3
+ lvx v1,r9,r3
+ lvx v2, 0,r4
+ lvx v3,r9,r4
+ FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
+ vmaddfp v8,v4,v15,v14 // r2*wre
+ vmaddfp v9,v5,v15,v14 // i2*wre
+ vmaddfp v10,v6,v15,v14 // r3*wre
+ vmaddfp v11,v7,v15,v14 // i3*wre
+ vmaddfp v8,v5,v16,v8 // i2*wim
+ vnmsubfp v9,v4,v16,v9 // r2*wim
+ vnmsubfp v10,v7,v16,v10 // i3*wim
+ vmaddfp v11,v6,v16,v11 // r3*wim
+ BF v10,v12,v10,v8
+ BF v11,v13,v9,v11
+ BF v0,v4,v0,v10
+ BF v3,v7,v3,v12
+ BF v1,v5,v1,v11
+ BF v2,v6,v2,v13
+.ifnb \interleave
+ zip v8, v9,v0,v1
+ zip v10,v11,v2,v3
+ zip v12,v13,v4,v5
+ zip v14,v15,v6,v7
+ stvx v8, 0,r3
+ stvx v9,r9,r3
+ stvx v10, 0,r4
+ stvx v11,r9,r4
+ stvx v12, 0,r5
+ stvx v13,r9,r5
+ stvx v14, 0,r6
+ stvx v15,r9,r6
+.else
+ stvx v0, 0,r3
+ stvx v4, 0,r5
+ stvx v3,r9,r4
+ stvx v7,r9,r6
+ stvx v1,r9,r3
+ stvx v5,r9,r5
+ stvx v2, 0,r4
+ stvx v6, 0,r6
+.endif
+ blr
+.endm
+
+// void pass(float *z, float *wre, int n)
+.macro PASS interleave, suffix
+fft_pass\suffix\()_altivec:
+ mtctr r5
+ slwi r0,r5,4
+ slwi r7,r5,6 // o2
+ slwi r5,r5,5 // o1
+ add r10,r5,r7 // o3
+ add r0,r4,r0 // wim
+ addi r6,r5,16 // o1+16
+ addi r8,r7,16 // o2+16
+ addi r11,r10,16 // o3+16
+1:
+ lvx v8, 0,r4 // wre
+ lvx v10, 0,r0 // wim
+ sub r0,r0,r9
+ lvx v9, 0,r0
+ vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
+ lvx v4,r3,r7 // r2 = z[o2]
+ lvx v5,r3,r8 // i2 = z[o2+16]
+ lvx v6,r3,r10 // r3 = z[o3]
+ lvx v7,r3,r11 // i3 = z[o3+16]
+ vmaddfp v10,v4,v8,v14 // r2*wre
+ vmaddfp v11,v5,v8,v14 // i2*wre
+ vmaddfp v12,v6,v8,v14 // r3*wre
+ vmaddfp v13,v7,v8,v14 // i3*wre
+ lvx v0, 0,r3 // r0 = z[0]
+ lvx v3,r3,r6 // i1 = z[o1+16]
+ vmaddfp v10,v5,v9,v10 // i2*wim
+ vnmsubfp v11,v4,v9,v11 // r2*wim
+ vnmsubfp v12,v7,v9,v12 // i3*wim
+ vmaddfp v13,v6,v9,v13 // r3*wim
+ lvx v1,r3,r9 // i0 = z[16]
+ lvx v2,r3,r5 // r1 = z[o1]
+ BF v12,v8,v12,v10
+ BF v13,v9,v11,v13
+ BF v0,v4,v0,v12
+ BF v3,v7,v3,v8
+.if !\interleave
+ stvx v0, 0,r3
+ stvx v4,r3,r7
+ stvx v3,r3,r6
+ stvx v7,r3,r11
+.endif
+ BF v1,v5,v1,v13
+ BF v2,v6,v2,v9
+.if !\interleave
+ stvx v1,r3,r9
+ stvx v2,r3,r5
+ stvx v5,r3,r8
+ stvx v6,r3,r10
+.else
+ vmrghw v8,v0,v1
+ vmrglw v9,v0,v1
+ stvx v8, 0,r3
+ stvx v9,r3,r9
+ vmrghw v8,v2,v3
+ vmrglw v9,v2,v3
+ stvx v8,r3,r5
+ stvx v9,r3,r6
+ vmrghw v8,v4,v5
+ vmrglw v9,v4,v5
+ stvx v8,r3,r7
+ stvx v9,r3,r8
+ vmrghw v8,v6,v7
+ vmrglw v9,v6,v7
+ stvx v8,r3,r10
+ stvx v9,r3,r11
+.endif
+ addi r3,r3,32
+ addi r4,r4,16
+ bdnz 1b
+ sub r3,r3,r5
+ blr
+.endm
+
+#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
+
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
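+// vcprm(a,b,c,d) expands to a vperm control vector: bare indices select 32-bit
+// words from the first source operand, s-prefixed indices from the second,
+// e.g. vcprm(0,1,s2,s1) produces {a0,a1,b2,b1}.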
+
+ .rodata
+ .align 4
+fft_data:
+ .float 0, 0, 0, 0
+ .float 1, 0.92387953, M_SQRT1_2, 0.38268343
+ .float 0, 0.38268343, M_SQRT1_2, 0.92387953
+ .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
+ .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
+ vcprm(s0,3,2,1)
+ vcprm(0,1,s2,s1)
+ vcprm(2,3,s0,s3)
+ vcprm(2,s3,3,s2)
+ vcprm(0,1,s0,s1)
+ vcprm(2,3,s2,s3)
+ vcprm(2,3,0,1)
+ vcprm(1,2,s3,s0)
+ vcprm(0,3,s2,s1)
+ vcprm(0,2,s1,s3)
+ vcprm(1,3,s0,s2)
+
+.macro lvm b, r, regs:vararg
+ lvx \r, 0, \b
+ addi \b, \b, 16
+ .ifnb \regs
+ lvm \b, \regs
+ .endif
+.endm
+
+.macro stvm b, r, regs:vararg
+ stvx \r, 0, \b
+ addi \b, \b, 16
+ .ifnb \regs
+ stvm \b, \regs
+ .endif
+.endm
+
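+// Entry point: saves the nonvolatile vector registers and VRSAVE, loads the
+// constants and permutation vectors from fft_data into v14..v29, sets r9/r12,
+// then calls the size-specific fftN_altivec picked from the dispatch table by
+// nbits-2 (nbits is read from the FFTContext in r3; the data pointer in r4
+// becomes the new r3).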
+.macro fft_calc interleave
+extfunc ff_fft_calc\interleave\()_altivec
+ mflr r0
+ stp r0, 2*PS(R(1))
+ stpu r1, -(160+16*PS)(R(1))
+ get_got r11
+ addi r6, r1, 16*PS
+ stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ mfvrsave r0
+ stw r0, 15*PS(R(1))
+#if __APPLE__
+ li r6, 0xfffffffc
+#else
+ li r6, -4
+#endif
+ mtvrsave r6
+
+ movrel r6, fft_data, r11
+ lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
+ lvm r6, v22, v23, v24, v25, v26, v27, v28, v29
+
+ li r9, 16
+ movrel r12, X(ff_cos_tabs), r11
+
+ movrel r6, fft_dispatch_tab\interleave\()_altivec, r11
+ lwz r3, 0(R(3))
+ subi r3, r3, 2
+ slwi r3, r3, 2+ARCH_PPC64
+ lpx r3, r3, r6
+ mtctr r3
+ mr r3, r4
+ bctrl
+
+ addi r6, r1, 16*PS
+ lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ lwz r6, 15*PS(R(1))
+ mtvrsave r6
+ lp r1, 0(R(1))
+ lp r0, 2*PS(R(1))
+ mtlr r0
+ blr
+.endm
+
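+// Split-radix recursion: fft(N/2) on the first half, fft(N/4) on each of the
+// two remaining quarters, then one combining pass over the block using the
+// ff_cos_tabs[\bits] twiddle table (\n/16 iterations of fft_pass).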
+.macro DECL_FFT suffix, bits, n, n2, n4
+fft\n\suffix\()_altivec:
+ mflr r0
+ stp r0,PS*(\bits-3)(R(1))
+ bl fft\n2\()_altivec
+ addi2 r3,\n*4
+ bl fft\n4\()_altivec
+ addi2 r3,\n*2
+ bl fft\n4\()_altivec
+ addi2 r3,\n*-6
+ lp r0,PS*(\bits-3)(R(1))
+ lp r4,\bits*PS(R(12))
+ mtlr r0
+ li r5,\n/16
+ b fft_pass\suffix\()_altivec
+.endm
+
+.macro DECL_FFTS interleave, suffix
+ .text
+ def_fft4 \suffix
+ def_fft8 \suffix
+ def_fft16 \suffix
+ PASS \interleave, \suffix
+ DECL_FFT \suffix, 5, 32, 16, 8
+ DECL_FFT \suffix, 6, 64, 32, 16
+ DECL_FFT \suffix, 7, 128, 64, 32
+ DECL_FFT \suffix, 8, 256, 128, 64
+ DECL_FFT \suffix, 9, 512, 256, 128
+ DECL_FFT \suffix,10, 1024, 512, 256
+ DECL_FFT \suffix,11, 2048, 1024, 512
+ DECL_FFT \suffix,12, 4096, 2048, 1024
+ DECL_FFT \suffix,13, 8192, 4096, 2048
+ DECL_FFT \suffix,14,16384, 8192, 4096
+ DECL_FFT \suffix,15,32768,16384, 8192
+ DECL_FFT \suffix,16,65536,32768,16384
+
+ fft_calc \suffix
+
+ .rodata
+ .align 3
+fft_dispatch_tab\suffix\()_altivec:
+ PTR fft4\suffix\()_altivec
+ PTR fft8\suffix\()_altivec
+ PTR fft16\suffix\()_altivec
+ PTR fft32\suffix\()_altivec
+ PTR fft64\suffix\()_altivec
+ PTR fft128\suffix\()_altivec
+ PTR fft256\suffix\()_altivec
+ PTR fft512\suffix\()_altivec
+ PTR fft1024\suffix\()_altivec
+ PTR fft2048\suffix\()_altivec
+ PTR fft4096\suffix\()_altivec
+ PTR fft8192\suffix\()_altivec
+ PTR fft16384\suffix\()_altivec
+ PTR fft32768\suffix\()_altivec
+ PTR fft65536\suffix\()_altivec
+.endm
+
+DECL_FFTS 0
+DECL_FFTS 1, _interleave
+
+#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
diff -Nrup ffmpeg.orig/libavcodec/ppc/fft_init.c ffmpeg/libavcodec/ppc/fft_init.c
--- ffmpeg.orig/libavcodec/ppc/fft_init.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/fft_init.c 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,167 @@
+/*
+ * FFT/IFFT transforms
+ * AltiVec-enabled
+ * Copyright (c) 2009 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init().
+ * The input data must be permuted beforehand using the s->revtab table.
+ * No 1.0 / sqrt(n) normalization is done.
+ * AltiVec-enabled:
+ * This code assumes that the 'z' pointer is 16-byte aligned.
+ * It also assumes all FFTComplex are 8-byte-aligned pairs of floats.
+ */
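+
+/* Minimal usage sketch (illustrative only; input[] is a caller-provided array
+ * of FFTComplex and error checking is omitted):
+ *
+ *     FFTContext ctx;
+ *     FFTComplex buf[1 << 10];
+ *     ff_fft_init(&ctx, 10, 0);            // forward 1024-point transform
+ *     for (int i = 0; i < 1 << 10; i++)    // permute the input with revtab
+ *         buf[ctx.revtab[i]] = input[i];
+ *     ctx.fft_calc(&ctx, buf);             // in place, no 1/sqrt(n) scaling
+ *     ff_fft_end(&ctx);
+ */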
+
+#if HAVE_VSX
+#include "fft_vsx.h"
+#else
+void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
+#endif
+
+#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX)
+static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+ int j, k;
+ int n = 1 << s->mdct_bits;
+ int n4 = n >> 2;
+ int n8 = n >> 3;
+ int n32 = n >> 5;
+ const uint16_t *revtabj = s->revtab;
+ const uint16_t *revtabk = s->revtab+n4;
+ const vec_f *tcos = (const vec_f*)(s->tcos+n8);
+ const vec_f *tsin = (const vec_f*)(s->tsin+n8);
+ const vec_f *pin = (const vec_f*)(input+n4);
+ vec_f *pout = (vec_f*)(output+n4);
+
+ /* pre rotation */
+ k = n32-1;
+ do {
+ vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
+#define CMULA(p,o0,o1,o2,o3)\
+ a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\
+ b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
+ re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\
+ im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\
+ cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
+ sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
+ r##p = im*cos - re*sin;\
+ i##p = re*cos + im*sin;
+#define STORE2(v,dst)\
+ j = dst;\
+ vec_ste(v, 0, output+j*2);\
+ vec_ste(v, 4, output+j*2);
+#define STORE8(p)\
+ a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
+ b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
+ c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
+ d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
+ STORE2(a, revtabk[ p*2-4]);\
+ STORE2(b, revtabk[ p*2-3]);\
+ STORE2(c, revtabj[-p*2+2]);\
+ STORE2(d, revtabj[-p*2+3]);
+
+ cos0 = tcos[k];
+ sin0 = tsin[k];
+ cos1 = tcos[-k-1];
+ sin1 = tsin[-k-1];
+ CMULA(0, 0,1,2,3);
+ CMULA(1, 2,3,0,1);
+ STORE8(0);
+ STORE8(1);
+ revtabj += 4;
+ revtabk -= 4;
+ k--;
+ } while(k >= 0);
+
+#if HAVE_VSX
+ ff_fft_calc_vsx(s, (FFTComplex*)output);
+#else
+ ff_fft_calc_altivec(s, (FFTComplex*)output);
+#endif
+
+ /* post rotation + reordering */
+ j = -n32;
+ k = n32-1;
+ do {
+ vec_f cos,sin,re,im,a,b,c,d;
+#define CMULB(d0,d1,o)\
+ re = pout[o*2];\
+ im = pout[o*2+1];\
+ cos = tcos[o];\
+ sin = tsin[o];\
+ d0 = im*sin - re*cos;\
+ d1 = re*sin + im*cos;
+
+ CMULB(a,b,j);
+ CMULB(c,d,k);
+ pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2));
+ pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
+ pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2));
+ pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
+ j++;
+ k--;
+ } while(k >= 0);
+}
+
+static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+ int k;
+ int n = 1 << s->mdct_bits;
+ int n4 = n >> 2;
+ int n16 = n >> 4;
+ vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};
+ vec_u32 *p0 = (vec_u32*)(output+n4);
+ vec_u32 *p1 = (vec_u32*)(output+n4*3);
+
+ imdct_half_altivec(s, output + n4, input);
+
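+    /* imdct_half filled the middle half of output[]; extend it by symmetry:
+     * the first quarter becomes the reversed negation of the second quarter,
+     * and the last quarter the mirror of the third. */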
+ for (k = 0; k < n16; k++) {
+ vec_u32 a = p0[k] ^ sign;
+ vec_u32 b = p1[-k-1];
+ p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));
+ p1[k] = vec_perm(b, b, vcprm(3,2,1,0));
+ }
+}
+#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) */
+
+av_cold void ff_fft_init_ppc(FFTContext *s)
+{
+#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX)
+ if (!PPC_ALTIVEC(av_get_cpu_flags()))
+ return;
+
+#if HAVE_VSX
+ s->fft_calc = ff_fft_calc_interleave_vsx;
+#else
+ s->fft_calc = ff_fft_calc_interleave_altivec;
+#endif
+ if (s->mdct_bits >= 5) {
+ s->imdct_calc = imdct_calc_altivec;
+ s->imdct_half = imdct_half_altivec;
+ }
+#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) */
+}
diff -Nrup ffmpeg.orig/libavcodec/ppc/fft_vsx.c ffmpeg/libavcodec/ppc/fft_vsx.c
--- ffmpeg.orig/libavcodec/ppc/fft_vsx.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/fft_vsx.c 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,226 @@
+/*
+ * FFT transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+#include "fft_vsx.h"
+
+#if HAVE_VSX
+
+static void fft32_vsx_interleave(FFTComplex *z)
+{
+ fft16_vsx_interleave(z);
+ fft8_vsx_interleave(z+16);
+ fft8_vsx_interleave(z+24);
+ pass_vsx_interleave(z,ff_cos_32,4);
+}
+
+static void fft64_vsx_interleave(FFTComplex *z)
+{
+ fft32_vsx_interleave(z);
+ fft16_vsx_interleave(z+32);
+ fft16_vsx_interleave(z+48);
+ pass_vsx_interleave(z,ff_cos_64, 8);
+}
+static void fft128_vsx_interleave(FFTComplex *z)
+{
+ fft64_vsx_interleave(z);
+ fft32_vsx_interleave(z+64);
+ fft32_vsx_interleave(z+96);
+ pass_vsx_interleave(z,ff_cos_128,16);
+}
+static void fft256_vsx_interleave(FFTComplex *z)
+{
+ fft128_vsx_interleave(z);
+ fft64_vsx_interleave(z+128);
+ fft64_vsx_interleave(z+192);
+ pass_vsx_interleave(z,ff_cos_256,32);
+}
+static void fft512_vsx_interleave(FFTComplex *z)
+{
+ fft256_vsx_interleave(z);
+ fft128_vsx_interleave(z+256);
+ fft128_vsx_interleave(z+384);
+ pass_vsx_interleave(z,ff_cos_512,64);
+}
+static void fft1024_vsx_interleave(FFTComplex *z)
+{
+ fft512_vsx_interleave(z);
+ fft256_vsx_interleave(z+512);
+ fft256_vsx_interleave(z+768);
+ pass_vsx_interleave(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx_interleave(FFTComplex *z)
+{
+ fft1024_vsx_interleave(z);
+ fft512_vsx_interleave(z+1024);
+ fft512_vsx_interleave(z+1536);
+ pass_vsx_interleave(z,ff_cos_2048,256);
+}
+static void fft4096_vsx_interleave(FFTComplex *z)
+{
+ fft2048_vsx_interleave(z);
+ fft1024_vsx_interleave(z+2048);
+ fft1024_vsx_interleave(z+3072);
+ pass_vsx_interleave(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx_interleave(FFTComplex *z)
+{
+ fft4096_vsx_interleave(z);
+ fft2048_vsx_interleave(z+4096);
+ fft2048_vsx_interleave(z+6144);
+ pass_vsx_interleave(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx_interleave(FFTComplex *z)
+{
+ fft8192_vsx_interleave(z);
+ fft4096_vsx_interleave(z+8192);
+ fft4096_vsx_interleave(z+12288);
+ pass_vsx_interleave(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx_interleave(FFTComplex *z)
+{
+ fft16384_vsx_interleave(z);
+ fft8192_vsx_interleave(z+16384);
+ fft8192_vsx_interleave(z+24576);
+ pass_vsx_interleave(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx_interleave(FFTComplex *z)
+{
+ fft32768_vsx_interleave(z);
+ fft16384_vsx_interleave(z+32768);
+ fft16384_vsx_interleave(z+49152);
+ pass_vsx_interleave(z,ff_cos_65536,8192);
+}
+
+static void fft32_vsx(FFTComplex *z)
+{
+ fft16_vsx(z);
+ fft8_vsx(z+16);
+ fft8_vsx(z+24);
+ pass_vsx(z,ff_cos_32,4);
+}
+
+static void fft64_vsx(FFTComplex *z)
+{
+ fft32_vsx(z);
+ fft16_vsx(z+32);
+ fft16_vsx(z+48);
+ pass_vsx(z,ff_cos_64, 8);
+}
+static void fft128_vsx(FFTComplex *z)
+{
+ fft64_vsx(z);
+ fft32_vsx(z+64);
+ fft32_vsx(z+96);
+ pass_vsx(z,ff_cos_128,16);
+}
+static void fft256_vsx(FFTComplex *z)
+{
+ fft128_vsx(z);
+ fft64_vsx(z+128);
+ fft64_vsx(z+192);
+ pass_vsx(z,ff_cos_256,32);
+}
+static void fft512_vsx(FFTComplex *z)
+{
+ fft256_vsx(z);
+ fft128_vsx(z+256);
+ fft128_vsx(z+384);
+ pass_vsx(z,ff_cos_512,64);
+}
+static void fft1024_vsx(FFTComplex *z)
+{
+ fft512_vsx(z);
+ fft256_vsx(z+512);
+ fft256_vsx(z+768);
+ pass_vsx(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx(FFTComplex *z)
+{
+ fft1024_vsx(z);
+ fft512_vsx(z+1024);
+ fft512_vsx(z+1536);
+ pass_vsx(z,ff_cos_2048,256);
+}
+static void fft4096_vsx(FFTComplex *z)
+{
+ fft2048_vsx(z);
+ fft1024_vsx(z+2048);
+ fft1024_vsx(z+3072);
+ pass_vsx(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx(FFTComplex *z)
+{
+ fft4096_vsx(z);
+ fft2048_vsx(z+4096);
+ fft2048_vsx(z+6144);
+ pass_vsx(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx(FFTComplex *z)
+{
+ fft8192_vsx(z);
+ fft4096_vsx(z+8192);
+ fft4096_vsx(z+12288);
+ pass_vsx(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx(FFTComplex *z)
+{
+ fft16384_vsx(z);
+ fft8192_vsx(z+16384);
+ fft8192_vsx(z+24576);
+ pass_vsx(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx(FFTComplex *z)
+{
+ fft32768_vsx(z);
+ fft16384_vsx(z+32768);
+ fft16384_vsx(z+49152);
+ pass_vsx(z,ff_cos_65536,8192);
+}
+
+static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
+ fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,
+ fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,
+};
+static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
+ fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,
+ fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,
+ fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,
+};
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
+{
+ fft_dispatch_vsx_interleave[s->nbits-2](z);
+}
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
+{
+ fft_dispatch_vsx[s->nbits-2](z);
+}
+#endif /* HAVE_VSX */
diff -Nrup ffmpeg.orig/libavcodec/ppc/fft_vsx.h ffmpeg/libavcodec/ppc/fft_vsx.h
--- ffmpeg.orig/libavcodec/ppc/fft_vsx.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/fft_vsx.h 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,829 @@
+#ifndef AVCODEC_PPC_FFT_VSX_H
+#define AVCODEC_PPC_FFT_VSX_H
+/*
+ * FFT transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein, and fft_altivec.S.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+
+#if HAVE_VSX
+
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
+
+
+#define byte_2complex (2*sizeof(FFTComplex))
+#define byte_4complex (4*sizeof(FFTComplex))
+#define byte_6complex (6*sizeof(FFTComplex))
+#define byte_8complex (8*sizeof(FFTComplex))
+#define byte_10complex (10*sizeof(FFTComplex))
+#define byte_12complex (12*sizeof(FFTComplex))
+#define byte_14complex (14*sizeof(FFTComplex))
+
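+/* One combining pass: recombines the four sub-blocks at z, z+2n, z+4n and z+6n
+ * using cosines read forwards from wre[] and sines read backwards from wim[]
+ * (which starts 2*n floats after wre); the VSX counterpart of the asm PASS
+ * macro in fft_altivec.S. */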
+inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
+{
+ int o1 = n<<1;
+ int o2 = n<<2;
+ int o3 = o1+o2;
+ int i1, i2, i3;
+ FFTSample* out = (FFTSample*)z;
+ const FFTSample *wim = wre+o1;
+ vec_f vz0, vzo1, vzo2, vzo3;
+ vec_f x0, x1, x2, x3;
+ vec_f x4, x5, x6, x7;
+ vec_f x8, x9, x10, x11;
+ vec_f x12, x13, x14, x15;
+ vec_f x16, x17, x18, x19;
+ vec_f x20, x21, x22, x23;
+ vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
+ vec_f y0, y1, y2, y3;
+ vec_f y4, y5, y8, y9;
+ vec_f y10, y13, y14, y15;
+ vec_f y16, y17, y18, y19;
+ vec_f y20, y21, y22, y23;
+ vec_f wr1, wi1, wr0, wi0;
+ vec_f wr2, wi2, wr3, wi3;
+ vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
+
+ n = n-2;
+ i1 = o1*sizeof(FFTComplex);
+ i2 = o2*sizeof(FFTComplex);
+ i3 = o3*sizeof(FFTComplex);
+ vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
+ vzo2plus1 = vec_ld(i2+16, &(out[0]));
+ vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
+ vzo3plus1 = vec_ld(i3+16, &(out[0]));
+ vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
+ vz0plus1 = vec_ld(16, &(out[0]));
+ vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
+ vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+ x0 = vec_add(vzo2, vzo3);
+ x1 = vec_sub(vzo2, vzo3);
+ y0 = vec_add(vzo2plus1, vzo3plus1);
+ y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+ wr1 = vec_splats(wre[1]);
+ wi1 = vec_splats(wim[-1]);
+ wi2 = vec_splats(wim[-2]);
+ wi3 = vec_splats(wim[-3]);
+ wr2 = vec_splats(wre[2]);
+ wr3 = vec_splats(wre[3]);
+
+ x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+ x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+
+ y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+ y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+ y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+ y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+
+ ymulwi2 = vec_mul(y4, wi2);
+ ymulwi3 = vec_mul(y5, wi3);
+ x4 = vec_mul(x2, wr1);
+ x5 = vec_mul(x3, wi1);
+ y8 = vec_madd(y2, wr2, ymulwi2);
+ y9 = vec_msub(y2, wr2, ymulwi2);
+ x6 = vec_add(x4, x5);
+ x7 = vec_sub(x4, x5);
+ y13 = vec_madd(y3, wr3, ymulwi3);
+ y14 = vec_msub(y3, wr3, ymulwi3);
+
+ x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
+ y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+ y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+ x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
+ x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));
+
+ y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+ y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+ x11 = vec_add(vz0, x9);
+ x12 = vec_sub(vz0, x9);
+ x13 = vec_add(vzo1, x10);
+ x14 = vec_sub(vzo1, x10);
+
+ y18 = vec_add(vz0plus1, y16);
+ y19 = vec_sub(vz0plus1, y16);
+ y20 = vec_add(vzo1plus1, y17);
+ y21 = vec_sub(vzo1plus1, y17);
+
+ x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
+ x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
+ y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+ y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+
+ vec_st(x11, 0, &(out[0]));
+ vec_st(y18, 16, &(out[0]));
+ vec_st(x15, i1, &(out[0]));
+ vec_st(y22, i1+16, &(out[0]));
+ vec_st(x12, i2, &(out[0]));
+ vec_st(y19, i2+16, &(out[0]));
+ vec_st(x16, i3, &(out[0]));
+ vec_st(y23, i3+16, &(out[0]));
+
+ do {
+ out += 8;
+ wre += 4;
+ wim -= 4;
+ wr0 = vec_splats(wre[0]);
+ wr1 = vec_splats(wre[1]);
+ wi0 = vec_splats(wim[0]);
+ wi1 = vec_splats(wim[-1]);
+
+ wr2 = vec_splats(wre[2]);
+ wr3 = vec_splats(wre[3]);
+ wi2 = vec_splats(wim[-2]);
+ wi3 = vec_splats(wim[-3]);
+
+ vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
+ vzo2plus1 = vec_ld(i2+16, &(out[0]));
+ vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
+ vzo3plus1 = vec_ld(i3+16, &(out[0]));
+ vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
+ vz0plus1 = vec_ld(16, &(out[0]));
+ vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
+ vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+ x0 = vec_add(vzo2, vzo3);
+ x1 = vec_sub(vzo2, vzo3);
+
+ y0 = vec_add(vzo2plus1, vzo3plus1);
+ y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+ x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
+ x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+ x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
+ x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+
+ y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+ y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+ xmulwi0 = vec_mul(x4, wi0);
+ xmulwi1 = vec_mul(x5, wi1);
+
+ y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+ y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+
+ x8 = vec_madd(x2, wr0, xmulwi0);
+ x9 = vec_msub(x2, wr0, xmulwi0);
+ ymulwi2 = vec_mul(y4, wi2);
+ ymulwi3 = vec_mul(y5, wi3);
+
+ x13 = vec_madd(x3, wr1, xmulwi1);
+ x14 = vec_msub(x3, wr1, xmulwi1);
+
+ y8 = vec_madd(y2, wr2, ymulwi2);
+ y9 = vec_msub(y2, wr2, ymulwi2);
+ y13 = vec_madd(y3, wr3, ymulwi3);
+ y14 = vec_msub(y3, wr3, ymulwi3);
+
+ x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
+ x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
+
+ y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+ y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+ x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
+ x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
+
+ y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+ y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+ x18 = vec_add(vz0, x16);
+ x19 = vec_sub(vz0, x16);
+ x20 = vec_add(vzo1, x17);
+ x21 = vec_sub(vzo1, x17);
+
+ y18 = vec_add(vz0plus1, y16);
+ y19 = vec_sub(vz0plus1, y16);
+ y20 = vec_add(vzo1plus1, y17);
+ y21 = vec_sub(vzo1plus1, y17);
+
+ x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
+ x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
+
+ y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+ y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+ vec_st(x18, 0, &(out[0]));
+ vec_st(y18, 16, &(out[0]));
+ vec_st(x22, i1, &(out[0]));
+ vec_st(y22, i1+16, &(out[0]));
+ vec_st(x19, i2, &(out[0]));
+ vec_st(y19, i2+16, &(out[0]));
+ vec_st(x23, i3, &(out[0]));
+ vec_st(y23, i3+16, &(out[0]));
+ } while (n-=2);
+}
+
+inline static void fft2_vsx_interleave(FFTComplex *z)
+{
+ FFTSample r1, i1;
+
+ r1 = z[0].re - z[1].re;
+ z[0].re += z[1].re;
+ z[1].re = r1;
+
+ i1 = z[0].im - z[1].im;
+ z[0].im += z[1].im;
+ z[1].im = i1;
+}
+
+inline static void fft4_vsx_interleave(FFTComplex *z)
+{
+ vec_f a, b, c, d;
+ float* out= (float*)z;
+ a = vec_ld(0, &(out[0]));
+ b = vec_ld(byte_2complex, &(out[0]));
+
+ c = vec_perm(a, b, vcprm(0,1,s2,s1));
+ d = vec_perm(a, b, vcprm(2,3,s0,s3));
+ a = vec_add(c, d);
+ b = vec_sub(c, d);
+
+ c = vec_perm(a, b, vcprm(0,1,s0,s1));
+ d = vec_perm(a, b, vcprm(2,3,s3,s2));
+
+ a = vec_add(c, d);
+ b = vec_sub(c, d);
+ vec_st(a, 0, &(out[0]));
+ vec_st(b, byte_2complex, &(out[0]));
+}
+
+inline static void fft8_vsx_interleave(FFTComplex *z)
+{
+ vec_f vz0, vz1, vz2, vz3;
+ vec_f x0, x1, x2, x3;
+ vec_f x4, x5, x6, x7;
+ vec_f x8, x9, x10, x11;
+ vec_f x12, x13, x14, x15;
+ vec_f x16, x17, x18, x19;
+ vec_f x20, x21, x22, x23;
+ vec_f x24, x25, x26, x27;
+ vec_f x28, x29, x30, x31;
+ vec_f x32, x33, x34;
+
+ float* out= (float*)z;
+ vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+ vz0 = vec_ld(0, &(out[0]));
+ vz1 = vec_ld(byte_2complex, &(out[0]));
+ vz2 = vec_ld(byte_4complex, &(out[0]));
+ vz3 = vec_ld(byte_6complex, &(out[0]));
+
+ x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+ x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
+ x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
+
+ x4 = vec_add(x0, x1);
+ x5 = vec_sub(x0, x1);
+ x6 = vec_add(x2, x3);
+ x7 = vec_sub(x2, x3);
+
+ x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
+ x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
+ x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
+ x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
+
+ x12 = vec_add(x8, x9);
+ x13 = vec_sub(x8, x9);
+ x14 = vec_add(x10, x11);
+ x15 = vec_sub(x10, x11);
+ x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
+ x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
+ x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
+ x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i
+ x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i
+
+ x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
+ x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
+ x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
+ x24 = vec_add(x22, x23);
+ x25 = vec_sub(x22, x23);
+ x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
+
+ x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i
+ x28 = vec_sub(x21, x26); //z5.r z3.r z5.i z7.i
+
+ x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i
+ x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i
+ x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i
+ x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i
+ x33 = vec_perm(x30, x32, vcprm(0,1,s2,3)); // z2.r z2.i z3.r z3.i
+ x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i
+
+ vec_st(x29, 0, &(out[0]));
+ vec_st(x33, byte_2complex, &(out[0]));
+ vec_st(x31, byte_4complex, &(out[0]));
+ vec_st(x34, byte_6complex, &(out[0]));
+}
+
+inline static void fft16_vsx_interleave(FFTComplex *z)
+{
+ float* out= (float*)z;
+ vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+ vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
+ vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
+ vec_f vz0, vz1, vz2, vz3;
+ vec_f vz4, vz5, vz6, vz7;
+ vec_f x0, x1, x2, x3;
+ vec_f x4, x5, x6, x7;
+ vec_f x8, x9, x10, x11;
+ vec_f x12, x13, x14, x15;
+ vec_f x16, x17, x18, x19;
+ vec_f x20, x21, x22, x23;
+ vec_f x24, x25, x26, x27;
+ vec_f x28, x29, x30, x31;
+ vec_f x32, x33, x34, x35;
+ vec_f x36, x37, x38, x39;
+ vec_f x40, x41, x42, x43;
+ vec_f x44, x45, x46, x47;
+ vec_f x48, x49, x50, x51;
+ vec_f x52, x53, x54, x55;
+ vec_f x56, x57, x58, x59;
+ vec_f x60, x61, x62, x63;
+ vec_f x64, x65, x66, x67;
+ vec_f x68, x69, x70, x71;
+ vec_f x72, x73, x74, x75;
+ vec_f x76, x77, x78, x79;
+ vec_f x80, x81, x82, x83;
+ vec_f x84, x85, x86;
+
+ vz0 = vec_ld(0, &(out[0]));
+ vz1 = vec_ld(byte_2complex, &(out[0]));
+ vz2 = vec_ld(byte_4complex, &(out[0]));
+ vz3 = vec_ld(byte_6complex, &(out[0]));
+ vz4 = vec_ld(byte_8complex, &(out[0]));
+ vz5 = vec_ld(byte_10complex, &(out[0]));
+ vz6 = vec_ld(byte_12complex, &(out[0]));
+ vz7 = vec_ld(byte_14complex, &(out[0]));
+
+ x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+ x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+ x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+ x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
+ x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
+ x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
+ x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
+
+ x8 = vec_add(x0, x1);
+ x9 = vec_sub(x0, x1);
+ x10 = vec_add(x2, x3);
+ x11 = vec_sub(x2, x3);
+
+ x12 = vec_add(x4, x5);
+ x13 = vec_sub(x4, x5);
+ x14 = vec_add(x6, x7);
+ x15 = vec_sub(x6, x7);
+
+ x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
+ x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
+ x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
+ x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
+ x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
+ x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
+ x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
+ x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
+
+ x24 = vec_add(x16, x17);
+ x25 = vec_sub(x16, x17);
+ x26 = vec_add(x18, x19);
+ x27 = vec_sub(x18, x19);
+ x28 = vec_add(x20, x21);
+ x29 = vec_sub(x20, x21);
+ x30 = vec_add(x22, x23);
+ x31 = vec_sub(x22, x23);
+
+ x32 = vec_add(x24, x26);
+ x33 = vec_sub(x24, x26);
+ x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
+
+ x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
+ x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
+ x37 = vec_add(x35, x36);
+ x38 = vec_sub(x35, x36);
+ x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
+
+ x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
+ x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
+ x42 = vec_add(x40, x41);
+ x43 = vec_sub(x40, x41);
+ x44 = vec_mul(x42, vc0);
+ x45 = vec_mul(x43, vc0);
+
+ x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i
+ x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i
+
+ x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
+ x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
+ x50 = vec_add(x48, x49);
+ x51 = vec_sub(x48, x49);
+ x52 = vec_mul(x50, vc1);
+ x53 = vec_mul(x50, vc2);
+ x54 = vec_mul(x51, vc1);
+ x55 = vec_mul(x51, vc2);
+
+ x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
+ x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
+ x58 = vec_add(x56, x57);
+ x59 = vec_sub(x56, x57);
+
+ x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
+ x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
+ x62 = vec_add(x52, x61);
+ x63 = vec_sub(x52, x61);
+ x64 = vec_add(x60, x53);
+ x65 = vec_sub(x60, x53);
+ x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
+ x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
+
+ x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i
+ x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i
+ x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i
+ x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i
+
+ x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
+ x73 = vec_add(x25, x72);
+ x74 = vec_sub(x25, x72);
+ x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
+ x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
+ x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i
+ x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i
+
+ x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i
+ x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i
+ x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i
+ x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i
+ vec_st(x79, 0, &(out[0]));
+ vec_st(x80, byte_2complex, &(out[0]));
+ vec_st(x81, byte_4complex, &(out[0]));
+ vec_st(x82, byte_6complex, &(out[0]));
+ x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i
+ x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i
+ x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i
+ x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i
+ vec_st(x83, byte_8complex, &(out[0]));
+ vec_st(x84, byte_10complex, &(out[0]));
+ vec_st(x85, byte_12complex, &(out[0]));
+ vec_st(x86, byte_14complex, &(out[0]));
+}
+
+inline static void fft4_vsx(FFTComplex *z)
+{
+ vec_f a, b, c, d;
+ float* out= (float*)z;
+ a = vec_ld(0, &(out[0]));
+ b = vec_ld(byte_2complex, &(out[0]));
+
+ c = vec_perm(a, b, vcprm(0,1,s2,s1));
+ d = vec_perm(a, b, vcprm(2,3,s0,s3));
+ a = vec_add(c, d);
+ b = vec_sub(c, d);
+
+ c = vec_perm(a,b, vcprm(0,s0,1,s1));
+ d = vec_perm(a, b, vcprm(2,s3,3,s2));
+
+ a = vec_add(c, d);
+ b = vec_sub(c, d);
+
+ c = vec_perm(a, b, vcprm(0,1,s0,s1));
+ d = vec_perm(a, b, vcprm(2,3,s2,s3));
+
+ vec_st(c, 0, &(out[0]));
+ vec_st(d, byte_2complex, &(out[0]));
+ return;
+}
+
+inline static void fft8_vsx(FFTComplex *z)
+{
+ vec_f vz0, vz1, vz2, vz3;
+ vec_f vz4, vz5, vz6, vz7, vz8;
+
+ float* out= (float*)z;
+ vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
+ vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+ vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+ vz0 = vec_ld(0, &(out[0]));
+ vz1 = vec_ld(byte_2complex, &(out[0]));
+ vz2 = vec_ld(byte_4complex, &(out[0]));
+ vz3 = vec_ld(byte_6complex, &(out[0]));
+
+ vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+ vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+ vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+ vz2 = vec_add(vz6, vz7);
+ vz3 = vec_sub(vz6, vz7);
+ vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+
+ vz0 = vec_add(vz4, vz5);
+ vz1 = vec_sub(vz4, vz5);
+
+ vz3 = vec_madd(vz3, vc1, vc0);
+ vz3 = vec_madd(vz8, vc2, vz3);
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+ vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+ vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+ vz0 = vec_add(vz4, vz5);
+ vz1 = vec_sub(vz4, vz5);
+ vz2 = vec_add(vz6, vz7);
+ vz3 = vec_sub(vz6, vz7);
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+ vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+ vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+
+ vz2 = vec_sub(vz4, vz6);
+ vz3 = vec_sub(vz5, vz7);
+
+ vz0 = vec_add(vz4, vz6);
+ vz1 = vec_add(vz5, vz7);
+
+ vec_st(vz0, 0, &(out[0]));
+ vec_st(vz1, byte_2complex, &(out[0]));
+ vec_st(vz2, byte_4complex, &(out[0]));
+ vec_st(vz3, byte_6complex, &(out[0]));
+ return;
+}
+
+inline static void fft16_vsx(FFTComplex *z)
+{
+ float* out= (float*)z;
+ vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
+ vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+ vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+ vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
+ vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
+ vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};
+
+ vec_f vz0, vz1, vz2, vz3;
+ vec_f vz4, vz5, vz6, vz7;
+ vec_f vz8, vz9, vz10, vz11;
+ vec_f vz12, vz13;
+
+ vz0 = vec_ld(byte_8complex, &(out[0]));
+ vz1 = vec_ld(byte_10complex, &(out[0]));
+ vz2 = vec_ld(byte_12complex, &(out[0]));
+ vz3 = vec_ld(byte_14complex, &(out[0]));
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+ vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
+ vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
+
+ vz0 = vec_add(vz4, vz5);
+ vz1= vec_sub(vz4, vz5);
+ vz2 = vec_add(vz6, vz7);
+ vz3 = vec_sub(vz6, vz7);
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+ vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+ vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
+
+ vz0 = vec_add(vz4, vz5);
+ vz1 = vec_sub(vz4, vz5);
+ vz2 = vec_add(vz6, vz7);
+ vz3 = vec_sub(vz6, vz7);
+
+ vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+ vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+
+ vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+ vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+ vz0 = vec_ld(0, &(out[0]));
+ vz1 = vec_ld(byte_2complex, &(out[0]));
+ vz2 = vec_ld(byte_4complex, &(out[0]));
+ vz3 = vec_ld(byte_6complex, &(out[0]));
+ vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+ vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+ vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+ vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+ vz2 = vec_add(vz10, vz11);
+ vz3 = vec_sub(vz10, vz11);
+ vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+ vz0 = vec_add(vz8, vz9);
+ vz1 = vec_sub(vz8, vz9);
+
+ vz3 = vec_madd(vz3, vc1, vc0);
+ vz3 = vec_madd(vz12, vc2, vz3);
+ vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+ vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+ vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+ vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+ vz0 = vec_add(vz8, vz9);
+ vz1 = vec_sub(vz8, vz9);
+ vz2 = vec_add(vz10, vz11);
+ vz3 = vec_sub(vz10, vz11);
+
+ vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+ vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+ vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+ vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+ vz2 = vec_sub(vz8, vz10);
+ vz3 = vec_sub(vz9, vz11);
+ vz0 = vec_add(vz8, vz10);
+ vz1 = vec_add(vz9, vz11);
+
+ vz8 = vec_madd(vz4, vc3, vc0);
+ vz9 = vec_madd(vz5, vc3, vc0);
+ vz10 = vec_madd(vz6, vc3, vc0);
+ vz11 = vec_madd(vz7, vc3, vc0);
+
+ vz8 = vec_madd(vz5, vc4, vz8);
+ vz9 = vec_madd(vz4, vc5, vz9);
+ vz10 = vec_madd(vz7, vc5, vz10);
+ vz11 = vec_madd(vz6, vc4, vz11);
+
+ vz12 = vec_sub(vz10, vz8);
+ vz10 = vec_add(vz10, vz8);
+
+ vz13 = vec_sub(vz9, vz11);
+ vz11 = vec_add(vz9, vz11);
+
+ vz4 = vec_sub(vz0, vz10);
+ vz0 = vec_add(vz0, vz10);
+
+ vz7= vec_sub(vz3, vz12);
+ vz3= vec_add(vz3, vz12);
+
+ vz5 = vec_sub(vz1, vz11);
+ vz1 = vec_add(vz1, vz11);
+
+ vz6 = vec_sub(vz2, vz13);
+ vz2 = vec_add(vz2, vz13);
+
+ vec_st(vz0, 0, &(out[0]));
+ vec_st(vz1, byte_2complex, &(out[0]));
+ vec_st(vz2, byte_4complex, &(out[0]));
+ vec_st(vz3, byte_6complex, &(out[0]));
+ vec_st(vz4, byte_8complex, &(out[0]));
+ vec_st(vz5, byte_10complex, &(out[0]));
+ vec_st(vz6, byte_12complex, &(out[0]));
+ vec_st(vz7, byte_14complex, &(out[0]));
+ return;
+
+}
+inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
+{
+ int o1 = n<<1;
+ int o2 = n<<2;
+ int o3 = o1+o2;
+ int i1, i2, i3;
+ FFTSample* out = (FFTSample*)z;
+ const FFTSample *wim = wre+o1;
+ vec_f v0, v1, v2, v3;
+ vec_f v4, v5, v6, v7;
+ vec_f v8, v9, v10, v11;
+ vec_f v12, v13;
+
+ n = n-2;
+ i1 = o1*sizeof(FFTComplex);
+ i2 = o2*sizeof(FFTComplex);
+ i3 = o3*sizeof(FFTComplex);
+
+ v8 = vec_ld(0, &(wre[0]));
+ v10 = vec_ld(0, &(wim[0]));
+ v9 = vec_ld(0, &(wim[-4]));
+ v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+ v4 = vec_ld(i2, &(out[0]));
+ v5 = vec_ld(i2+16, &(out[0]));
+ v6 = vec_ld(i3, &(out[0]));
+ v7 = vec_ld(i3+16, &(out[0]));
+ v10 = vec_mul(v4, v8); // r2*wre
+ v11 = vec_mul(v5, v8); // i2*wre
+ v12 = vec_mul(v6, v8); // r3*wre
+ v13 = vec_mul(v7, v8); // i3*wre
+
+ v0 = vec_ld(0, &(out[0])); // r0
+ v3 = vec_ld(i1+16, &(out[0])); // i1
+ v10 = vec_madd(v5, v9, v10); // r2*wim
+ v11 = vec_nmsub(v4, v9, v11); // i2*wim
+ v12 = vec_nmsub(v7, v9, v12); // r3*wim
+ v13 = vec_madd(v6, v9, v13); // i3*wim
+
+ v1 = vec_ld(16, &(out[0])); // i0
+ v2 = vec_ld(i1, &(out[0])); // r1
+ v8 = vec_sub(v12, v10);
+ v12 = vec_add(v12, v10);
+ v9 = vec_sub(v11, v13);
+ v13 = vec_add(v11, v13);
+ v4 = vec_sub(v0, v12);
+ v0 = vec_add(v0, v12);
+ v7 = vec_sub(v3, v8);
+ v3 = vec_add(v3, v8);
+
+ vec_st(v0, 0, &(out[0])); // r0
+ vec_st(v3, i1+16, &(out[0])); // i1
+ vec_st(v4, i2, &(out[0])); // r2
+ vec_st(v7, i3+16, &(out[0]));// i3
+
+ v5 = vec_sub(v1, v13);
+ v1 = vec_add(v1, v13);
+ v6 = vec_sub(v2, v9);
+ v2 = vec_add(v2, v9);
+
+ vec_st(v1, 16, &(out[0])); // i0
+ vec_st(v2, i1, &(out[0])); // r1
+ vec_st(v5, i2+16, &(out[0])); // i2
+ vec_st(v6, i3, &(out[0])); // r3
+
+ do {
+ out += 8;
+ wre += 4;
+ wim -= 4;
+
+ v8 = vec_ld(0, &(wre[0]));
+ v10 = vec_ld(0, &(wim[0]));
+ v9 = vec_ld(0, &(wim[-4]));
+ v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+ v4 = vec_ld(i2, &(out[0])); // r2
+ v5 = vec_ld(i2+16, &(out[0])); // i2
+ v6 = vec_ld(i3, &(out[0])); // r3
+ v7 = vec_ld(i3+16, &(out[0]));// i3
+ v10 = vec_mul(v4, v8); // r2*wre
+ v11 = vec_mul(v5, v8); // i2*wre
+ v12 = vec_mul(v6, v8); // r3*wre
+ v13 = vec_mul(v7, v8); // i3*wre
+
+ v0 = vec_ld(0, &(out[0])); // r0
+ v3 = vec_ld(i1+16, &(out[0])); // i1
+ v10 = vec_madd(v5, v9, v10); // r2*wim
+ v11 = vec_nmsub(v4, v9, v11); // i2*wim
+ v12 = vec_nmsub(v7, v9, v12); // r3*wim
+ v13 = vec_madd(v6, v9, v13); // i3*wim
+
+ v1 = vec_ld(16, &(out[0])); // i0
+ v2 = vec_ld(i1, &(out[0])); // r1
+ v8 = vec_sub(v12, v10);
+ v12 = vec_add(v12, v10);
+ v9 = vec_sub(v11, v13);
+ v13 = vec_add(v11, v13);
+ v4 = vec_sub(v0, v12);
+ v0 = vec_add(v0, v12);
+ v7 = vec_sub(v3, v8);
+ v3 = vec_add(v3, v8);
+
+ vec_st(v0, 0, &(out[0])); // r0
+ vec_st(v3, i1+16, &(out[0])); // i1
+ vec_st(v4, i2, &(out[0])); // r2
+ vec_st(v7, i3+16, &(out[0])); // i3
+
+ v5 = vec_sub(v1, v13);
+ v1 = vec_add(v1, v13);
+ v6 = vec_sub(v2, v9);
+ v2 = vec_add(v2, v9);
+
+ vec_st(v1, 16, &(out[0])); // i0
+ vec_st(v2, i1, &(out[0])); // r1
+ vec_st(v5, i2+16, &(out[0])); // i2
+ vec_st(v6, i3, &(out[0])); // r3
+ } while (n-=2);
+}
+
+#endif
+
+#endif /* AVCODEC_PPC_FFT_VSX_H */
diff -Nrup ffmpeg.orig/libavcodec/ppc/hpeldsp_altivec.c ffmpeg/libavcodec/ppc/hpeldsp_altivec.c
--- ffmpeg.orig/libavcodec/ppc/hpeldsp_altivec.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/hpeldsp_altivec.c 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#include "libavcodec/hpeldsp.h"
+
+#include "hpeldsp_altivec.h"
+
+#if HAVE_ALTIVEC
+/* next one assumes that ((line_size % 16) == 0) */
+void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ register vector unsigned char pixelsv1;
+ register vector unsigned char pixelsv1B;
+ register vector unsigned char pixelsv1C;
+ register vector unsigned char pixelsv1D;
+
+ int i;
+ register ptrdiff_t line_size_2 = line_size << 1;
+ register ptrdiff_t line_size_3 = line_size + line_size_2;
+ register ptrdiff_t line_size_4 = line_size << 2;
+
+// hand-unrolling the loop by 4 gains about 15%
+// minimum execution time goes from 74 to 60 cycles
+// it's faster than -funroll-loops, but using
+// -funroll-loops w/ this is bad - 74 cycles again.
+// all this is on a 7450, tuning for the 7450
+ for (i = 0; i < h; i += 4) {
+ pixelsv1 = unaligned_load( 0, pixels);
+ pixelsv1B = unaligned_load(line_size, pixels);
+ pixelsv1C = unaligned_load(line_size_2, pixels);
+ pixelsv1D = unaligned_load(line_size_3, pixels);
+ VEC_ST(pixelsv1, 0, (unsigned char*)block);
+ VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
+ VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
+ VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
+ pixels+=line_size_4;
+ block +=line_size_4;
+ }
+}
+
+/* next one assumes that ((line_size % 16) == 0) */
+#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
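+/* op_avg is the scalar SWAR form of the same rounding-up byte average that
+   vec_avg computes below: (a|b) - ((a^b)>>1) per byte, with the 0xFEFEFEFE
+   mask keeping the shift from leaking bits across byte lanes. */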
+void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ register vector unsigned char pixelsv, blockv;
+
+ int i;
+ for (i = 0; i < h; i++) {
+ blockv = vec_ld(0, block);
+ pixelsv = VEC_LD( 0, pixels);
+ blockv = vec_avg(blockv,pixelsv);
+ vec_st(blockv, 0, (unsigned char*)block);
+ pixels+=line_size;
+ block +=line_size;
+ }
+}
+
+/* next one assumes that ((line_size % 8) == 0) */
+static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
+{
+ register vector unsigned char pixelsv, blockv;
+ int i;
+
+ for (i = 0; i < h; i++) {
+        /* block is 8-byte aligned, so we're either in the
+           left block (16-byte aligned) or in the right block (not) */
+ int rightside = ((unsigned long)block & 0x0000000F);
+
+ blockv = vec_ld(0, block);
+ pixelsv = VEC_LD( 0, pixels);
+
+ if (rightside) {
+ pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
+ } else {
+ pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
+ }
+
+ blockv = vec_avg(blockv, pixelsv);
+
+ vec_st(blockv, 0, block);
+
+ pixels += line_size;
+ block += line_size;
+ }
+}
+
+/* next one assumes that ((line_size % 8) == 0) */
+static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ register int i;
+ register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+ register vector unsigned char blockv;
+ register vector unsigned short pixelssum1, pixelssum2, temp3;
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+ pixelsv1 = VEC_LD(0, pixels);
+ pixelsv2 = VEC_LD(1, pixels);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+
+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ pixelssum1 = vec_add(pixelssum1, vctwo);
+
+ for (i = 0; i < h ; i++) {
+ int rightside = ((unsigned long)block & 0x0000000F);
+ blockv = vec_ld(0, block);
+
+ pixelsv1 = unaligned_load(line_size, pixels);
+ pixelsv2 = unaligned_load(line_size+1, pixels);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ temp3 = vec_add(pixelssum1, pixelssum2);
+ temp3 = vec_sra(temp3, vctwo);
+ pixelssum1 = vec_add(pixelssum2, vctwo);
+ pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
+
+ if (rightside) {
+ blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+ } else {
+ blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+ }
+
+ vec_st(blockv, 0, block);
+
+ block += line_size;
+ pixels += line_size;
+ }
+}
+
+/* next one assumes that ((line_size % 8) == 0) */
+static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ register int i;
+ register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+ register vector unsigned char blockv;
+ register vector unsigned short pixelssum1, pixelssum2, temp3;
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+ pixelsv1 = VEC_LD(0, pixels);
+ pixelsv2 = VEC_LD(1, pixels);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ pixelssum1 = vec_add(pixelssum1, vcone);
+
+ for (i = 0; i < h ; i++) {
+ int rightside = ((unsigned long)block & 0x0000000F);
+ blockv = vec_ld(0, block);
+
+ pixelsv1 = unaligned_load(line_size, pixels);
+ pixelsv2 = unaligned_load(line_size+1, pixels);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ temp3 = vec_add(pixelssum1, pixelssum2);
+ temp3 = vec_sra(temp3, vctwo);
+ pixelssum1 = vec_add(pixelssum2, vcone);
+ pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
+
+ if (rightside) {
+ blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+ } else {
+ blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+ }
+
+ vec_st(blockv, 0, block);
+
+ block += line_size;
+ pixels += line_size;
+ }
+}
+
+/* next one assumes that ((line_size % 16) == 0) */
+static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
+{
+ register int i;
+ register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+ register vector unsigned char blockv;
+ register vector unsigned short temp3, temp4,
+ pixelssum1, pixelssum2, pixelssum3, pixelssum4;
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+ pixelsv1 = VEC_LD(0, pixels);
+ pixelsv2 = VEC_LD(1, pixels);
+ pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+ pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum3 = vec_add((vector unsigned short)pixelsv3,
+ (vector unsigned short)pixelsv4);
+ pixelssum3 = vec_add(pixelssum3, vctwo);
+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ pixelssum1 = vec_add(pixelssum1, vctwo);
+
+ for (i = 0; i < h ; i++) {
+ blockv = vec_ld(0, block);
+
+ pixelsv1 = unaligned_load(line_size, pixels);
+ pixelsv2 = unaligned_load(line_size+1, pixels);
+
+ pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+ pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum4 = vec_add((vector unsigned short)pixelsv3,
+ (vector unsigned short)pixelsv4);
+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ temp4 = vec_add(pixelssum3, pixelssum4);
+ temp4 = vec_sra(temp4, vctwo);
+ temp3 = vec_add(pixelssum1, pixelssum2);
+ temp3 = vec_sra(temp3, vctwo);
+
+ pixelssum3 = vec_add(pixelssum4, vctwo);
+ pixelssum1 = vec_add(pixelssum2, vctwo);
+
+ blockv = vec_packsu(temp3, temp4);
+
+ vec_st(blockv, 0, block);
+
+ block += line_size;
+ pixels += line_size;
+ }
+}
+
+/* next one assumes that ((line_size % 16) == 0) */
+static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
+{
+ register int i;
+ register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+ register vector unsigned char blockv;
+ register vector unsigned short temp3, temp4,
+ pixelssum1, pixelssum2, pixelssum3, pixelssum4;
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+ pixelsv1 = VEC_LD(0, pixels);
+ pixelsv2 = VEC_LD(1, pixels);
+ pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+ pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum3 = vec_add((vector unsigned short)pixelsv3,
+ (vector unsigned short)pixelsv4);
+ pixelssum3 = vec_add(pixelssum3, vcone);
+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ pixelssum1 = vec_add(pixelssum1, vcone);
+
+ for (i = 0; i < h ; i++) {
+ pixelsv1 = unaligned_load(line_size, pixels);
+ pixelsv2 = unaligned_load(line_size+1, pixels);
+
+ pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+ pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum4 = vec_add((vector unsigned short)pixelsv3,
+ (vector unsigned short)pixelsv4);
+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ temp4 = vec_add(pixelssum3, pixelssum4);
+ temp4 = vec_sra(temp4, vctwo);
+ temp3 = vec_add(pixelssum1, pixelssum2);
+ temp3 = vec_sra(temp3, vctwo);
+
+ pixelssum3 = vec_add(pixelssum4, vcone);
+ pixelssum1 = vec_add(pixelssum2, vcone);
+
+ blockv = vec_packsu(temp3, temp4);
+
+ VEC_ST(blockv, 0, block);
+
+ block += line_size;
+ pixels += line_size;
+ }
+}
+
+/* next one assumes that ((line_size % 8) == 0) */
+static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ register int i;
+ register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+ register vector unsigned char blockv, blocktemp;
+ register vector unsigned short pixelssum1, pixelssum2, temp3;
+
+ register const vector unsigned char vczero = (const vector unsigned char)
+ vec_splat_u8(0);
+ register const vector unsigned short vctwo = (const vector unsigned short)
+ vec_splat_u16(2);
+
+ pixelsv1 = VEC_LD(0, pixels);
+ pixelsv2 = VEC_LD(1, pixels);
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ pixelssum1 = vec_add(pixelssum1, vctwo);
+
+ for (i = 0; i < h ; i++) {
+ int rightside = ((unsigned long)block & 0x0000000F);
+ blockv = vec_ld(0, block);
+
+ pixelsv1 = unaligned_load(line_size, pixels);
+ pixelsv2 = unaligned_load(line_size+1, pixels);
+
+ pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+ pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+ (vector unsigned short)pixelsv2);
+ temp3 = vec_add(pixelssum1, pixelssum2);
+ temp3 = vec_sra(temp3, vctwo);
+ pixelssum1 = vec_add(pixelssum2, vctwo);
+ pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
+
+ if (rightside) {
+ blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+ } else {
+ blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+ }
+
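+        /* avg_ semantics: average the freshly interpolated pixels with
+         * what is already stored in the destination block */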
+ blockv = vec_avg(blocktemp, blockv);
+ vec_st(blockv, 0, block);
+
+ block += line_size;
+ pixels += line_size;
+ }
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
+{
+#if HAVE_ALTIVEC
+ if (!PPC_ALTIVEC(av_get_cpu_flags()))
+ return;
+
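+    /* tab[size][dxy]: size 0 is the 16-wide case, 1 the 8-wide case;
+     * dxy 3 is the case offset by half a pixel in both x and y */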
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec;
+ c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
+ c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
+
+ c->put_pixels_tab[0][0] = ff_put_pixels16_altivec;
+ c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
+ c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
+ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
+ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff -Nrup ffmpeg.orig/libavcodec/ppc/hpeldsp_altivec.h ffmpeg/libavcodec/ppc/hpeldsp_altivec.h
--- ffmpeg.orig/libavcodec/ppc/hpeldsp_altivec.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/hpeldsp_altivec.h 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PPC_HPELDSP_ALTIVEC_H
+#define AVCODEC_PPC_HPELDSP_ALTIVEC_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+#endif /* AVCODEC_PPC_HPELDSP_ALTIVEC_H */
diff -Nrup ffmpeg.orig/libavcodec/ppc/mathops.h ffmpeg/libavcodec/ppc/mathops.h
--- ffmpeg.orig/libavcodec/ppc/mathops.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/mathops.h 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,79 @@
+/*
+ * simple math operations
+ * Copyright (c) 2001, 2002 Fabrice Bellard
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PPC_MATHOPS_H
+#define AVCODEC_PPC_MATHOPS_H
+
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/common.h"
+
+#if HAVE_PPC4XX
+/* signed 16x16 -> 32 multiply add accumulate */
+#define MAC16(rt, ra, rb) \
+ __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
+
+/* signed 16x16 -> 32 multiply */
+#define MUL16(ra, rb) \
+ ({ int __rt; \
+ __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
+ __rt; })
+#endif
+
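+/* signed 32x32 -> 32: returns the high 32 bits of the 64-bit product */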
+#define MULH MULH
+static inline av_const int MULH(int a, int b){
+ int r;
+ __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+ return r;
+}
+
+#if !ARCH_PPC64
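+/* 64-bit multiply-accumulate built from 32-bit mullw/mulhw plus an
+ * add-with-carry chain, for builds without 64-bit GPRs */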
+static inline av_const int64_t MAC64(int64_t d, int a, int b)
+{
+ union { uint64_t x; unsigned hl[2]; } x = { d };
+ int h, l;
+ __asm__ ("mullw %3, %4, %5 \n\t"
+ "mulhw %2, %4, %5 \n\t"
+ "addc %1, %1, %3 \n\t"
+ "adde %0, %0, %2 \n\t"
+ : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
+ : "r"(a), "r"(b));
+ return x.x;
+}
+#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
+
+static inline av_const int64_t MLS64(int64_t d, int a, int b)
+{
+ union { uint64_t x; unsigned hl[2]; } x = { d };
+ int h, l;
+ __asm__ ("mullw %3, %4, %5 \n\t"
+ "mulhw %2, %4, %5 \n\t"
+ "subfc %1, %3, %1 \n\t"
+ "subfe %0, %2, %0 \n\t"
+ : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
+ : "r"(a), "r"(b));
+ return x.x;
+}
+#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
+#endif
+
+#endif /* AVCODEC_PPC_MATHOPS_H */
diff -Nrup ffmpeg.orig/libavcodec/ppc/mpegaudiodsp_altivec.c ffmpeg/libavcodec/ppc/mpegaudiodsp_altivec.c
--- ffmpeg.orig/libavcodec/ppc/mpegaudiodsp_altivec.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/mpegaudiodsp_altivec.c 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,141 @@
+/*
+ * Altivec optimized MP3 decoding functions
+ * Copyright (c) 2010 Vitor Sessak
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/internal.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+#if HAVE_ALTIVEC
+
+#define MACS(rt, ra, rb) rt+=(ra)*(rb)
+#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
+
+#define SUM8(op, sum, w, p) \
+{ \
+ op(sum, (w)[0 * 64], (p)[0 * 64]); \
+ op(sum, (w)[1 * 64], (p)[1 * 64]); \
+ op(sum, (w)[2 * 64], (p)[2 * 64]); \
+ op(sum, (w)[3 * 64], (p)[3 * 64]); \
+ op(sum, (w)[4 * 64], (p)[4 * 64]); \
+ op(sum, (w)[5 * 64], (p)[5 * 64]); \
+ op(sum, (w)[6 * 64], (p)[6 * 64]); \
+ op(sum, (w)[7 * 64], (p)[7 * 64]); \
+}
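+// The window and sample buffers are laid out with a stride of 64 floats
+// between the successive taps of one output sample, hence the [j * 64]
+// indexing above.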
+
+static void apply_window(const float *buf, const float *win1,
+ const float *win2, float *sum1, float *sum2, int len)
+{
+ const vector float *win1a = (const vector float *) win1;
+ const vector float *win2a = (const vector float *) win2;
+ const vector float *bufa = (const vector float *) buf;
+ vector float *sum1a = (vector float *) sum1;
+ vector float *sum2a = (vector float *) sum2;
+ vector float av_uninit(v0), av_uninit(v4);
+ vector float v1, v2, v3;
+
+ len = len >> 2;
+
+#define MULT(a, b) \
+ { \
+ v1 = vec_ld(a, win1a); \
+ v2 = vec_ld(b, win2a); \
+ v3 = vec_ld(a, bufa); \
+ v0 = vec_madd(v3, v1, v0); \
+ v4 = vec_madd(v2, v3, v4); \
+ }
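+// MULT offsets are in bytes: win1/buf taps are 64 floats (256 bytes)
+// apart, win2 taps are 16 floats (64 bytes) apart.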
+
+ while (len--) {
+ v0 = vec_xor(v0, v0);
+ v4 = vec_xor(v4, v4);
+
+ MULT( 0, 0);
+ MULT( 256, 64);
+ MULT( 512, 128);
+ MULT( 768, 192);
+ MULT(1024, 256);
+ MULT(1280, 320);
+ MULT(1536, 384);
+ MULT(1792, 448);
+
+ vec_st(v0, 0, sum1a);
+ vec_st(v4, 0, sum2a);
+ sum1a++;
+ sum2a++;
+ win1a++;
+ win2a++;
+ bufa++;
+ }
+}
+
+static void apply_window_mp3(float *in, float *win, int *unused, float *out,
+ ptrdiff_t incr)
+{
+ LOCAL_ALIGNED_16(float, suma, [17]);
+ LOCAL_ALIGNED_16(float, sumb, [17]);
+ LOCAL_ALIGNED_16(float, sumc, [17]);
+ LOCAL_ALIGNED_16(float, sumd, [17]);
+
+ float sum;
+ int j;
+ float *out2 = out + 32 * incr;
+
+ /* copy to avoid wrap */
+ memcpy(in + 512, in, 32 * sizeof(*in));
+
+ apply_window(in + 16, win , win + 512, suma, sumc, 16);
+ apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
+
+ SUM8(MLSS, suma[0], win + 32, in + 48);
+
+ sumc[ 0] = 0;
+ sumb[16] = 0;
+ sumd[16] = 0;
+
+ out[0 ] = suma[ 0];
+ out += incr;
+ out2 -= incr;
+ for(j=1;j<16;j++) {
+ *out = suma[ j] - sumd[16-j];
+ *out2 = -sumb[16-j] - sumc[ j];
+ out += incr;
+ out2 -= incr;
+ }
+
+ sum = 0;
+ SUM8(MLSS, sum, win + 16 + 32, in + 32);
+ *out = sum;
+}
+
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_mpadsp_init_ppc(MPADSPContext *s)
+{
+#if HAVE_ALTIVEC
+ if (!PPC_ALTIVEC(av_get_cpu_flags()))
+ return;
+
+ s->apply_window_float = apply_window_mp3;
+#endif /* HAVE_ALTIVEC */
+}
diff -Nrup ffmpeg.orig/libavcodec/ppc/videodsp.c ffmpeg/libavcodec/ppc/videodsp.c
--- ffmpeg.orig/libavcodec/ppc/videodsp.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/videodsp.c 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2003-2004 Romain Dolbeau
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/videodsp.h"
+
+static void prefetch_ppc(uint8_t *mem, ptrdiff_t stride, int h)
+{
+ register const uint8_t *p = mem;
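+    /* dcbt is a data-cache-block-touch hint: start fetching the cache
+     * line at address p without stalling */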
+ do {
+ __asm__ volatile ("dcbt 0,%0" : : "r" (p));
+ p += stride;
+ } while(--h);
+}
+
+av_cold void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc)
+{
+ ctx->prefetch = prefetch_ppc;
+}
diff -Nrup ffmpeg.orig/libavcodec/ppc/vorbisdsp_altivec.c ffmpeg/libavcodec/ppc/vorbisdsp_altivec.c
--- ffmpeg.orig/libavcodec/ppc/vorbisdsp_altivec.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/vorbisdsp_altivec.c 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#include "libavcodec/vorbisdsp.h"
+
+#if HAVE_ALTIVEC
+static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
+ intptr_t blocksize)
+{
+ int i;
+ vector float m, a;
+ vector bool int t0, t1;
+    const vector unsigned int v_31 = // vec_splat immediates only cover [-16,15], so build 31 as 15+15+1
+ vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
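+    /* per element: flip ang's sign wherever mag <= 0, then where the
+     * original ang <= 0 fold it into mag, and where ang > 0 rebuild
+     * ang as mag - ang (matching the scalar inverse coupling) */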
+ for (i = 0; i < blocksize; i += 4) {
+ m = vec_ld(0, mag+i);
+ a = vec_ld(0, ang+i);
+ t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
+ t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
+ a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
+ t0 = (vector bool int)vec_and(a, t1);
+ t1 = (vector bool int)vec_andc(a, t1);
+ a = vec_sub(m, (vector float)t1);
+ m = vec_add(m, (vector float)t0);
+ vec_stl(a, 0, ang+i);
+ vec_stl(m, 0, mag+i);
+ }
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_vorbisdsp_init_ppc(VorbisDSPContext *c)
+{
+#if HAVE_ALTIVEC
+ if (!PPC_ALTIVEC(av_get_cpu_flags()))
+ return;
+
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff -Nrup ffmpeg.orig/libavcodec/ppc/vp3dsp_altivec.c ffmpeg/libavcodec/ppc/vp3dsp_altivec.c
--- ffmpeg.orig/libavcodec/ppc/vp3dsp_altivec.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/vp3dsp_altivec.c 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#include "libavcodec/vp3dsp.h"
+
+#if HAVE_ALTIVEC
+
+static const vec_s16 constants =
+ {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
+#if HAVE_BIGENDIAN
+static const vec_u8 interleave_high =
+ {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
+#else
+static const vec_u8 interleave_high =
+ {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+#endif
+
+#define IDCT_START \
+ vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
+ vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\
+ vec_s16 eight = vec_splat_s16(8);\
+ vec_u16 four = vec_splat_u16(4);\
+\
+ vec_s16 C1 = vec_splat(constants, 1);\
+ vec_s16 C2 = vec_splat(constants, 2);\
+ vec_s16 C3 = vec_splat(constants, 3);\
+ vec_s16 C4 = vec_splat(constants, 4);\
+ vec_s16 C5 = vec_splat(constants, 5);\
+ vec_s16 C6 = vec_splat(constants, 6);\
+ vec_s16 C7 = vec_splat(constants, 7);\
+\
+ vec_s16 b0 = vec_ld(0x00, block);\
+ vec_s16 b1 = vec_ld(0x10, block);\
+ vec_s16 b2 = vec_ld(0x20, block);\
+ vec_s16 b3 = vec_ld(0x30, block);\
+ vec_s16 b4 = vec_ld(0x40, block);\
+ vec_s16 b5 = vec_ld(0x50, block);\
+ vec_s16 b6 = vec_ld(0x60, block);\
+ vec_s16 b7 = vec_ld(0x70, block);
+
+// These functions compute (a*C)>>16.
+// Things are tricky because a is signed while C is unsigned:
+// M15 is used if C fits in 15 bits unsigned (C6, C7),
+// M16 is used if C requires the full 16 bits unsigned.
+static inline vec_s16 M15(vec_s16 a, vec_s16 C)
+{
+ return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high);
+}
+static inline vec_s16 M16(vec_s16 a, vec_s16 C)
+{
+ return vec_add(a, M15(a, C));
+}
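+// vec_mule/vec_mulo treat C as signed, so a constant C >= 2^15 behaves
+// as C - 2^16; since (a*(C - 2^16))>>16 == ((a*C)>>16) - a, adding a
+// back once (M16) restores the full 16-bit unsigned multiply.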
+
+#define IDCT_1D(ADD, SHIFT)\
+ A = vec_add(M16(b1, C1), M15(b7, C7));\
+ B = vec_sub(M15(b1, C7), M16(b7, C1));\
+ C = vec_add(M16(b3, C3), M16(b5, C5));\
+ D = vec_sub(M16(b5, C3), M16(b3, C5));\
+\
+ Ad = M16(vec_sub(A, C), C4);\
+ Bd = M16(vec_sub(B, D), C4);\
+\
+ Cd = vec_add(A, C);\
+ Dd = vec_add(B, D);\
+\
+ E = ADD(M16(vec_add(b0, b4), C4));\
+ F = ADD(M16(vec_sub(b0, b4), C4));\
+\
+ G = vec_add(M16(b2, C2), M15(b6, C6));\
+ H = vec_sub(M15(b2, C6), M16(b6, C2));\
+\
+ Ed = vec_sub(E, G);\
+ Gd = vec_add(E, G);\
+\
+ Add = vec_add(F, Ad);\
+ Bdd = vec_sub(Bd, H);\
+\
+ Fd = vec_sub(F, Ad);\
+ Hd = vec_add(Bd, H);\
+\
+ b0 = SHIFT(vec_add(Gd, Cd));\
+ b7 = SHIFT(vec_sub(Gd, Cd));\
+\
+ b1 = SHIFT(vec_add(Add, Hd));\
+ b2 = SHIFT(vec_sub(Add, Hd));\
+\
+ b3 = SHIFT(vec_add(Ed, Dd));\
+ b4 = SHIFT(vec_sub(Ed, Dd));\
+\
+ b5 = SHIFT(vec_add(Fd, Bdd));\
+ b6 = SHIFT(vec_sub(Fd, Bdd));
+
+#define NOP(a) a
+#define ADD8(a) vec_add(a, eight)
+#define SHIFT4(a) vec_sra(a, four)
+
+static void vp3_idct_put_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[64])
+{
+ vec_u8 t;
+ IDCT_START
+
+    // pixels are signed, so add 128*16 in addition to the normal 8
+ vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
+ eight = vec_add(eight, v2048);
+
+ IDCT_1D(NOP, NOP)
+ TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
+ IDCT_1D(ADD8, SHIFT4)
+
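+// Store 8 result bytes as two 32-bit element stores; vec_ste writes one
+// element at a time, so dst only needs 4-byte alignment.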
+#define PUT(a)\
+ t = vec_packsu(a, a);\
+ vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
+ vec_ste((vec_u32)t, 4, (unsigned int *)dst);
+
+ PUT(b0) dst += stride;
+ PUT(b1) dst += stride;
+ PUT(b2) dst += stride;
+ PUT(b3) dst += stride;
+ PUT(b4) dst += stride;
+ PUT(b5) dst += stride;
+ PUT(b6) dst += stride;
+ PUT(b7)
+ memset(block, 0, sizeof(*block) * 64);
+}
+
+static void vp3_idct_add_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[64])
+{
+ LOAD_ZERO;
+ vec_u8 t, vdst;
+ vec_s16 vdst_16;
+ vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));
+
+ IDCT_START
+
+ IDCT_1D(NOP, NOP)
+ TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
+ IDCT_1D(ADD8, SHIFT4)
+
+#if HAVE_BIGENDIAN
+#define GET_VDST16\
+ vdst = vec_ld(0, dst);\
+ vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);
+#else
+#define GET_VDST16\
+ vdst = vec_vsx_ld(0,dst);\
+ vdst_16 = (vec_s16)vec_mergeh(vdst, zero_u8v);
+#endif
+
+#define ADD(a)\
+ GET_VDST16;\
+ vdst_16 = vec_adds(a, vdst_16);\
+ t = vec_packsu(vdst_16, vdst_16);\
+ vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
+ vec_ste((vec_u32)t, 4, (unsigned int *)dst);
+
+ ADD(b0) dst += stride;
+ ADD(b1) dst += stride;
+ ADD(b2) dst += stride;
+ ADD(b3) dst += stride;
+ ADD(b4) dst += stride;
+ ADD(b5) dst += stride;
+ ADD(b6) dst += stride;
+ ADD(b7)
+ memset(block, 0, sizeof(*block) * 64);
+}
+
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
+{
+#if HAVE_ALTIVEC
+ if (!PPC_ALTIVEC(av_get_cpu_flags()))
+ return;
+
+ c->idct_put = vp3_idct_put_altivec;
+ c->idct_add = vp3_idct_add_altivec;
+#endif
+}
diff -Nrup ffmpeg.orig/libavcodec/ppc/vp8dsp_altivec.c ffmpeg/libavcodec/ppc/vp8dsp_altivec.c
--- ffmpeg.orig/libavcodec/ppc/vp8dsp_altivec.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavcodec/ppc/vp8dsp_altivec.c 2018-09-05 15:45:34.368754131 +0200
@@ -0,0 +1,361 @@
+/*
+ * VP8 compatible video decoder
+ *
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#include "libavcodec/vp8dsp.h"
+
+#include "hpeldsp_altivec.h"
+
+#if HAVE_ALTIVEC
+#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
+
+// The h subpel filter uses vec_msum to multiply and add 4 pixel taps at once.
+static const vec_s8 h_subpel_filters_inner[7] =
+{
+ REPT4( -6, 123, 12, -1),
+ REPT4(-11, 108, 36, -8),
+ REPT4( -9, 93, 50, -6),
+ REPT4(-16, 77, 77, -16),
+ REPT4( -6, 50, 93, -9),
+ REPT4( -8, 36, 108, -11),
+ REPT4( -1, 12, 123, -6),
+};
+
+// For the 6-tap filters, these are the outer two taps.
+// The zeros mask off pixels 4-7 when filtering 0-3,
+// and vice versa.
+static const vec_s8 h_subpel_filters_outer[3] =
+{
+ REPT4(0, 0, 2, 1),
+ REPT4(0, 0, 3, 3),
+ REPT4(0, 0, 1, 2),
+};
+
+#define LOAD_H_SUBPEL_FILTER(i) \
+ vec_s8 filter_inner = h_subpel_filters_inner[i]; \
+ vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
+ vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
+
+#if HAVE_BIGENDIAN
+#define GET_PIXHL(offset) \
+ a = vec_ld((offset)-is6tap-1, src); \
+ b = vec_ld((offset)-is6tap-1+15, src); \
+ pixh = vec_perm(a, b, permh##offset); \
+ pixl = vec_perm(a, b, perml##offset)
+
+#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
+#else
+#define GET_PIXHL(offset) \
+ a = vec_vsx_ld((offset)-is6tap-1, src); \
+ pixh = vec_perm(a, a, perm_inner); \
+ pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))
+
+#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
+#endif
+
+#define FILTER_H(dstv, off) \
+ GET_PIXHL(off); \
+ filth = vec_msum(filter_inner, pixh, c64); \
+ filtl = vec_msum(filter_inner, pixl, c64); \
+\
+ if (is6tap) { \
+ GET_OUTER(off); \
+ filth = vec_msum(filter_outerh, outer, filth); \
+ filtl = vec_msum(filter_outerl, outer, filtl); \
+ } \
+ if (w == 4) \
+ filtl = filth; /* discard pixels 4-7 */ \
+ dstv = vec_packs(filth, filtl); \
+ dstv = vec_sra(dstv, c7)
+
+static av_always_inline
+void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
+ uint8_t *src, ptrdiff_t src_stride,
+ int h, int mx, int w, int is6tap)
+{
+ LOAD_H_SUBPEL_FILTER(mx-1);
+#if HAVE_BIGENDIAN
+ vec_u8 align_vec0, align_vec8, permh0, permh8;
+ vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
+ vec_u8 b;
+#endif
+ vec_u8 filt, a, pixh, pixl, outer;
+ vec_s16 f16h, f16l;
+ vec_s32 filth, filtl;
+
+ vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
+ vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
+ vec_u8 perm_inner = is6tap ? perm_inner6 : perm_inner4;
+ vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
+ vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
+ vec_u16 c7 = vec_splat_u16(7);
+
+#if HAVE_BIGENDIAN
+ align_vec0 = vec_lvsl( -is6tap-1, src);
+ align_vec8 = vec_lvsl(8-is6tap-1, src);
+
+ permh0 = vec_perm(align_vec0, align_vec0, perm_inner);
+ permh8 = vec_perm(align_vec8, align_vec8, perm_inner);
+ perm_inner = vec_add(perm_inner, vec_splat_u8(4));
+ perml0 = vec_perm(align_vec0, align_vec0, perm_inner);
+ perml8 = vec_perm(align_vec8, align_vec8, perm_inner);
+ perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
+ perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
+#endif
+
+ while (h --> 0) {
+ FILTER_H(f16h, 0);
+
+ if (w == 16) {
+ FILTER_H(f16l, 8);
+ filt = vec_packsu(f16h, f16l);
+ vec_st(filt, 0, dst);
+ } else {
+ filt = vec_packsu(f16h, f16h);
+ vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
+ if (w == 8)
+ vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+// v subpel filter does a simple vertical multiply + add
+static const vec_u8 v_subpel_filters[7] =
+{
+ { 0, 6, 123, 12, 1, 0 },
+ { 2, 11, 108, 36, 8, 1 },
+ { 0, 9, 93, 50, 6, 0 },
+ { 3, 16, 77, 77, 16, 3 },
+ { 0, 6, 50, 93, 9, 0 },
+ { 1, 8, 36, 108, 11, 2 },
+ { 0, 1, 12, 123, 6, 0 },
+};
+
+#define LOAD_V_SUBPEL_FILTER(i) \
+ vec_u8 subpel_filter = v_subpel_filters[i]; \
+ vec_u8 f0 = vec_splat(subpel_filter, 0); \
+ vec_u8 f1 = vec_splat(subpel_filter, 1); \
+ vec_u8 f2 = vec_splat(subpel_filter, 2); \
+ vec_u8 f3 = vec_splat(subpel_filter, 3); \
+ vec_u8 f4 = vec_splat(subpel_filter, 4); \
+ vec_u8 f5 = vec_splat(subpel_filter, 5)
+
+#define FILTER_V(dstv, vec_mul) \
+ s1f = (vec_s16)vec_mul(s1, f1); \
+ s2f = (vec_s16)vec_mul(s2, f2); \
+ s3f = (vec_s16)vec_mul(s3, f3); \
+ s4f = (vec_s16)vec_mul(s4, f4); \
+ s2f = vec_subs(s2f, s1f); \
+ s3f = vec_subs(s3f, s4f); \
+ if (is6tap) { \
+ s0f = (vec_s16)vec_mul(s0, f0); \
+ s5f = (vec_s16)vec_mul(s5, f5); \
+ s2f = vec_adds(s2f, s0f); \
+ s3f = vec_adds(s3f, s5f); \
+ } \
+ dstv = vec_adds(s2f, s3f); \
+ dstv = vec_adds(dstv, c64); \
+ dstv = vec_sra(dstv, c7)
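+// The tap values are stored as unsigned magnitudes; the two negative
+// taps (rows 1 and 4 of the filter) are applied with the saturating
+// subtracts above, the outer positive taps with saturating adds.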
+
+#if HAVE_BIGENDIAN
+#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
+#else
+#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
+#endif
+
+static av_always_inline
+void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
+ uint8_t *src, ptrdiff_t src_stride,
+ int h, int my, int w, int is6tap)
+{
+ LOAD_V_SUBPEL_FILTER(my-1);
+ vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
+ vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
+ vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
+ vec_u16 c7 = vec_splat_u16(7);
+
+#if HAVE_BIGENDIAN
+ // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
+ // so combine this permute with the alignment permute vector
+ align_vech = vec_lvsl(0, src);
+ align_vecl = vec_sld(align_vech, align_vech, 8);
+ if (w ==16)
+ perm_vec = vec_mergeh(align_vech, align_vecl);
+ else
+ perm_vec = vec_mergeh(align_vech, align_vech);
+#endif
+
+ if (is6tap)
+ s0 = LOAD_HL(-2*src_stride, src, perm_vec);
+ s1 = LOAD_HL(-1*src_stride, src, perm_vec);
+ s2 = LOAD_HL( 0*src_stride, src, perm_vec);
+ s3 = LOAD_HL( 1*src_stride, src, perm_vec);
+ if (is6tap)
+ s4 = LOAD_HL( 2*src_stride, src, perm_vec);
+
+ src += (2+is6tap)*src_stride;
+
+ while (h --> 0) {
+ if (is6tap)
+ s5 = LOAD_HL(0, src, perm_vec);
+ else
+ s4 = LOAD_HL(0, src, perm_vec);
+
+ FILTER_V(f16h, vec_mule);
+
+ if (w == 16) {
+ FILTER_V(f16l, vec_mulo);
+ filt = vec_packsu(f16h, f16l);
+ vec_st(filt, 0, dst);
+ } else {
+ filt = vec_packsu(f16h, f16h);
+ if (w == 4)
+ filt = (vec_u8)vec_splat((vec_u32)filt, 0);
+ else
+ vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
+ vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
+ }
+
+ if (is6tap)
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ if (is6tap)
+ s4 = s5;
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
+#define EPEL_FUNCS(WIDTH, TAPS) \
+static av_noinline \
+void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
+{ \
+ put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
+} \
+\
+static av_noinline \
+void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
+{ \
+ put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
+}
+
+#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
+static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
+{ \
+ DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
+ if (VTAPS == 6) { \
+ put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \
+ put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \
+ } else { \
+ put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \
+ put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \
+ } \
+}
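+// The h pass filters h+5 (6-tap) or h+4 (4-tap) rows into a 16-byte-wide
+// temporary so the v pass has the extra rows of context it needs.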
+
+EPEL_FUNCS(16,6)
+EPEL_FUNCS(8, 6)
+EPEL_FUNCS(8, 4)
+EPEL_FUNCS(4, 6)
+EPEL_FUNCS(4, 4)
+
+EPEL_HV(16, 6,6)
+EPEL_HV(8, 6,6)
+EPEL_HV(8, 4,6)
+EPEL_HV(8, 6,4)
+EPEL_HV(8, 4,4)
+EPEL_HV(4, 6,6)
+EPEL_HV(4, 4,6)
+EPEL_HV(4, 6,4)
+EPEL_HV(4, 4,4)
+
+static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
+{
+ register vector unsigned char perm;
+ int i;
+ register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
+ register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
+ register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
+
+#if HAVE_BIGENDIAN
+ perm = vec_lvsl(0, src);
+#endif
+// hand-unrolling the loop by 4 gains about 15%;
+// minimum execution time goes from 74 to 60 cycles.
+// It is faster than -funroll-loops, but combining
+// -funroll-loops with this is bad: 74 cycles again.
+// All of this was measured on a 7450, tuning for the 7450.
+ for (i = 0; i < h; i += 4) {
+ vec_st(load_with_perm_vec(0, src, perm), 0, dst);
+ vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst);
+ vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
+ vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
+ src += sstride4;
+ dst += dstride4;
+ }
+}
+
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
+{
+#if HAVE_ALTIVEC
+ if (!PPC_ALTIVEC(av_get_cpu_flags()))
+ return;
+
+ c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
+ c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
+ c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
+ c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;
+
+ c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
+ c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
+ c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
+ c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;
+
+ c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec;
+ c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec;
+ c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec;
+ c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec;
+
+ c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
+ c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
+ c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
+ c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;
+
+ c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec;
+ c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
+ c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
+ c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff -Nrup ffmpeg.orig/libavutil/ppc/cpu.c ffmpeg/libavutil/ppc/cpu.c
--- ffmpeg.orig/libavutil/ppc/cpu.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/cpu.c 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,162 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#elif defined(__linux__)
+#include <asm/cputable.h>
+#include <linux/auxvec.h>
+#include <fcntl.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#elif defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#elif defined(__AMIGAOS4__)
+#include <exec/exec.h>
+#include <interfaces/exec.h>
+#include <proto/exec.h>
+#endif /* __APPLE__ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/cpu.h"
+#include "libavutil/cpu_internal.h"
+
+/**
+ * This function MAY rely on signal() or fork() in order to make sure AltiVec
+ * is present.
+ */
+int ff_get_cpu_flags_ppc(void)
+{
+#if HAVE_ALTIVEC
+#ifdef __AMIGAOS4__
+ ULONG result = 0;
+ extern struct ExecIFace *IExec;
+
+ IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
+ if (result == VECTORTYPE_ALTIVEC)
+ return AV_CPU_FLAG_ALTIVEC;
+ return 0;
+#elif defined(__APPLE__) || defined(__OpenBSD__)
+#ifdef __OpenBSD__
+ int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
+#else
+ int sels[2] = {CTL_HW, HW_VECTORUNIT};
+#endif
+ int has_vu = 0;
+ size_t len = sizeof(has_vu);
+ int err;
+
+ err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+ if (err == 0)
+ return has_vu ? AV_CPU_FLAG_ALTIVEC : 0;
+ return 0;
+#elif defined(__linux__)
+    // The Linux kernel may have AltiVec support disabled
+    // even if the CPU supports it.
+ int i, ret = 0;
+ int fd = open("/proc/self/auxv", O_RDONLY);
+ unsigned long buf[64] = { 0 };
+ ssize_t count;
+
+ if (fd < 0)
+ return 0;
+
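+    // /proc/self/auxv is a stream of (type, value) unsigned-long pairs,
+    // terminated by an AT_NULL entry.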
+ while ((count = read(fd, buf, sizeof(buf))) > 0) {
+ for (i = 0; i < count / sizeof(*buf); i += 2) {
+ if (buf[i] == AT_NULL)
+ goto out;
+ if (buf[i] == AT_HWCAP) {
+ if (buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC)
+ ret = AV_CPU_FLAG_ALTIVEC;
+#ifdef PPC_FEATURE_HAS_VSX
+ if (buf[i + 1] & PPC_FEATURE_HAS_VSX)
+ ret |= AV_CPU_FLAG_VSX;
+#endif
+#ifdef PPC_FEATURE_ARCH_2_07
+ if (buf[i + 1] & PPC_FEATURE_HAS_POWER8)
+ ret |= AV_CPU_FLAG_POWER8;
+#endif
+ if (ret & AV_CPU_FLAG_VSX)
+ av_assert0(ret & AV_CPU_FLAG_ALTIVEC);
+ goto out;
+ }
+ }
+ }
+
+out:
+ close(fd);
+ return ret;
+#elif CONFIG_RUNTIME_CPUDETECT && defined(__linux__)
+#define PVR_G4_7400 0x000C
+#define PVR_G5_970 0x0039
+#define PVR_G5_970FX 0x003C
+#define PVR_G5_970MP 0x0044
+#define PVR_G5_970GX 0x0045
+#define PVR_POWER6 0x003E
+#define PVR_POWER7 0x003F
+#define PVR_POWER8 0x004B
+#define PVR_CELL_PPU 0x0070
+ int ret = 0;
+ int proc_ver;
+    // Support for mfspr PVR emulation was added in Linux 2.6.17.
+ __asm__ volatile("mfspr %0, 287" : "=r" (proc_ver));
+ proc_ver >>= 16;
+ if (proc_ver & 0x8000 ||
+ proc_ver == PVR_G4_7400 ||
+ proc_ver == PVR_G5_970 ||
+ proc_ver == PVR_G5_970FX ||
+ proc_ver == PVR_G5_970MP ||
+ proc_ver == PVR_G5_970GX ||
+ proc_ver == PVR_POWER6 ||
+ proc_ver == PVR_POWER7 ||
+ proc_ver == PVR_POWER8 ||
+ proc_ver == PVR_CELL_PPU)
+ ret = AV_CPU_FLAG_ALTIVEC;
+ if (proc_ver == PVR_POWER7 ||
+ proc_ver == PVR_POWER8)
+ ret |= AV_CPU_FLAG_VSX;
+ if (proc_ver == PVR_POWER8)
+ ret |= AV_CPU_FLAG_POWER8;
+
+ return ret;
+#else
+ // Since we were compiled for AltiVec, just assume we have it
+ // until someone comes up with a proper way (not involving signal hacks).
+ return AV_CPU_FLAG_ALTIVEC;
+#endif /* __AMIGAOS4__ */
+#endif /* HAVE_ALTIVEC */
+ return 0;
+}
+
+size_t ff_get_cpu_max_align_ppc(void)
+{
+ int flags = av_get_cpu_flags();
+
+ if (flags & (AV_CPU_FLAG_ALTIVEC |
+ AV_CPU_FLAG_VSX |
+ AV_CPU_FLAG_POWER8))
+ return 16;
+
+ return 8;
+}
diff -Nrup ffmpeg.orig/libavutil/ppc/cpu.h ffmpeg/libavutil/ppc/cpu.h
--- ffmpeg.orig/libavutil/ppc/cpu.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/cpu.h 2018-09-07 13:32:42.161848338 +0200
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PPC_CPU_H
+#define AVUTIL_PPC_CPU_H
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/cpu_internal.h"
+
+#define PPC_ALTIVEC(flags) CPUEXT(flags, ALTIVEC)
+#define PPC_VSX(flags) CPUEXT(flags, VSX)
+#define PPC_POWER8(flags) CPUEXT(flags, POWER8)
+
+#endif /* AVUTIL_PPC_CPU_H */
diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_altivec.c ffmpeg/libavutil/ppc/float_dsp_altivec.c
--- ffmpeg.orig/libavutil/ppc/float_dsp_altivec.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/float_dsp_altivec.c 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "util_altivec.h"
+#include "float_dsp_altivec.h"
+
+void ff_vector_fmul_altivec(float *dst, const float *src0, const float *src1,
+ int len)
+{
+ int i;
+ vec_f d0, d1, s, zero = (vec_f)vec_splat_u32(0);
+ for (i = 0; i < len - 7; i += 8) {
+ d0 = vec_ld( 0, src0 + i);
+ s = vec_ld( 0, src1 + i);
+ d1 = vec_ld(16, src0 + i);
+ d0 = vec_madd(d0, s, zero);
+ d1 = vec_madd(d1, vec_ld(16, src1 + i), zero);
+ vec_st(d0, 0, dst + i);
+ vec_st(d1, 16, dst + i);
+ }
+}
+
+void ff_vector_fmul_window_altivec(float *dst, const float *src0,
+ const float *src1, const float *win, int len)
+{
+ vec_f zero, t0, t1, s0, s1, wi, wj;
+ const vec_u8 reverse = vcprm(3, 2, 1, 0);
+ int i, j;
+
+ dst += len;
+ win += len;
+ src0 += len;
+
+ zero = (vec_f)vec_splat_u32(0);
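+    /* walk src0/win forward (i) and src1/win backward (j), reversing
+     * the backward-read vectors so both products line up element-wise */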
+
+ for (i = -len * 4, j = len * 4 - 16; i < 0; i += 16, j -= 16) {
+ s0 = vec_ld(i, src0);
+ s1 = vec_ld(j, src1);
+ wi = vec_ld(i, win);
+ wj = vec_ld(j, win);
+
+ s1 = vec_perm(s1, s1, reverse);
+ wj = vec_perm(wj, wj, reverse);
+
+ t0 = vec_madd(s0, wj, zero);
+ t0 = vec_nmsub(s1, wi, t0);
+ t1 = vec_madd(s0, wi, zero);
+ t1 = vec_madd(s1, wj, t1);
+ t1 = vec_perm(t1, t1, reverse);
+
+ vec_st(t0, i, dst);
+ vec_st(t1, j, dst);
+ }
+}
+
+void ff_vector_fmul_add_altivec(float *dst, const float *src0,
+ const float *src1, const float *src2,
+ int len)
+{
+ int i;
+ vec_f d, ss0, ss1, ss2, t0, t1, edges;
+
+ for (i = 0; i < len - 3; i += 4) {
+ t0 = vec_ld(0, dst + i);
+ t1 = vec_ld(15, dst + i);
+ ss0 = vec_ld(0, src0 + i);
+ ss1 = vec_ld(0, src1 + i);
+ ss2 = vec_ld(0, src2 + i);
+ edges = vec_perm(t1, t0, vcprm(0, 1, 2, 3));
+ d = vec_madd(ss0, ss1, ss2);
+ t1 = vec_perm(d, edges, vcprm(s0,s1,s2,s3));
+ t0 = vec_perm(edges, d, vcprm(s0,s1,s2,s3));
+ vec_st(t1, 15, dst + i);
+ vec_st(t0, 0, dst + i);
+ }
+}
+
+void ff_vector_fmul_reverse_altivec(float *dst, const float *src0,
+ const float *src1, int len)
+{
+ int i;
+ vec_f d, s0, s1, h0, l0, s2, s3;
+ vec_f zero = (vec_f)vec_splat_u32(0);
+
+ src1 += len-4;
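+    // the mergeh/mergel cascade below reverses the four floats of each
+    // vector loaded from src1 without needing a vec_perm constant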
+ for(i = 0; i < len - 7; i += 8) {
+ s1 = vec_ld(0, src1 - i); // [a,b,c,d]
+ s0 = vec_ld(0, src0 + i);
+ l0 = vec_mergel(s1, s1); // [c,c,d,d]
+ s3 = vec_ld(-16, src1 - i);
+ h0 = vec_mergeh(s1, s1); // [a,a,b,b]
+ s2 = vec_ld(16, src0 + i);
+ s1 = vec_mergeh(vec_mergel(l0, h0), // [d,b,d,b]
+ vec_mergeh(l0, h0)); // [c,a,c,a]
+ // [d,c,b,a]
+ l0 = vec_mergel(s3, s3);
+ d = vec_madd(s0, s1, zero);
+ h0 = vec_mergeh(s3, s3);
+ vec_st(d, 0, dst + i);
+ s3 = vec_mergeh(vec_mergel(l0, h0),
+ vec_mergeh(l0, h0));
+ d = vec_madd(s2, s3, zero);
+ vec_st(d, 16, dst + i);
+ }
+}
diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_altivec.h ffmpeg/libavutil/ppc/float_dsp_altivec.h
--- ffmpeg.orig/libavutil/ppc/float_dsp_altivec.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/float_dsp_altivec.h 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H
+#define AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H
+
+void ff_vector_fmul_altivec(float *dst, const float *src0,
+ const float *src1, int len);
+
+void ff_vector_fmul_window_altivec(float *dst, const float *src0,
+ const float *src1, const float *win,
+ int len);
+
+void ff_vector_fmul_add_altivec(float *dst, const float *src0,
+ const float *src1, const float *src2,
+ int len);
+
+void ff_vector_fmul_reverse_altivec(float *dst, const float *src0,
+ const float *src1, int len);
+
+#endif /* AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H */
diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_init.c ffmpeg/libavutil/ppc/float_dsp_init.c
--- ffmpeg.orig/libavutil/ppc/float_dsp_init.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/float_dsp_init.c 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/ppc/cpu.h"
+#include "float_dsp_altivec.h"
+#include "float_dsp_vsx.h"
+
+av_cold void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int bit_exact)
+{
+ if (PPC_ALTIVEC(av_get_cpu_flags())) {
+ fdsp->vector_fmul = ff_vector_fmul_altivec;
+ fdsp->vector_fmul_add = ff_vector_fmul_add_altivec;
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_altivec;
+
+ if (!bit_exact) {
+ fdsp->vector_fmul_window = ff_vector_fmul_window_altivec;
+ }
+ }
+
+    // The functions disabled below are nearly identical to the AltiVec
+    // versions and have been disabled to reduce code duplication.
+ if (PPC_VSX(av_get_cpu_flags())) {
+// fdsp->vector_fmul = ff_vector_fmul_vsx;
+ fdsp->vector_fmul_add = ff_vector_fmul_add_vsx;
+// fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vsx;
+
+// if (!bit_exact) {
+// fdsp->vector_fmul_window = ff_vector_fmul_window_vsx;
+// }
+ }
+}
diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_vsx.c ffmpeg/libavutil/ppc/float_dsp_vsx.c
--- ffmpeg.orig/libavutil/ppc/float_dsp_vsx.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/float_dsp_vsx.c 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2015 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "util_altivec.h"
+#include "float_dsp_vsx.h"
+
+void ff_vector_fmul_vsx(float *dst,
+ const float *src0, const float *src1,
+ int len)
+{
+ int i;
+ vec_f d0, d1, zero = (vec_f)vec_splat_u32(0);
+ for (i = 0; i < len - 7; i += 8) {
+ d0 = vec_vsx_ld( 0, src0 + i);
+ d1 = vec_vsx_ld(16, src0 + i);
+ d0 = vec_madd(d0, vec_vsx_ld( 0, src1 + i), zero);
+ d1 = vec_madd(d1, vec_vsx_ld(16, src1 + i), zero);
+ vec_vsx_st(d0, 0, dst + i);
+ vec_vsx_st(d1, 16, dst + i);
+ }
+}
+
+void ff_vector_fmul_window_vsx(float *dst, const float *src0,
+ const float *src1, const float *win,
+ int len)
+{
+ vec_f zero, t0, t1, s0, s1, wi, wj;
+ const vec_u8 reverse = vcprm(3, 2, 1, 0);
+ int i, j;
+
+ dst += len;
+ win += len;
+ src0 += len;
+
+ zero = (vec_f)vec_splat_u32(0);
+
+ for (i = -len * 4, j = len * 4 - 16; i < 0; i += 16, j -= 16) {
+ s0 = vec_vsx_ld(i, src0);
+ s1 = vec_vsx_ld(j, src1);
+ wi = vec_vsx_ld(i, win);
+ wj = vec_vsx_ld(j, win);
+
+ s1 = vec_perm(s1, s1, reverse);
+ wj = vec_perm(wj, wj, reverse);
+
+ t0 = vec_madd(s0, wj, zero);
+ t0 = vec_nmsub(s1, wi, t0);
+ t1 = vec_madd(s0, wi, zero);
+ t1 = vec_madd(s1, wj, t1);
+ t1 = vec_perm(t1, t1, reverse);
+
+ vec_vsx_st(t0, i, dst);
+ vec_vsx_st(t1, j, dst);
+ }
+}
+
+void ff_vector_fmul_add_vsx(float *dst, const float *src0,
+ const float *src1, const float *src2,
+ int len)
+{
+ int i;
+ vec_f d, s0, s1, s2;
+
+ for (i = 0; i < len - 3; i += 4) {
+ s0 = vec_vsx_ld(0, src0 + i);
+ s1 = vec_vsx_ld(0, src1 + i);
+ s2 = vec_vsx_ld(0, src2 + i);
+ d = vec_madd(s0, s1, s2);
+ vec_vsx_st(d, 0, dst + i);
+ }
+}
+
+void ff_vector_fmul_reverse_vsx(float *dst, const float *src0,
+ const float *src1, int len)
+{
+ int i;
+ vec_f d, s0, s1, h0, l0, s2, s3;
+ vec_f zero = (vec_f)vec_splat_u32(0);
+
+ src1 += len - 4;
+ for (i = 0; i < len - 7; i += 8) {
+ s1 = vec_vsx_ld(0, src1 - i); // [a,b,c,d]
+ s0 = vec_vsx_ld(0, src0 + i);
+ l0 = vec_mergel(s1, s1); // [c,c,d,d]
+ s3 = vec_vsx_ld(-16, src1 - i);
+ h0 = vec_mergeh(s1, s1); // [a,a,b,b]
+ s2 = vec_vsx_ld(16, src0 + i);
+ s1 = vec_mergeh(vec_mergel(l0, h0), // [d,b,d,b]
+ vec_mergeh(l0, h0)); // [c,a,c,a]
+ // [d,c,b,a]
+ l0 = vec_mergel(s3, s3);
+ d = vec_madd(s0, s1, zero);
+ h0 = vec_mergeh(s3, s3);
+ vec_vsx_st(d, 0, dst + i);
+ s3 = vec_mergeh(vec_mergel(l0, h0),
+ vec_mergeh(l0, h0));
+ d = vec_madd(s2, s3, zero);
+ vec_vsx_st(d, 16, dst + i);
+ }
+}
diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_vsx.h ffmpeg/libavutil/ppc/float_dsp_vsx.h
--- ffmpeg.orig/libavutil/ppc/float_dsp_vsx.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/float_dsp_vsx.h 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2015 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PPC_FLOAT_DSP_VSX_H
+#define AVUTIL_PPC_FLOAT_DSP_VSX_H
+
+void ff_vector_fmul_vsx(float *dst, const float *src0,
+ const float *src1, int len);
+
+void ff_vector_fmul_window_vsx(float *dst, const float *src0,
+ const float *src1, const float *win,
+ int len);
+
+void ff_vector_fmul_add_vsx(float *dst, const float *src0,
+ const float *src1, const float *src2,
+ int len);
+
+void ff_vector_fmul_reverse_vsx(float *dst, const float *src0,
+ const float *src1, int len);
+
+#endif /* AVUTIL_PPC_FLOAT_DSP_VSX_H */
diff -Nrup ffmpeg.orig/libavutil/ppc/intreadwrite.h ffmpeg/libavutil/ppc/intreadwrite.h
--- ffmpeg.orig/libavutil/ppc/intreadwrite.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/intreadwrite.h 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PPC_INTREADWRITE_H
+#define AVUTIL_PPC_INTREADWRITE_H
+
+#include <stdint.h>
+#include "config.h"
+
+#if HAVE_XFORM_ASM
+
+#if HAVE_BIGENDIAN
+#define AV_RL16 av_read_bswap16
+#define AV_WL16 av_write_bswap16
+#define AV_RL32 av_read_bswap32
+#define AV_WL32 av_write_bswap32
+#define AV_RL64 av_read_bswap64
+#define AV_WL64 av_write_bswap64
+
+#else
+#define AV_RB16 av_read_bswap16
+#define AV_WB16 av_write_bswap16
+#define AV_RB32 av_read_bswap32
+#define AV_WB32 av_write_bswap32
+#define AV_RB64 av_read_bswap64
+#define AV_WB64 av_write_bswap64
+
+#endif
+
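+/* the l*brx/st*brx forms load/store with byte reversal in a single
+ * instruction, giving cheap byteswapped accesses on either endianness */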
+static av_always_inline uint16_t av_read_bswap16(const void *p)
+{
+ uint16_t v;
+ __asm__ ("lhbrx %0, %y1" : "=r"(v) : "Z"(*(const uint16_t*)p));
+ return v;
+}
+
+static av_always_inline void av_write_bswap16(void *p, uint16_t v)
+{
+ __asm__ ("sthbrx %1, %y0" : "=Z"(*(uint16_t*)p) : "r"(v));
+}
+
+static av_always_inline uint32_t av_read_bswap32(const void *p)
+{
+ uint32_t v;
+ __asm__ ("lwbrx %0, %y1" : "=r"(v) : "Z"(*(const uint32_t*)p));
+ return v;
+}
+
+static av_always_inline void av_write_bswap32(void *p, uint32_t v)
+{
+ __asm__ ("stwbrx %1, %y0" : "=Z"(*(uint32_t*)p) : "r"(v));
+}
+
+#if HAVE_LDBRX
+
+static av_always_inline uint64_t av_read_bswap64(const void *p)
+{
+ uint64_t v;
+ __asm__ ("ldbrx %0, %y1" : "=r"(v) : "Z"(*(const uint64_t*)p));
+ return v;
+}
+
+static av_always_inline void av_write_bswap64(void *p, uint64_t v)
+{
+ __asm__ ("stdbrx %1, %y0" : "=Z"(*(uint64_t*)p) : "r"(v));
+}
+
+#else
+
+static av_always_inline uint64_t av_read_bswap64(const void *p)
+{
+ union { uint64_t v; uint32_t hl[2]; } v;
+ __asm__ ("lwbrx %0, %y2 \n\t"
+ "lwbrx %1, %y3 \n\t"
+ : "=&r"(v.hl[1]), "=r"(v.hl[0])
+ : "Z"(*(const uint32_t*)p), "Z"(*((const uint32_t*)p+1)));
+ return v.v;
+}
+
+static av_always_inline void av_write_bswap64(void *p, uint64_t v)
+{
+ union { uint64_t v; uint32_t hl[2]; } vv = { v };
+ __asm__ ("stwbrx %2, %y0 \n\t"
+ "stwbrx %3, %y1 \n\t"
+ : "=Z"(*(uint32_t*)p), "=Z"(*((uint32_t*)p+1))
+ : "r"(vv.hl[1]), "r"(vv.hl[0]));
+}
+
+#endif /* HAVE_LDBRX */
+
+#endif /* HAVE_XFORM_ASM */
+
+#endif /* AVUTIL_PPC_INTREADWRITE_H */
diff -Nrup ffmpeg.orig/libavutil/ppc/timer.h ffmpeg/libavutil/ppc/timer.h
--- ffmpeg.orig/libavutil/ppc/timer.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/timer.h 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2005 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PPC_TIMER_H
+#define AVUTIL_PPC_TIMER_H
+
+#include <stdint.h>
+
+#include "config.h"
+
+#define AV_READ_TIME read_time
+
+static inline uint64_t read_time(void)
+{
+    uint32_t tbu, tbl, temp;
+
+    /* from section 2.2.1 of the 32-bit PowerPC PEM: the 64-bit timebase
+       is read as two halves, and the read is retried whenever the upper
+       half ticks over between the two mftbu samples */
+    __asm__ volatile(
+        "mftbu %2\n"      /* upper half, first sample   */
+        "mftb  %0\n"      /* lower half                 */
+        "mftbu %1\n"      /* upper half, second sample  */
+        "cmpw  %2,%1\n"   /* did the upper half change? */
+        "bne   $-0x10\n"  /* yes: redo from first mftbu */
+        : "=r"(tbl), "=r"(tbu), "=r"(temp)
+        :
+        : "cc");
+
+    return (((uint64_t)tbu) << 32) | (uint64_t)tbl;
+}
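+
+/*
+ * Minimal usage sketch (hypothetical helper, illustration only;
+ * FFmpeg's real consumer is the START_TIMER/STOP_TIMER machinery in
+ * libavutil/timer.h): time one call in timebase ticks. The timebase
+ * runs at its own frequency, so the result is neither cycles nor
+ * nanoseconds.
+ */
+static inline uint64_t time_call(void (*fn)(void))
+{
+    uint64_t t0 = read_time();  /* snapshot before the call */
+    fn();
+    return read_time() - t0;    /* elapsed timebase ticks   */
+}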
+
+#endif /* AVUTIL_PPC_TIMER_H */
diff -Nrup ffmpeg.orig/libavutil/ppc/util_altivec.h ffmpeg/libavutil/ppc/util_altivec.h
--- ffmpeg.orig/libavutil/ppc/util_altivec.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg/libavutil/ppc/util_altivec.h 2018-09-05 15:45:34.938766223 +0200
@@ -0,0 +1,195 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Miscellaneous utility macros and inline functions
+ */
+
+#ifndef AVUTIL_PPC_UTIL_ALTIVEC_H
+#define AVUTIL_PPC_UTIL_ALTIVEC_H
+
+#include <stdint.h>
+
+#include "config.h"
+
+/***********************************************************************
+ * Vector types
+ **********************************************************************/
+#define vec_u8 vector unsigned char
+#define vec_s8 vector signed char
+#define vec_u16 vector unsigned short
+#define vec_s16 vector signed short
+#define vec_u32 vector unsigned int
+#define vec_s32 vector signed int
+#define vec_f vector float
+
+/***********************************************************************
+ * Null vector
+ **********************************************************************/
+#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )
+
+#define zero_u8v (vec_u8) zerov
+#define zero_s8v (vec_s8) zerov
+#define zero_u16v (vec_u16) zerov
+#define zero_s16v (vec_s16) zerov
+#define zero_u32v (vec_u32) zerov
+#define zero_s32v (vec_s32) zerov
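+
+/*
+ * Usage sketch (illustration only): LOAD_ZERO declares the shared zero
+ * vector once per scope, and the casts above view it as any lane type:
+ *
+ *     LOAD_ZERO;
+ *     out = vec_max(in, zero_s16v);  // clamp negative lanes to zero
+ */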
+
+#if HAVE_ALTIVEC
+#include <altivec.h>
+
+// used to build register permutation vectors (vcprm);
+// words marked 's' are taken from the _s_econd source vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+#define vcprm(a,b,c,d) (const vec_u8){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
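+
+/*
+ * Example (hypothetical helper, illustration only): plain indices pick
+ * words from the first source and 's' indices from the second, so this
+ * returns { a[0], a[1], b[0], b[1] }.
+ */
+static inline vec_f merge_word_pairs(vec_f a, vec_f b)
+{
+    return vec_perm(a, b, vcprm(0, 1, s0, s1));
+}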
+
+// the following build a permute that swaps the two halfwords within
+// each selected 32-bit word (used for endianness fixups)
+#define SWP_W2S0 0x02,0x03,0x00,0x01
+#define SWP_W2S1 0x06,0x07,0x04,0x05
+#define SWP_W2S2 0x0a,0x0b,0x08,0x09
+#define SWP_W2S3 0x0e,0x0f,0x0c,0x0d
+#define SWP_W2Ss0 0x12,0x13,0x10,0x11
+#define SWP_W2Ss1 0x16,0x17,0x14,0x15
+#define SWP_W2Ss2 0x1a,0x1b,0x18,0x19
+#define SWP_W2Ss3 0x1e,0x1f,0x1c,0x1d
+#define vcswapi2s(a,b,c,d) (const vector unsigned char){SWP_W2S ## a, SWP_W2S ## b, SWP_W2S ## c, SWP_W2S ## d}
+
+// full 16-byte reversal permute
+#define vcswapc() \
+ (const vector unsigned char){0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00}
+
+
+// Transpose 8x8 matrix of 16-bit elements (in-place)
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+ vec_s16 A1, B1, C1, D1, E1, F1, G1, H1; \
+ vec_s16 A2, B2, C2, D2, E2, F2, G2, H2; \
+ \
+ A1 = vec_mergeh (a, e); \
+ B1 = vec_mergel (a, e); \
+ C1 = vec_mergeh (b, f); \
+ D1 = vec_mergel (b, f); \
+ E1 = vec_mergeh (c, g); \
+ F1 = vec_mergel (c, g); \
+ G1 = vec_mergeh (d, h); \
+ H1 = vec_mergel (d, h); \
+ \
+ A2 = vec_mergeh (A1, E1); \
+ B2 = vec_mergel (A1, E1); \
+ C2 = vec_mergeh (B1, F1); \
+ D2 = vec_mergel (B1, F1); \
+ E2 = vec_mergeh (C1, G1); \
+ F2 = vec_mergel (C1, G1); \
+ G2 = vec_mergeh (D1, H1); \
+ H2 = vec_mergel (D1, H1); \
+ \
+ a = vec_mergeh (A2, E2); \
+ b = vec_mergel (A2, E2); \
+ c = vec_mergeh (B2, F2); \
+ d = vec_mergel (B2, F2); \
+ e = vec_mergeh (C2, G2); \
+ f = vec_mergel (C2, G2); \
+ g = vec_mergeh (D2, H2); \
+ h = vec_mergel (D2, H2); \
+} while (0)
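+
+/*
+ * Usage sketch (hypothetical helper, illustration only): transpose an
+ * 8x8 block of int16_t in place, e.g. between the row and column passes
+ * of an IDCT. The block is assumed 16-byte aligned for vec_ld/vec_st.
+ */
+static inline void transpose_block(int16_t buf[8][8])
+{
+    vec_s16 a = vec_ld(  0, buf[0]), b = vec_ld( 16, buf[0]);
+    vec_s16 c = vec_ld( 32, buf[0]), d = vec_ld( 48, buf[0]);
+    vec_s16 e = vec_ld( 64, buf[0]), f = vec_ld( 80, buf[0]);
+    vec_s16 g = vec_ld( 96, buf[0]), h = vec_ld(112, buf[0]);
+    TRANSPOSE8(a, b, c, d, e, f, g, h);
+    vec_st(a,   0, buf[0]); vec_st(b,  16, buf[0]);
+    vec_st(c,  32, buf[0]); vec_st(d,  48, buf[0]);
+    vec_st(e,  64, buf[0]); vec_st(f,  80, buf[0]);
+    vec_st(g,  96, buf[0]); vec_st(h, 112, buf[0]);
+}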
+
+
+#if HAVE_BIGENDIAN
+#define VEC_LD(offset,b) \
+ vec_perm(vec_ld(offset, b), vec_ld((offset)+15, b), vec_lvsl(offset, b))
+#else
+#define VEC_LD(offset,b) \
+ vec_vsx_ld(offset, b)
+#endif
+
+/** @brief Loads an unaligned vector from @a src at byte offset
+    @a offset and returns it. */
+#if HAVE_BIGENDIAN
+static inline vec_u8 unaligned_load(int offset, const uint8_t *src)
+{
+ register vec_u8 first = vec_ld(offset, src);
+ register vec_u8 second = vec_ld(offset + 15, src);
+ register vec_u8 mask = vec_lvsl(offset, src);
+ return vec_perm(first, second, mask);
+}
+static inline vec_u8 load_with_perm_vec(int offset, const uint8_t *src, vec_u8 perm_vec)
+{
+ vec_u8 a = vec_ld(offset, src);
+ vec_u8 b = vec_ld(offset + 15, src);
+ return vec_perm(a, b, perm_vec);
+}
+#else
+#define unaligned_load(a,b) VEC_LD(a,b)
+#define load_with_perm_vec(a,b,c) VEC_LD(a,b)
+#endif
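+
+/*
+ * Usage sketch (hypothetical helper, illustration only): when walking
+ * src with a stride that is a multiple of 16, the lvsl mask depends
+ * only on the initial misalignment, so it can be computed once and
+ * reused via load_with_perm_vec(); dst is assumed 16-byte aligned.
+ */
+#if HAVE_BIGENDIAN
+static inline void copy_rows16(uint8_t *dst, const uint8_t *src,
+                               int stride, int rows)
+{
+    vec_u8 perm = vec_lvsl(0, src);  /* loop-invariant alignment mask */
+    int i;
+    for (i = 0; i < rows; i++)
+        vec_st(load_with_perm_vec(i * stride, src, perm), i * 16, dst);
+}
+#endif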
+
+
+/**
+ * Loads a vector from pointer @a b with no alignment requirement
+ * (offset 0).
+ */
+#define vec_unaligned_load(b) VEC_LD(0, b)
+
+#if HAVE_BIGENDIAN
+#define VEC_MERGEH(a, b) vec_mergeh(a, b)
+#define VEC_MERGEL(a, b) vec_mergel(a, b)
+#else
+#define VEC_MERGEH(a, b) vec_mergeh(b, a)
+#define VEC_MERGEL(a, b) vec_mergel(b, a)
+#endif
+
+#if HAVE_BIGENDIAN
+#define VEC_ST(a,b,c) vec_st(a,b,c)
+#else
+#define VEC_ST(a,b,c) vec_vsx_st(a,b,c)
+#endif
+
+#if HAVE_BIGENDIAN
+#define VEC_SPLAT16(a,b) vec_splat((vec_s16)(a), b)
+#else
+#define VEC_SPLAT16(a,b) vec_splat((vec_s16)(vec_perm(a, a, vcswapi2s(0,1,2,3))), b)
+#endif
+
+#if HAVE_BIGENDIAN
+#define VEC_SLD16(a,b,c) vec_sld(a, b, c)
+#else
+#define VEC_SLD16(a,b,c) vec_sld(b, a, c)
+#endif
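+
+/*
+ * Example (hypothetical helper, illustration only): through these
+ * wrappers, code written for big-endian element order runs unchanged
+ * on little-endian; the operand swap makes VEC_MERGEH(a, b) yield the
+ * big-endian interleave { a0, b0, a1, b1, ... } on both hosts for data
+ * moved with VEC_LD/VEC_ST.
+ */
+static inline vec_s16 interleave_high_s16(vec_s16 a, vec_s16 b)
+{
+    return VEC_MERGEH(a, b);
+}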
+
+#endif /* HAVE_ALTIVEC */
+
+#if HAVE_VSX
+#if HAVE_BIGENDIAN
+#define vsx_ld_u8_s16(off, p) \
+ ((vec_s16)vec_mergeh((vec_u8)vec_splat_u8(0), \
+ (vec_u8)vec_vsx_ld((off), (p))))
+#else
+#define vsx_ld_u8_s16(off, p) \
+ ((vec_s16)vec_mergeh((vec_u8)vec_vsx_ld((off), (p)), \
+ (vec_u8)vec_splat_u8(0)))
+#endif /* HAVE_BIGENDIAN */
+#endif /* HAVE_VSX */
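+
+/* Usage sketch (hypothetical helper, illustration only): widen eight
+   unsigned pixels to signed 16-bit lanes, e.g. ahead of a weighted
+   multiply. */
+#if HAVE_VSX
+static inline vec_s16 load_u8x8_as_s16(const uint8_t *p)
+{
+    return vsx_ld_u8_s16(0, p);  /* { p[0], ..., p[7] }, zero-extended */
+}
+#endif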
+
+#endif /* AVUTIL_PPC_UTIL_ALTIVEC_H */