diff -Nrup ffmpeg.orig/libavcodec/ppc/fft_altivec.S ffmpeg/libavcodec/ppc/fft_altivec.S --- ffmpeg.orig/libavcodec/ppc/fft_altivec.S 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/fft_altivec.S 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,458 @@ +/* + * FFT transform with Altivec optimizations + * Copyright (c) 2009 Loren Merritt + * + * This algorithm (though not any of the implementation details) is + * based on libdjbfft by D. J. Bernstein. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * These functions are not individually interchangeable with the C versions. + * While C takes arrays of FFTComplex, Altivec leaves intermediate results + * in blocks as convenient to the vector size. + * i.e. {4x real, 4x imaginary, 4x real, ...} + * + * I ignore standard calling convention. + * Instead, the following registers are treated as global constants: + * v14: zero + * v15..v18: cosines + * v19..v29: permutations + * r9: 16 + * r12: ff_cos_tabs + * and the rest are free for local use. + */ + +#include "config.h" + +#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN + +#include "asm.S" + +.text + +.macro addi2 ra, imm // add 32-bit immediate +.if \imm & 0xffff + addi \ra, \ra, \imm@l +.endif +.if (\imm+0x8000)>>16 + addis \ra, \ra, \imm@ha +.endif +.endm + +.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3 + vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} + vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} + vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} + vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7} + vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4} + vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8} + vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1} + vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3} + vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3} + vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3} +.endm + +.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3 + vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} + vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} + vperm \b2,\b0,\b1,v20 + vperm \b3,\b0,\b1,v21 + vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} + vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7} + vaddfp \b0,\b2,\b3 + vsubfp \b1,\b2,\b3 + vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4} + vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8} + vmrghw \b2,\b0,\b1 + vperm \b3,\b0,\b1,v22 + vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1} + vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3} + vaddfp \b0,\b2,\b3 + vsubfp \b1,\b2,\b3 + vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3} + vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3} + vperm \b2,\b0,\b1,v23 + vperm \b3,\b0,\b1,v24 +.endm + +.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1 + vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6} + vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) 
// {r5,r7,i5,i7} + vperm \a2,\a0,\a1,v20 // FFT4 ... + vperm \a3,\a0,\a1,v21 + vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4} + vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7} + vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7} + vaddfp \a0,\a2,\a3 + vsubfp \a1,\a2,\a3 + vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2) + vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9} + vmrghw \a2,\a0,\a1 + vperm \a3,\a0,\a1,v22 + vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8} + vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta} + vaddfp \a0,\a2,\a3 + vsubfp \a1,\a2,\a3 + vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta} + vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb} + vperm \a2,\a0,\a1,v23 + vperm \a3,\a0,\a1,v24 + vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb} + vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc} + vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7} + vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7} + vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3} + vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3} +.endm + +.macro BF d0,d1,s0,s1 + vsubfp \d1,\s0,\s1 + vaddfp \d0,\s0,\s1 +.endm + +.macro zip d0,d1,s0,s1 + vmrghw \d0,\s0,\s1 + vmrglw \d1,\s0,\s1 +.endm + +.macro def_fft4 interleave +fft4\interleave\()_altivec: + lvx v0, 0,r3 + lvx v1,r9,r3 + FFT4 v0,v1,v2,v3 +.ifnb \interleave + zip v0,v1,v2,v3 + stvx v0, 0,r3 + stvx v1,r9,r3 +.else + stvx v2, 0,r3 + stvx v3,r9,r3 +.endif + blr +.endm + +.macro def_fft8 interleave +fft8\interleave\()_altivec: + addi r4,r3,32 + lvx v0, 0,r3 + lvx v1,r9,r3 + lvx v2, 0,r4 + lvx v3,r9,r4 + FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 +.ifnb \interleave + zip v4,v5,v0,v1 + zip v6,v7,v2,v3 + stvx v4, 0,r3 + stvx v5,r9,r3 + stvx v6, 0,r4 + stvx v7,r9,r4 +.else + stvx v0, 0,r3 + stvx v1,r9,r3 + stvx v2, 0,r4 + stvx v3,r9,r4 +.endif + blr +.endm + +.macro def_fft16 interleave +fft16\interleave\()_altivec: + addi r5,r3,64 + addi r6,r3,96 + addi r4,r3,32 + lvx v0, 0,r5 + lvx v1,r9,r5 + lvx v2, 0,r6 + lvx v3,r9,r6 + FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7 + lvx v0, 0,r3 + lvx v1,r9,r3 + lvx v2, 0,r4 + lvx v3,r9,r4 + FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12 + vmaddfp v8,v4,v15,v14 // r2*wre + vmaddfp v9,v5,v15,v14 // i2*wre + vmaddfp v10,v6,v15,v14 // r3*wre + vmaddfp v11,v7,v15,v14 // i3*wre + vmaddfp v8,v5,v16,v8 // i2*wim + vnmsubfp v9,v4,v16,v9 // r2*wim + vnmsubfp v10,v7,v16,v10 // i3*wim + vmaddfp v11,v6,v16,v11 // r3*wim + BF v10,v12,v10,v8 + BF v11,v13,v9,v11 + BF v0,v4,v0,v10 + BF v3,v7,v3,v12 + BF v1,v5,v1,v11 + BF v2,v6,v2,v13 +.ifnb \interleave + zip v8, v9,v0,v1 + zip v10,v11,v2,v3 + zip v12,v13,v4,v5 + zip v14,v15,v6,v7 + stvx v8, 0,r3 + stvx v9,r9,r3 + stvx v10, 0,r4 + stvx v11,r9,r4 + stvx v12, 0,r5 + stvx v13,r9,r5 + stvx v14, 0,r6 + stvx v15,r9,r6 +.else + stvx v0, 0,r3 + stvx v4, 0,r5 + stvx v3,r9,r4 + stvx v7,r9,r6 + stvx v1,r9,r3 + stvx v5,r9,r5 + stvx v2, 0,r4 + stvx v6, 0,r6 +.endif + blr +.endm + +// void pass(float *z, float *wre, int n) +.macro PASS interleave, suffix +fft_pass\suffix\()_altivec: + mtctr r5 + slwi r0,r5,4 + slwi r7,r5,6 // o2 + slwi r5,r5,5 // o1 + add r10,r5,r7 // o3 + add r0,r4,r0 // wim + addi r6,r5,16 // o1+16 + addi r8,r7,16 // o2+16 + addi r11,r10,16 // o3+16 +1: + lvx v8, 0,r4 // wre + lvx v10, 0,r0 // wim + sub r0,r0,r9 + lvx v9, 0,r0 + vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. 
-3] + lvx v4,r3,r7 // r2 = z[o2] + lvx v5,r3,r8 // i2 = z[o2+16] + lvx v6,r3,r10 // r3 = z[o3] + lvx v7,r3,r11 // i3 = z[o3+16] + vmaddfp v10,v4,v8,v14 // r2*wre + vmaddfp v11,v5,v8,v14 // i2*wre + vmaddfp v12,v6,v8,v14 // r3*wre + vmaddfp v13,v7,v8,v14 // i3*wre + lvx v0, 0,r3 // r0 = z[0] + lvx v3,r3,r6 // i1 = z[o1+16] + vmaddfp v10,v5,v9,v10 // i2*wim + vnmsubfp v11,v4,v9,v11 // r2*wim + vnmsubfp v12,v7,v9,v12 // i3*wim + vmaddfp v13,v6,v9,v13 // r3*wim + lvx v1,r3,r9 // i0 = z[16] + lvx v2,r3,r5 // r1 = z[o1] + BF v12,v8,v12,v10 + BF v13,v9,v11,v13 + BF v0,v4,v0,v12 + BF v3,v7,v3,v8 +.if !\interleave + stvx v0, 0,r3 + stvx v4,r3,r7 + stvx v3,r3,r6 + stvx v7,r3,r11 +.endif + BF v1,v5,v1,v13 + BF v2,v6,v2,v9 +.if !\interleave + stvx v1,r3,r9 + stvx v2,r3,r5 + stvx v5,r3,r8 + stvx v6,r3,r10 +.else + vmrghw v8,v0,v1 + vmrglw v9,v0,v1 + stvx v8, 0,r3 + stvx v9,r3,r9 + vmrghw v8,v2,v3 + vmrglw v9,v2,v3 + stvx v8,r3,r5 + stvx v9,r3,r6 + vmrghw v8,v4,v5 + vmrglw v9,v4,v5 + stvx v8,r3,r7 + stvx v9,r3,r8 + vmrghw v8,v6,v7 + vmrglw v9,v6,v7 + stvx v8,r3,r10 + stvx v9,r3,r11 +.endif + addi r3,r3,32 + addi r4,r4,16 + bdnz 1b + sub r3,r3,r5 + blr +.endm + +#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ + +#define WORD_0 0x00,0x01,0x02,0x03 +#define WORD_1 0x04,0x05,0x06,0x07 +#define WORD_2 0x08,0x09,0x0a,0x0b +#define WORD_3 0x0c,0x0d,0x0e,0x0f +#define WORD_s0 0x10,0x11,0x12,0x13 +#define WORD_s1 0x14,0x15,0x16,0x17 +#define WORD_s2 0x18,0x19,0x1a,0x1b +#define WORD_s3 0x1c,0x1d,0x1e,0x1f + +#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d + + .rodata + .align 4 +fft_data: + .float 0, 0, 0, 0 + .float 1, 0.92387953, M_SQRT1_2, 0.38268343 + .float 0, 0.38268343, M_SQRT1_2, 0.92387953 + .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2 + .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 + vcprm(s0,3,2,1) + vcprm(0,1,s2,s1) + vcprm(2,3,s0,s3) + vcprm(2,s3,3,s2) + vcprm(0,1,s0,s1) + vcprm(2,3,s2,s3) + vcprm(2,3,0,1) + vcprm(1,2,s3,s0) + vcprm(0,3,s2,s1) + vcprm(0,2,s1,s3) + vcprm(1,3,s0,s2) + +.macro lvm b, r, regs:vararg + lvx \r, 0, \b + addi \b, \b, 16 + .ifnb \regs + lvm \b, \regs + .endif +.endm + +.macro stvm b, r, regs:vararg + stvx \r, 0, \b + addi \b, \b, 16 + .ifnb \regs + stvm \b, \regs + .endif +.endm + +.macro fft_calc interleave +extfunc ff_fft_calc\interleave\()_altivec + mflr r0 + stp r0, 2*PS(R(1)) + stpu r1, -(160+16*PS)(R(1)) + get_got r11 + addi r6, r1, 16*PS + stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + mfvrsave r0 + stw r0, 15*PS(R(1)) +#if __APPLE__ + li r6, 0xfffffffc +#else + li r6, -4 +#endif + mtvrsave r6 + + movrel r6, fft_data, r11 + lvm r6, v14, v15, v16, v17, v18, v19, v20, v21 + lvm r6, v22, v23, v24, v25, v26, v27, v28, v29 + + li r9, 16 + movrel r12, X(ff_cos_tabs), r11 + + movrel r6, fft_dispatch_tab\interleave\()_altivec, r11 + lwz r3, 0(R(3)) + subi r3, r3, 2 + slwi r3, r3, 2+ARCH_PPC64 + lpx r3, r3, r6 + mtctr r3 + mr r3, r4 + bctrl + + addi r6, r1, 16*PS + lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + lwz r6, 15*PS(R(1)) + mtvrsave r6 + lp r1, 0(R(1)) + lp r0, 2*PS(R(1)) + mtlr r0 + blr +.endm + +.macro DECL_FFT suffix, bits, n, n2, n4 +fft\n\suffix\()_altivec: + mflr r0 + stp r0,PS*(\bits-3)(R(1)) + bl fft\n2\()_altivec + addi2 r3,\n*4 + bl fft\n4\()_altivec + addi2 r3,\n*2 + bl fft\n4\()_altivec + addi2 r3,\n*-6 + lp r0,PS*(\bits-3)(R(1)) + lp r4,\bits*PS(R(12)) + mtlr r0 + li r5,\n/16 + b fft_pass\suffix\()_altivec +.endm + +.macro DECL_FFTS interleave, suffix + .text + def_fft4 \suffix + def_fft8 
\suffix + def_fft16 \suffix + PASS \interleave, \suffix + DECL_FFT \suffix, 5, 32, 16, 8 + DECL_FFT \suffix, 6, 64, 32, 16 + DECL_FFT \suffix, 7, 128, 64, 32 + DECL_FFT \suffix, 8, 256, 128, 64 + DECL_FFT \suffix, 9, 512, 256, 128 + DECL_FFT \suffix,10, 1024, 512, 256 + DECL_FFT \suffix,11, 2048, 1024, 512 + DECL_FFT \suffix,12, 4096, 2048, 1024 + DECL_FFT \suffix,13, 8192, 4096, 2048 + DECL_FFT \suffix,14,16384, 8192, 4096 + DECL_FFT \suffix,15,32768,16384, 8192 + DECL_FFT \suffix,16,65536,32768,16384 + + fft_calc \suffix + + .rodata + .align 3 +fft_dispatch_tab\suffix\()_altivec: + PTR fft4\suffix\()_altivec + PTR fft8\suffix\()_altivec + PTR fft16\suffix\()_altivec + PTR fft32\suffix\()_altivec + PTR fft64\suffix\()_altivec + PTR fft128\suffix\()_altivec + PTR fft256\suffix\()_altivec + PTR fft512\suffix\()_altivec + PTR fft1024\suffix\()_altivec + PTR fft2048\suffix\()_altivec + PTR fft4096\suffix\()_altivec + PTR fft8192\suffix\()_altivec + PTR fft16384\suffix\()_altivec + PTR fft32768\suffix\()_altivec + PTR fft65536\suffix\()_altivec +.endm + +DECL_FFTS 0 +DECL_FFTS 1, _interleave + +#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ diff -Nrup ffmpeg.orig/libavcodec/ppc/fft_init.c ffmpeg/libavcodec/ppc/fft_init.c --- ffmpeg.orig/libavcodec/ppc/fft_init.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/fft_init.c 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,167 @@ +/* + * FFT/IFFT transforms + * AltiVec-enabled + * Copyright (c) 2009 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/util_altivec.h" +#include "libavcodec/fft.h" + +/** + * Do a complex FFT with the parameters defined in ff_fft_init(). + * The input data must be permuted before with s->revtab table. + * No 1.0 / sqrt(n) normalization is done. + * AltiVec-enabled: + * This code assumes that the 'z' pointer is 16 bytes-aligned. + * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats. 
+ */ + +#if HAVE_VSX +#include "fft_vsx.h" +#else +void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); +void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z); +#endif + +#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) +static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + int j, k; + int n = 1 << s->mdct_bits; + int n4 = n >> 2; + int n8 = n >> 3; + int n32 = n >> 5; + const uint16_t *revtabj = s->revtab; + const uint16_t *revtabk = s->revtab+n4; + const vec_f *tcos = (const vec_f*)(s->tcos+n8); + const vec_f *tsin = (const vec_f*)(s->tsin+n8); + const vec_f *pin = (const vec_f*)(input+n4); + vec_f *pout = (vec_f*)(output+n4); + + /* pre rotation */ + k = n32-1; + do { + vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; +#define CMULA(p,o0,o1,o2,o3)\ + a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ + b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ + re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ + im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ + cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ + sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ + r##p = im*cos - re*sin;\ + i##p = re*cos + im*sin; +#define STORE2(v,dst)\ + j = dst;\ + vec_ste(v, 0, output+j*2);\ + vec_ste(v, 4, output+j*2); +#define STORE8(p)\ + a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ + b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ + c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ + d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ + STORE2(a, revtabk[ p*2-4]);\ + STORE2(b, revtabk[ p*2-3]);\ + STORE2(c, revtabj[-p*2+2]);\ + STORE2(d, revtabj[-p*2+3]); + + cos0 = tcos[k]; + sin0 = tsin[k]; + cos1 = tcos[-k-1]; + sin1 = tsin[-k-1]; + CMULA(0, 0,1,2,3); + CMULA(1, 2,3,0,1); + STORE8(0); + STORE8(1); + revtabj += 4; + revtabk -= 4; + k--; + } while(k >= 0); + +#if HAVE_VSX + ff_fft_calc_vsx(s, (FFTComplex*)output); +#else + ff_fft_calc_altivec(s, (FFTComplex*)output); +#endif + + /* post rotation + reordering */ + j = -n32; + k = n32-1; + do { + vec_f cos,sin,re,im,a,b,c,d; +#define CMULB(d0,d1,o)\ + re = pout[o*2];\ + im = pout[o*2+1];\ + cos = tcos[o];\ + sin = tsin[o];\ + d0 = im*sin - re*cos;\ + d1 = re*sin + im*cos; + + CMULB(a,b,j); + CMULB(c,d,k); + pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); + pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); + pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); + pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); + j++; + k--; + } while(k >= 0); +} + +static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + int k; + int n = 1 << s->mdct_bits; + int n4 = n >> 2; + int n16 = n >> 4; + vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31}; + vec_u32 *p0 = (vec_u32*)(output+n4); + vec_u32 *p1 = (vec_u32*)(output+n4*3); + + imdct_half_altivec(s, output + n4, input); + + for (k = 0; k < n16; k++) { + vec_u32 a = p0[k] ^ sign; + vec_u32 b = p1[-k-1]; + p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); + p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); + } +} +#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) */ + +av_cold void ff_fft_init_ppc(FFTContext *s) +{ +#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + +#if HAVE_VSX + s->fft_calc = ff_fft_calc_interleave_vsx; +#else + s->fft_calc = ff_fft_calc_interleave_altivec; +#endif 
+ if (s->mdct_bits >= 5) { + s->imdct_calc = imdct_calc_altivec; + s->imdct_half = imdct_half_altivec; + } +#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ +} diff -Nrup ffmpeg.orig/libavcodec/ppc/fft_vsx.c ffmpeg/libavcodec/ppc/fft_vsx.c --- ffmpeg.orig/libavcodec/ppc/fft_vsx.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/fft_vsx.c 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,226 @@ +/* + * FFT transform, optimized with VSX built-in functions + * Copyright (c) 2014 Rong Yan + * + * This algorithm (though not any of the implementation details) is + * based on libdjbfft by D. J. Bernstein. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +#include "config.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/util_altivec.h" +#include "libavcodec/fft.h" +#include "libavcodec/fft-internal.h" +#include "fft_vsx.h" + +#if HAVE_VSX + +static void fft32_vsx_interleave(FFTComplex *z) +{ + fft16_vsx_interleave(z); + fft8_vsx_interleave(z+16); + fft8_vsx_interleave(z+24); + pass_vsx_interleave(z,ff_cos_32,4); +} + +static void fft64_vsx_interleave(FFTComplex *z) +{ + fft32_vsx_interleave(z); + fft16_vsx_interleave(z+32); + fft16_vsx_interleave(z+48); + pass_vsx_interleave(z,ff_cos_64, 8); +} +static void fft128_vsx_interleave(FFTComplex *z) +{ + fft64_vsx_interleave(z); + fft32_vsx_interleave(z+64); + fft32_vsx_interleave(z+96); + pass_vsx_interleave(z,ff_cos_128,16); +} +static void fft256_vsx_interleave(FFTComplex *z) +{ + fft128_vsx_interleave(z); + fft64_vsx_interleave(z+128); + fft64_vsx_interleave(z+192); + pass_vsx_interleave(z,ff_cos_256,32); +} +static void fft512_vsx_interleave(FFTComplex *z) +{ + fft256_vsx_interleave(z); + fft128_vsx_interleave(z+256); + fft128_vsx_interleave(z+384); + pass_vsx_interleave(z,ff_cos_512,64); +} +static void fft1024_vsx_interleave(FFTComplex *z) +{ + fft512_vsx_interleave(z); + fft256_vsx_interleave(z+512); + fft256_vsx_interleave(z+768); + pass_vsx_interleave(z,ff_cos_1024,128); + +} +static void fft2048_vsx_interleave(FFTComplex *z) +{ + fft1024_vsx_interleave(z); + fft512_vsx_interleave(z+1024); + fft512_vsx_interleave(z+1536); + pass_vsx_interleave(z,ff_cos_2048,256); +} +static void fft4096_vsx_interleave(FFTComplex *z) +{ + fft2048_vsx_interleave(z); + fft1024_vsx_interleave(z+2048); + fft1024_vsx_interleave(z+3072); + pass_vsx_interleave(z,ff_cos_4096, 512); +} +static void fft8192_vsx_interleave(FFTComplex *z) +{ + fft4096_vsx_interleave(z); + fft2048_vsx_interleave(z+4096); + fft2048_vsx_interleave(z+6144); + pass_vsx_interleave(z,ff_cos_8192,1024); +} +static void fft16384_vsx_interleave(FFTComplex *z) +{ + fft8192_vsx_interleave(z); + fft4096_vsx_interleave(z+8192); + fft4096_vsx_interleave(z+12288); + pass_vsx_interleave(z,ff_cos_16384,2048); +} +static void fft32768_vsx_interleave(FFTComplex 
*z) +{ + fft16384_vsx_interleave(z); + fft8192_vsx_interleave(z+16384); + fft8192_vsx_interleave(z+24576); + pass_vsx_interleave(z,ff_cos_32768,4096); +} +static void fft65536_vsx_interleave(FFTComplex *z) +{ + fft32768_vsx_interleave(z); + fft16384_vsx_interleave(z+32768); + fft16384_vsx_interleave(z+49152); + pass_vsx_interleave(z,ff_cos_65536,8192); +} + +static void fft32_vsx(FFTComplex *z) +{ + fft16_vsx(z); + fft8_vsx(z+16); + fft8_vsx(z+24); + pass_vsx(z,ff_cos_32,4); +} + +static void fft64_vsx(FFTComplex *z) +{ + fft32_vsx(z); + fft16_vsx(z+32); + fft16_vsx(z+48); + pass_vsx(z,ff_cos_64, 8); +} +static void fft128_vsx(FFTComplex *z) +{ + fft64_vsx(z); + fft32_vsx(z+64); + fft32_vsx(z+96); + pass_vsx(z,ff_cos_128,16); +} +static void fft256_vsx(FFTComplex *z) +{ + fft128_vsx(z); + fft64_vsx(z+128); + fft64_vsx(z+192); + pass_vsx(z,ff_cos_256,32); +} +static void fft512_vsx(FFTComplex *z) +{ + fft256_vsx(z); + fft128_vsx(z+256); + fft128_vsx(z+384); + pass_vsx(z,ff_cos_512,64); +} +static void fft1024_vsx(FFTComplex *z) +{ + fft512_vsx(z); + fft256_vsx(z+512); + fft256_vsx(z+768); + pass_vsx(z,ff_cos_1024,128); + +} +static void fft2048_vsx(FFTComplex *z) +{ + fft1024_vsx(z); + fft512_vsx(z+1024); + fft512_vsx(z+1536); + pass_vsx(z,ff_cos_2048,256); +} +static void fft4096_vsx(FFTComplex *z) +{ + fft2048_vsx(z); + fft1024_vsx(z+2048); + fft1024_vsx(z+3072); + pass_vsx(z,ff_cos_4096, 512); +} +static void fft8192_vsx(FFTComplex *z) +{ + fft4096_vsx(z); + fft2048_vsx(z+4096); + fft2048_vsx(z+6144); + pass_vsx(z,ff_cos_8192,1024); +} +static void fft16384_vsx(FFTComplex *z) +{ + fft8192_vsx(z); + fft4096_vsx(z+8192); + fft4096_vsx(z+12288); + pass_vsx(z,ff_cos_16384,2048); +} +static void fft32768_vsx(FFTComplex *z) +{ + fft16384_vsx(z); + fft8192_vsx(z+16384); + fft8192_vsx(z+24576); + pass_vsx(z,ff_cos_32768,4096); +} +static void fft65536_vsx(FFTComplex *z) +{ + fft32768_vsx(z); + fft16384_vsx(z+32768); + fft16384_vsx(z+49152); + pass_vsx(z,ff_cos_65536,8192); +} + +static void (* const fft_dispatch_vsx[])(FFTComplex*) = { + fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx, + fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx, +}; +static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = { + fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave, + fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave, + fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave, +}; +void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z) +{ + fft_dispatch_vsx_interleave[s->nbits-2](z); +} +void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z) +{ + fft_dispatch_vsx[s->nbits-2](z); +} +#endif /* HAVE_VSX */ diff -Nrup ffmpeg.orig/libavcodec/ppc/fft_vsx.h ffmpeg/libavcodec/ppc/fft_vsx.h --- ffmpeg.orig/libavcodec/ppc/fft_vsx.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/fft_vsx.h 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,829 @@ +#ifndef AVCODEC_PPC_FFT_VSX_H +#define AVCODEC_PPC_FFT_VSX_H +/* + * FFT transform, optimized with VSX built-in functions + * Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt + * + * This algorithm (though not any of the implementation details) is + * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S. + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +#include "config.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/util_altivec.h" +#include "libavcodec/fft.h" +#include "libavcodec/fft-internal.h" + +#if HAVE_VSX + +void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z); +void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z); + + +#define byte_2complex (2*sizeof(FFTComplex)) +#define byte_4complex (4*sizeof(FFTComplex)) +#define byte_6complex (6*sizeof(FFTComplex)) +#define byte_8complex (8*sizeof(FFTComplex)) +#define byte_10complex (10*sizeof(FFTComplex)) +#define byte_12complex (12*sizeof(FFTComplex)) +#define byte_14complex (14*sizeof(FFTComplex)) + +inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n) +{ + int o1 = n<<1; + int o2 = n<<2; + int o3 = o1+o2; + int i1, i2, i3; + FFTSample* out = (FFTSample*)z; + const FFTSample *wim = wre+o1; + vec_f vz0, vzo1, vzo2, vzo3; + vec_f x0, x1, x2, x3; + vec_f x4, x5, x6, x7; + vec_f x8, x9, x10, x11; + vec_f x12, x13, x14, x15; + vec_f x16, x17, x18, x19; + vec_f x20, x21, x22, x23; + vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1; + vec_f y0, y1, y2, y3; + vec_f y4, y5, y8, y9; + vec_f y10, y13, y14, y15; + vec_f y16, y17, y18, y19; + vec_f y20, y21, y22, y23; + vec_f wr1, wi1, wr0, wi0; + vec_f wr2, wi2, wr3, wi3; + vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3; + + n = n-2; + i1 = o1*sizeof(FFTComplex); + i2 = o2*sizeof(FFTComplex); + i3 = o3*sizeof(FFTComplex); + vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i + vzo2plus1 = vec_ld(i2+16, &(out[0])); + vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i + vzo3plus1 = vec_ld(i3+16, &(out[0])); + vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i + vz0plus1 = vec_ld(16, &(out[0])); + vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i + vzo1plus1 = vec_ld(i1+16, &(out[0])); + + x0 = vec_add(vzo2, vzo3); + x1 = vec_sub(vzo2, vzo3); + y0 = vec_add(vzo2plus1, vzo3plus1); + y1 = vec_sub(vzo2plus1, vzo3plus1); + + wr1 = vec_splats(wre[1]); + wi1 = vec_splats(wim[-1]); + wi2 = vec_splats(wim[-2]); + wi3 = vec_splats(wim[-3]); + wr2 = vec_splats(wre[2]); + wr3 = vec_splats(wre[3]); + + x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3)); + x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2)); + + y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0)); + y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2)); + y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1)); + y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3)); + + ymulwi2 = vec_mul(y4, wi2); + ymulwi3 = vec_mul(y5, wi3); + x4 = vec_mul(x2, wr1); + x5 = vec_mul(x3, wi1); + y8 = vec_madd(y2, wr2, ymulwi2); + y9 = vec_msub(y2, wr2, ymulwi2); + x6 = vec_add(x4, x5); + x7 = vec_sub(x4, x5); + y13 = vec_madd(y3, wr3, ymulwi3); + y14 = vec_msub(y3, wr3, ymulwi3); + + x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3)); + y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3)); + 
y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3)); + + x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2)); + x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1)); + + y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2)); + y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1)); + + x11 = vec_add(vz0, x9); + x12 = vec_sub(vz0, x9); + x13 = vec_add(vzo1, x10); + x14 = vec_sub(vzo1, x10); + + y18 = vec_add(vz0plus1, y16); + y19 = vec_sub(vz0plus1, y16); + y20 = vec_add(vzo1plus1, y17); + y21 = vec_sub(vzo1plus1, y17); + + x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3)); + x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3)); + y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3)); + y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3)); + + + vec_st(x11, 0, &(out[0])); + vec_st(y18, 16, &(out[0])); + vec_st(x15, i1, &(out[0])); + vec_st(y22, i1+16, &(out[0])); + vec_st(x12, i2, &(out[0])); + vec_st(y19, i2+16, &(out[0])); + vec_st(x16, i3, &(out[0])); + vec_st(y23, i3+16, &(out[0])); + + do { + out += 8; + wre += 4; + wim -= 4; + wr0 = vec_splats(wre[0]); + wr1 = vec_splats(wre[1]); + wi0 = vec_splats(wim[0]); + wi1 = vec_splats(wim[-1]); + + wr2 = vec_splats(wre[2]); + wr3 = vec_splats(wre[3]); + wi2 = vec_splats(wim[-2]); + wi3 = vec_splats(wim[-3]); + + vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i + vzo2plus1 = vec_ld(i2+16, &(out[0])); + vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i + vzo3plus1 = vec_ld(i3+16, &(out[0])); + vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i + vz0plus1 = vec_ld(16, &(out[0])); + vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i + vzo1plus1 = vec_ld(i1+16, &(out[0])); + + x0 = vec_add(vzo2, vzo3); + x1 = vec_sub(vzo2, vzo3); + + y0 = vec_add(vzo2plus1, vzo3plus1); + y1 = vec_sub(vzo2plus1, vzo3plus1); + + x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0)); + x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2)); + x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1)); + x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3)); + + y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1)); + y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3)); + xmulwi0 = vec_mul(x4, wi0); + xmulwi1 = vec_mul(x5, wi1); + + y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0)); + y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2)); + + x8 = vec_madd(x2, wr0, xmulwi0); + x9 = vec_msub(x2, wr0, xmulwi0); + ymulwi2 = vec_mul(y4, wi2); + ymulwi3 = vec_mul(y5, wi3); + + x13 = vec_madd(x3, wr1, xmulwi1); + x14 = vec_msub(x3, wr1, xmulwi1); + + y8 = vec_madd(y2, wr2, ymulwi2); + y9 = vec_msub(y2, wr2, ymulwi2); + y13 = vec_madd(y3, wr3, ymulwi3); + y14 = vec_msub(y3, wr3, ymulwi3); + + x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3)); + x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3)); + + y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3)); + y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3)); + + x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2)); + x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1)); + + y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2)); + y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1)); + + x18 = vec_add(vz0, x16); + x19 = vec_sub(vz0, x16); + x20 = vec_add(vzo1, x17); + x21 = vec_sub(vzo1, x17); + + y18 = vec_add(vz0plus1, y16); + y19 = vec_sub(vz0plus1, y16); + y20 = vec_add(vzo1plus1, y17); + y21 = vec_sub(vzo1plus1, y17); + + x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3)); + x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3)); + + y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3)); + y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3)); + + vec_st(x18, 0, &(out[0])); + vec_st(y18, 16, &(out[0])); + vec_st(x22, i1, &(out[0])); + vec_st(y22, i1+16, &(out[0])); + vec_st(x19, i2, &(out[0])); + vec_st(y19, i2+16, &(out[0])); + vec_st(x23, i3, &(out[0])); + vec_st(y23, i3+16, &(out[0])); + } while 
(n-=2); +} + +inline static void fft2_vsx_interleave(FFTComplex *z) +{ + FFTSample r1, i1; + + r1 = z[0].re - z[1].re; + z[0].re += z[1].re; + z[1].re = r1; + + i1 = z[0].im - z[1].im; + z[0].im += z[1].im; + z[1].im = i1; + } + +inline static void fft4_vsx_interleave(FFTComplex *z) +{ + vec_f a, b, c, d; + float* out= (float*)z; + a = vec_ld(0, &(out[0])); + b = vec_ld(byte_2complex, &(out[0])); + + c = vec_perm(a, b, vcprm(0,1,s2,s1)); + d = vec_perm(a, b, vcprm(2,3,s0,s3)); + a = vec_add(c, d); + b = vec_sub(c, d); + + c = vec_perm(a, b, vcprm(0,1,s0,s1)); + d = vec_perm(a, b, vcprm(2,3,s3,s2)); + + a = vec_add(c, d); + b = vec_sub(c, d); + vec_st(a, 0, &(out[0])); + vec_st(b, byte_2complex, &(out[0])); +} + +inline static void fft8_vsx_interleave(FFTComplex *z) +{ + vec_f vz0, vz1, vz2, vz3; + vec_f x0, x1, x2, x3; + vec_f x4, x5, x6, x7; + vec_f x8, x9, x10, x11; + vec_f x12, x13, x14, x15; + vec_f x16, x17, x18, x19; + vec_f x20, x21, x22, x23; + vec_f x24, x25, x26, x27; + vec_f x28, x29, x30, x31; + vec_f x32, x33, x34; + + float* out= (float*)z; + vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; + + vz0 = vec_ld(0, &(out[0])); + vz1 = vec_ld(byte_2complex, &(out[0])); + vz2 = vec_ld(byte_4complex, &(out[0])); + vz3 = vec_ld(byte_6complex, &(out[0])); + + x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); + x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); + x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1)); + x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3)); + + x4 = vec_add(x0, x1); + x5 = vec_sub(x0, x1); + x6 = vec_add(x2, x3); + x7 = vec_sub(x2, x3); + + x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1)); + x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2)); + x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1)); + x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3)); + + x12 = vec_add(x8, x9); + x13 = vec_sub(x8, x9); + x14 = vec_add(x10, x11); + x15 = vec_sub(x10, x11); + x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1)); + x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1)); + x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1)); + x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i + x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i + + x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3)); + x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3)); + x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2)); + x24 = vec_add(x22, x23); + x25 = vec_sub(x22, x23); + x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1); + + x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i + x28 = vec_sub(x21, x26); //z5.r z3.r z5.i z7.i + + x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i + x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i + x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i + x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i + x33 = vec_perm(x30, x32, vcprm(0,1,s2,3)); // z2.r z2.i z3.r z3.i + x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i + + vec_st(x29, 0, &(out[0])); + vec_st(x33, byte_2complex, &(out[0])); + vec_st(x31, byte_4complex, &(out[0])); + vec_st(x34, byte_6complex, &(out[0])); +} + +inline static void fft16_vsx_interleave(FFTComplex *z) +{ + float* out= (float*)z; + vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; + vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]}; + vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]}; + vec_f vz0, vz1, vz2, vz3; + vec_f vz4, vz5, vz6, vz7; + vec_f x0, x1, x2, x3; + vec_f x4, x5, x6, x7; + vec_f x8, x9, x10, x11; + vec_f x12, x13, x14, x15; + vec_f x16, x17, x18, x19; + vec_f x20, x21, x22, x23; + vec_f x24, x25, x26, x27; + vec_f 
x28, x29, x30, x31; + vec_f x32, x33, x34, x35; + vec_f x36, x37, x38, x39; + vec_f x40, x41, x42, x43; + vec_f x44, x45, x46, x47; + vec_f x48, x49, x50, x51; + vec_f x52, x53, x54, x55; + vec_f x56, x57, x58, x59; + vec_f x60, x61, x62, x63; + vec_f x64, x65, x66, x67; + vec_f x68, x69, x70, x71; + vec_f x72, x73, x74, x75; + vec_f x76, x77, x78, x79; + vec_f x80, x81, x82, x83; + vec_f x84, x85, x86; + + vz0 = vec_ld(0, &(out[0])); + vz1 = vec_ld(byte_2complex, &(out[0])); + vz2 = vec_ld(byte_4complex, &(out[0])); + vz3 = vec_ld(byte_6complex, &(out[0])); + vz4 = vec_ld(byte_8complex, &(out[0])); + vz5 = vec_ld(byte_10complex, &(out[0])); + vz6 = vec_ld(byte_12complex, &(out[0])); + vz7 = vec_ld(byte_14complex, &(out[0])); + + x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); + x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); + x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1)); + x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3)); + + x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1)); + x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3)); + x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1)); + x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3)); + + x8 = vec_add(x0, x1); + x9 = vec_sub(x0, x1); + x10 = vec_add(x2, x3); + x11 = vec_sub(x2, x3); + + x12 = vec_add(x4, x5); + x13 = vec_sub(x4, x5); + x14 = vec_add(x6, x7); + x15 = vec_sub(x6, x7); + + x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1)); + x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2)); + x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2)); + x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3)); + x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1)); + x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3)); + x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1)); + x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2)); + + x24 = vec_add(x16, x17); + x25 = vec_sub(x16, x17); + x26 = vec_add(x18, x19); + x27 = vec_sub(x18, x19); + x28 = vec_add(x20, x21); + x29 = vec_sub(x20, x21); + x30 = vec_add(x22, x23); + x31 = vec_sub(x22, x23); + + x32 = vec_add(x24, x26); + x33 = vec_sub(x24, x26); + x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1)); + + x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2)); + x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3)); + x37 = vec_add(x35, x36); + x38 = vec_sub(x35, x36); + x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0)); + + x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3)); + x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2)); + x42 = vec_add(x40, x41); + x43 = vec_sub(x40, x41); + x44 = vec_mul(x42, vc0); + x45 = vec_mul(x43, vc0); + + x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i + x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i + + x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2)); + x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0)); + x50 = vec_add(x48, x49); + x51 = vec_sub(x48, x49); + x52 = vec_mul(x50, vc1); + x53 = vec_mul(x50, vc2); + x54 = vec_mul(x51, vc1); + x55 = vec_mul(x51, vc2); + + x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3)); + x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0)); + x58 = vec_add(x56, x57); + x59 = vec_sub(x56, x57); + + x60 = vec_perm(x54, x55, vcprm(1,0,3,2)); + x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2)); + x62 = vec_add(x52, x61); + x63 = vec_sub(x52, x61); + x64 = vec_add(x60, x53); + x65 = vec_sub(x60, x53); + x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2)); + x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2)); + + x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i + x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i + x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i + x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i + + x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3)); + x73 = vec_add(x25, x72); + x74 = vec_sub(x25, x72); + x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1)); + x76 = 
vec_perm(x44, x45, vcprm(3,2,s2,s3)); + x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i + x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i + + x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i + x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i + x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i + x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i + vec_st(x79, 0, &(out[0])); + vec_st(x80, byte_2complex, &(out[0])); + vec_st(x81, byte_4complex, &(out[0])); + vec_st(x82, byte_6complex, &(out[0])); + x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i + x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i + x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i + x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i + vec_st(x83, byte_8complex, &(out[0])); + vec_st(x84, byte_10complex, &(out[0])); + vec_st(x85, byte_12complex, &(out[0])); + vec_st(x86, byte_14complex, &(out[0])); +} + +inline static void fft4_vsx(FFTComplex *z) +{ + vec_f a, b, c, d; + float* out= (float*)z; + a = vec_ld(0, &(out[0])); + b = vec_ld(byte_2complex, &(out[0])); + + c = vec_perm(a, b, vcprm(0,1,s2,s1)); + d = vec_perm(a, b, vcprm(2,3,s0,s3)); + a = vec_add(c, d); + b = vec_sub(c, d); + + c = vec_perm(a,b, vcprm(0,s0,1,s1)); + d = vec_perm(a, b, vcprm(2,s3,3,s2)); + + a = vec_add(c, d); + b = vec_sub(c, d); + + c = vec_perm(a, b, vcprm(0,1,s0,s1)); + d = vec_perm(a, b, vcprm(2,3,s2,s3)); + + vec_st(c, 0, &(out[0])); + vec_st(d, byte_2complex, &(out[0])); + return; +} + +inline static void fft8_vsx(FFTComplex *z) +{ + vec_f vz0, vz1, vz2, vz3; + vec_f vz4, vz5, vz6, vz7, vz8; + + float* out= (float*)z; + vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; + vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf}; + vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; + + vz0 = vec_ld(0, &(out[0])); + vz1 = vec_ld(byte_2complex, &(out[0])); + vz2 = vec_ld(byte_4complex, &(out[0])); + vz3 = vec_ld(byte_6complex, &(out[0])); + + vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); + vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3)); + vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); + vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); + + vz2 = vec_add(vz6, vz7); + vz3 = vec_sub(vz6, vz7); + vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); + + vz0 = vec_add(vz4, vz5); + vz1 = vec_sub(vz4, vz5); + + vz3 = vec_madd(vz3, vc1, vc0); + vz3 = vec_madd(vz8, vc2, vz3); + + vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); + vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); + vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0)); + vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1)); + + vz0 = vec_add(vz4, vz5); + vz1 = vec_sub(vz4, vz5); + vz2 = vec_add(vz6, vz7); + vz3 = vec_sub(vz6, vz7); + + vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); + vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); + vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3)); + vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2)); + + + vz2 = vec_sub(vz4, vz6); + vz3 = vec_sub(vz5, vz7); + + vz0 = vec_add(vz4, vz6); + vz1 = vec_add(vz5, vz7); + + vec_st(vz0, 0, &(out[0])); + vec_st(vz1, byte_2complex, &(out[0])); + vec_st(vz2, byte_4complex, &(out[0])); + vec_st(vz3, byte_6complex, &(out[0])); + return; +} + +inline static void fft16_vsx(FFTComplex *z) +{ + float* out= (float*)z; + vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; + vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf}; + vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; + vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343}; + vec_f vc4 = {0.0, 0.38268343, sqrthalf, 
0.92387953}; + vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953}; + + vec_f vz0, vz1, vz2, vz3; + vec_f vz4, vz5, vz6, vz7; + vec_f vz8, vz9, vz10, vz11; + vec_f vz12, vz13; + + vz0 = vec_ld(byte_8complex, &(out[0])); + vz1 = vec_ld(byte_10complex, &(out[0])); + vz2 = vec_ld(byte_12complex, &(out[0])); + vz3 = vec_ld(byte_14complex, &(out[0])); + + vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); + vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); + vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1)); + vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3)); + + vz0 = vec_add(vz4, vz5); + vz1= vec_sub(vz4, vz5); + vz2 = vec_add(vz6, vz7); + vz3 = vec_sub(vz6, vz7); + + vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); + vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); + vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); + vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2)); + + vz0 = vec_add(vz4, vz5); + vz1 = vec_sub(vz4, vz5); + vz2 = vec_add(vz6, vz7); + vz3 = vec_sub(vz6, vz7); + + vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); + vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); + + vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1)); + vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3)); + + vz0 = vec_ld(0, &(out[0])); + vz1 = vec_ld(byte_2complex, &(out[0])); + vz2 = vec_ld(byte_4complex, &(out[0])); + vz3 = vec_ld(byte_6complex, &(out[0])); + vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); + vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3)); + vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); + vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); + + vz2 = vec_add(vz10, vz11); + vz3 = vec_sub(vz10, vz11); + vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); + vz0 = vec_add(vz8, vz9); + vz1 = vec_sub(vz8, vz9); + + vz3 = vec_madd(vz3, vc1, vc0); + vz3 = vec_madd(vz12, vc2, vz3); + vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); + vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); + vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0)); + vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1)); + + vz0 = vec_add(vz8, vz9); + vz1 = vec_sub(vz8, vz9); + vz2 = vec_add(vz10, vz11); + vz3 = vec_sub(vz10, vz11); + + vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); + vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); + vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3)); + vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2)); + + vz2 = vec_sub(vz8, vz10); + vz3 = vec_sub(vz9, vz11); + vz0 = vec_add(vz8, vz10); + vz1 = vec_add(vz9, vz11); + + vz8 = vec_madd(vz4, vc3, vc0); + vz9 = vec_madd(vz5, vc3, vc0); + vz10 = vec_madd(vz6, vc3, vc0); + vz11 = vec_madd(vz7, vc3, vc0); + + vz8 = vec_madd(vz5, vc4, vz8); + vz9 = vec_madd(vz4, vc5, vz9); + vz10 = vec_madd(vz7, vc5, vz10); + vz11 = vec_madd(vz6, vc4, vz11); + + vz12 = vec_sub(vz10, vz8); + vz10 = vec_add(vz10, vz8); + + vz13 = vec_sub(vz9, vz11); + vz11 = vec_add(vz9, vz11); + + vz4 = vec_sub(vz0, vz10); + vz0 = vec_add(vz0, vz10); + + vz7= vec_sub(vz3, vz12); + vz3= vec_add(vz3, vz12); + + vz5 = vec_sub(vz1, vz11); + vz1 = vec_add(vz1, vz11); + + vz6 = vec_sub(vz2, vz13); + vz2 = vec_add(vz2, vz13); + + vec_st(vz0, 0, &(out[0])); + vec_st(vz1, byte_2complex, &(out[0])); + vec_st(vz2, byte_4complex, &(out[0])); + vec_st(vz3, byte_6complex, &(out[0])); + vec_st(vz4, byte_8complex, &(out[0])); + vec_st(vz5, byte_10complex, &(out[0])); + vec_st(vz6, byte_12complex, &(out[0])); + vec_st(vz7, byte_14complex, &(out[0])); + return; + +} +inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n) +{ + int o1 = n<<1; + int o2 = n<<2; + int o3 = o1+o2; + int i1, i2, i3; + FFTSample* out = (FFTSample*)z; + const FFTSample *wim = wre+o1; + vec_f v0, v1, v2, v3; + vec_f v4, v5, v6, v7; + vec_f v8, v9, 
v10, v11; + vec_f v12, v13; + + n = n-2; + i1 = o1*sizeof(FFTComplex); + i2 = o2*sizeof(FFTComplex); + i3 = o3*sizeof(FFTComplex); + + v8 = vec_ld(0, &(wre[0])); + v10 = vec_ld(0, &(wim[0])); + v9 = vec_ld(0, &(wim[-4])); + v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); + + v4 = vec_ld(i2, &(out[0])); + v5 = vec_ld(i2+16, &(out[0])); + v6 = vec_ld(i3, &(out[0])); + v7 = vec_ld(i3+16, &(out[0])); + v10 = vec_mul(v4, v8); // r2*wre + v11 = vec_mul(v5, v8); // i2*wre + v12 = vec_mul(v6, v8); // r3*wre + v13 = vec_mul(v7, v8); // i3*wre + + v0 = vec_ld(0, &(out[0])); // r0 + v3 = vec_ld(i1+16, &(out[0])); // i1 + v10 = vec_madd(v5, v9, v10); // r2*wim + v11 = vec_nmsub(v4, v9, v11); // i2*wim + v12 = vec_nmsub(v7, v9, v12); // r3*wim + v13 = vec_madd(v6, v9, v13); // i3*wim + + v1 = vec_ld(16, &(out[0])); // i0 + v2 = vec_ld(i1, &(out[0])); // r1 + v8 = vec_sub(v12, v10); + v12 = vec_add(v12, v10); + v9 = vec_sub(v11, v13); + v13 = vec_add(v11, v13); + v4 = vec_sub(v0, v12); + v0 = vec_add(v0, v12); + v7 = vec_sub(v3, v8); + v3 = vec_add(v3, v8); + + vec_st(v0, 0, &(out[0])); // r0 + vec_st(v3, i1+16, &(out[0])); // i1 + vec_st(v4, i2, &(out[0])); // r2 + vec_st(v7, i3+16, &(out[0]));// i3 + + v5 = vec_sub(v1, v13); + v1 = vec_add(v1, v13); + v6 = vec_sub(v2, v9); + v2 = vec_add(v2, v9); + + vec_st(v1, 16, &(out[0])); // i0 + vec_st(v2, i1, &(out[0])); // r1 + vec_st(v5, i2+16, &(out[0])); // i2 + vec_st(v6, i3, &(out[0])); // r3 + + do { + out += 8; + wre += 4; + wim -= 4; + + v8 = vec_ld(0, &(wre[0])); + v10 = vec_ld(0, &(wim[0])); + v9 = vec_ld(0, &(wim[-4])); + v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); + + v4 = vec_ld(i2, &(out[0])); // r2 + v5 = vec_ld(i2+16, &(out[0])); // i2 + v6 = vec_ld(i3, &(out[0])); // r3 + v7 = vec_ld(i3+16, &(out[0]));// i3 + v10 = vec_mul(v4, v8); // r2*wre + v11 = vec_mul(v5, v8); // i2*wre + v12 = vec_mul(v6, v8); // r3*wre + v13 = vec_mul(v7, v8); // i3*wre + + v0 = vec_ld(0, &(out[0])); // r0 + v3 = vec_ld(i1+16, &(out[0])); // i1 + v10 = vec_madd(v5, v9, v10); // r2*wim + v11 = vec_nmsub(v4, v9, v11); // i2*wim + v12 = vec_nmsub(v7, v9, v12); // r3*wim + v13 = vec_madd(v6, v9, v13); // i3*wim + + v1 = vec_ld(16, &(out[0])); // i0 + v2 = vec_ld(i1, &(out[0])); // r1 + v8 = vec_sub(v12, v10); + v12 = vec_add(v12, v10); + v9 = vec_sub(v11, v13); + v13 = vec_add(v11, v13); + v4 = vec_sub(v0, v12); + v0 = vec_add(v0, v12); + v7 = vec_sub(v3, v8); + v3 = vec_add(v3, v8); + + vec_st(v0, 0, &(out[0])); // r0 + vec_st(v3, i1+16, &(out[0])); // i1 + vec_st(v4, i2, &(out[0])); // r2 + vec_st(v7, i3+16, &(out[0])); // i3 + + v5 = vec_sub(v1, v13); + v1 = vec_add(v1, v13); + v6 = vec_sub(v2, v9); + v2 = vec_add(v2, v9); + + vec_st(v1, 16, &(out[0])); // i0 + vec_st(v2, i1, &(out[0])); // r1 + vec_st(v5, i2+16, &(out[0])); // i2 + vec_st(v6, i3, &(out[0])); // r3 + } while (n-=2); +} + +#endif + +#endif /* AVCODEC_PPC_FFT_VSX_H */ diff -Nrup ffmpeg.orig/libavcodec/ppc/hpeldsp_altivec.c ffmpeg/libavcodec/ppc/hpeldsp_altivec.c --- ffmpeg.orig/libavcodec/ppc/hpeldsp_altivec.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/hpeldsp_altivec.c 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2002 Brian Foley + * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/util_altivec.h" + +#include "libavcodec/hpeldsp.h" + +#include "hpeldsp_altivec.h" + +#if HAVE_ALTIVEC +/* next one assumes that ((line_size % 16) == 0) */ +void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + register vector unsigned char pixelsv1; + register vector unsigned char pixelsv1B; + register vector unsigned char pixelsv1C; + register vector unsigned char pixelsv1D; + + int i; + register ptrdiff_t line_size_2 = line_size << 1; + register ptrdiff_t line_size_3 = line_size + line_size_2; + register ptrdiff_t line_size_4 = line_size << 2; + +// hand-unrolling the loop by 4 gains about 15% +// mininum execution time goes from 74 to 60 cycles +// it's faster than -funroll-loops, but using +// -funroll-loops w/ this is bad - 74 cycles again. +// all this is on a 7450, tuning for the 7450 + for (i = 0; i < h; i += 4) { + pixelsv1 = unaligned_load( 0, pixels); + pixelsv1B = unaligned_load(line_size, pixels); + pixelsv1C = unaligned_load(line_size_2, pixels); + pixelsv1D = unaligned_load(line_size_3, pixels); + VEC_ST(pixelsv1, 0, (unsigned char*)block); + VEC_ST(pixelsv1B, line_size, (unsigned char*)block); + VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block); + VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block); + pixels+=line_size_4; + block +=line_size_4; + } +} + +/* next one assumes that ((line_size % 16) == 0) */ +#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) +void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + register vector unsigned char pixelsv, blockv; + + int i; + for (i = 0; i < h; i++) { + blockv = vec_ld(0, block); + pixelsv = VEC_LD( 0, pixels); + blockv = vec_avg(blockv,pixelsv); + vec_st(blockv, 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } +} + +/* next one assumes that ((line_size % 8) == 0) */ +static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) +{ + register vector unsigned char pixelsv, blockv; + int i; + + for (i = 0; i < h; i++) { + /* block is 8 bytes-aligned, so we're either in the + left block (16 bytes-aligned) or in the right block (not) */ + int rightside = ((unsigned long)block & 0x0000000F); + + blockv = vec_ld(0, block); + pixelsv = VEC_LD( 0, pixels); + + if (rightside) { + pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); + } else { + pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); + } + + blockv = vec_avg(blockv, pixelsv); + + vec_st(blockv, 0, block); + + pixels += line_size; + block += line_size; + } +} + +/* next one assumes that ((line_size % 8) == 0) */ +static void 
put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; + register vector unsigned char blockv; + register vector unsigned short pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + pixelsv1 = VEC_LD(0, pixels); + pixelsv2 = VEC_LD(1, pixels); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + pixelsv1 = unaligned_load(line_size, pixels); + pixelsv2 = unaligned_load(line_size+1, pixels); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } else { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } +} + +/* next one assumes that ((line_size % 8) == 0) */ +static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; + register vector unsigned char blockv; + register vector unsigned short pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + pixelsv1 = VEC_LD(0, pixels); + pixelsv2 = VEC_LD(1, pixels); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + pixelsv1 = unaligned_load(line_size, pixels); + pixelsv2 = unaligned_load(line_size+1, pixels); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vcone); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } else { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } +} + +/* next one assumes that ((line_size % 16) == 0) */ +static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) +{ + register int i; + register vector unsigned char pixelsv1, pixelsv2, 
pixelsv3, pixelsv4; + register vector unsigned char blockv; + register vector unsigned short temp3, temp4, + pixelssum1, pixelssum2, pixelssum3, pixelssum4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + pixelsv1 = VEC_LD(0, pixels); + pixelsv2 = VEC_LD(1, pixels); + pixelsv3 = VEC_MERGEL(vczero, pixelsv1); + pixelsv4 = VEC_MERGEL(vczero, pixelsv2); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vctwo); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + pixelsv1 = unaligned_load(line_size, pixels); + pixelsv2 = unaligned_load(line_size+1, pixels); + + pixelsv3 = VEC_MERGEL(vczero, pixelsv1); + pixelsv4 = VEC_MERGEL(vczero, pixelsv2); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } +} + +/* next one assumes that ((line_size % 16) == 0) */ +static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) +{ + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char blockv; + register vector unsigned short temp3, temp4, + pixelssum1, pixelssum2, pixelssum3, pixelssum4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + pixelsv1 = VEC_LD(0, pixels); + pixelsv2 = VEC_LD(1, pixels); + pixelsv3 = VEC_MERGEL(vczero, pixelsv1); + pixelsv4 = VEC_MERGEL(vczero, pixelsv2); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vcone); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + + for (i = 0; i < h ; i++) { + pixelsv1 = unaligned_load(line_size, pixels); + pixelsv2 = unaligned_load(line_size+1, pixels); + + pixelsv3 = VEC_MERGEL(vczero, pixelsv1); + pixelsv4 = VEC_MERGEL(vczero, pixelsv2); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, 
vctwo); + + pixelssum3 = vec_add(pixelssum4, vcone); + pixelssum1 = vec_add(pixelssum2, vcone); + + blockv = vec_packsu(temp3, temp4); + + VEC_ST(blockv, 0, block); + + block += line_size; + pixels += line_size; + } +} + +/* next one assumes that ((line_size % 8) == 0) */ +static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + register int i; + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; + register vector unsigned char blockv, blocktemp; + register vector unsigned short pixelssum1, pixelssum2, temp3; + + register const vector unsigned char vczero = (const vector unsigned char) + vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short) + vec_splat_u16(2); + + pixelsv1 = VEC_LD(0, pixels); + pixelsv2 = VEC_LD(1, pixels); + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + pixelsv1 = unaligned_load(line_size, pixels); + pixelsv2 = unaligned_load(line_size+1, pixels); + + pixelsv1 = VEC_MERGEH(vczero, pixelsv1); + pixelsv2 = VEC_MERGEH(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) { + blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } else { + blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + blockv = vec_avg(blocktemp, blockv); + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } +} +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags) +{ +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec; + c->avg_pixels_tab[1][0] = avg_pixels8_altivec; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; + + c->put_pixels_tab[0][0] = ff_put_pixels16_altivec; + c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; + c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; +#endif /* HAVE_ALTIVEC */ +} diff -Nrup ffmpeg.orig/libavcodec/ppc/hpeldsp_altivec.h ffmpeg/libavcodec/ppc/hpeldsp_altivec.h --- ffmpeg.orig/libavcodec/ppc/hpeldsp_altivec.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/hpeldsp_altivec.h 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2002 Brian Foley + * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PPC_HPELDSP_ALTIVEC_H +#define AVCODEC_PPC_HPELDSP_ALTIVEC_H + +#include +#include + +void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); + +#endif /* AVCODEC_PPC_HPELDSP_ALTIVEC_H */ diff -Nrup ffmpeg.orig/libavcodec/ppc/mathops.h ffmpeg/libavcodec/ppc/mathops.h --- ffmpeg.orig/libavcodec/ppc/mathops.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/mathops.h 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,79 @@ +/* + * simple math operations + * Copyright (c) 2001, 2002 Fabrice Bellard + * Copyright (c) 2006 Michael Niedermayer et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PPC_MATHOPS_H +#define AVCODEC_PPC_MATHOPS_H + +#include +#include "config.h" +#include "libavutil/common.h" + +#if HAVE_PPC4XX +/* signed 16x16 -> 32 multiply add accumulate */ +#define MAC16(rt, ra, rb) \ + __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); + +/* signed 16x16 -> 32 multiply */ +#define MUL16(ra, rb) \ + ({ int __rt; \ + __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ + __rt; }) +#endif + +#define MULH MULH +static inline av_const int MULH(int a, int b){ + int r; + __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} + +#if !ARCH_PPC64 +static inline av_const int64_t MAC64(int64_t d, int a, int b) +{ + union { uint64_t x; unsigned hl[2]; } x = { d }; + int h, l; + __asm__ ("mullw %3, %4, %5 \n\t" + "mulhw %2, %4, %5 \n\t" + "addc %1, %1, %3 \n\t" + "adde %0, %0, %2 \n\t" + : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) + : "r"(a), "r"(b)); + return x.x; +} +#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) + +static inline av_const int64_t MLS64(int64_t d, int a, int b) +{ + union { uint64_t x; unsigned hl[2]; } x = { d }; + int h, l; + __asm__ ("mullw %3, %4, %5 \n\t" + "mulhw %2, %4, %5 \n\t" + "subfc %1, %3, %1 \n\t" + "subfe %0, %2, %0 \n\t" + : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) + : "r"(a), "r"(b)); + return x.x; +} +#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) +#endif + +#endif /* AVCODEC_PPC_MATHOPS_H */ diff -Nrup ffmpeg.orig/libavcodec/ppc/mpegaudiodsp_altivec.c ffmpeg/libavcodec/ppc/mpegaudiodsp_altivec.c --- ffmpeg.orig/libavcodec/ppc/mpegaudiodsp_altivec.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/mpegaudiodsp_altivec.c 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,141 @@ +/* + * Altivec 
optimized MP3 decoding functions + * Copyright (c) 2010 Vitor Sessak + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/internal.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/util_altivec.h" +#include "libavcodec/mpegaudiodsp.h" + +#if HAVE_ALTIVEC + +#define MACS(rt, ra, rb) rt+=(ra)*(rb) +#define MLSS(rt, ra, rb) rt-=(ra)*(rb) + +#define SUM8(op, sum, w, p) \ +{ \ + op(sum, (w)[0 * 64], (p)[0 * 64]); \ + op(sum, (w)[1 * 64], (p)[1 * 64]); \ + op(sum, (w)[2 * 64], (p)[2 * 64]); \ + op(sum, (w)[3 * 64], (p)[3 * 64]); \ + op(sum, (w)[4 * 64], (p)[4 * 64]); \ + op(sum, (w)[5 * 64], (p)[5 * 64]); \ + op(sum, (w)[6 * 64], (p)[6 * 64]); \ + op(sum, (w)[7 * 64], (p)[7 * 64]); \ +} + +static void apply_window(const float *buf, const float *win1, + const float *win2, float *sum1, float *sum2, int len) +{ + const vector float *win1a = (const vector float *) win1; + const vector float *win2a = (const vector float *) win2; + const vector float *bufa = (const vector float *) buf; + vector float *sum1a = (vector float *) sum1; + vector float *sum2a = (vector float *) sum2; + vector float av_uninit(v0), av_uninit(v4); + vector float v1, v2, v3; + + len = len >> 2; + +#define MULT(a, b) \ + { \ + v1 = vec_ld(a, win1a); \ + v2 = vec_ld(b, win2a); \ + v3 = vec_ld(a, bufa); \ + v0 = vec_madd(v3, v1, v0); \ + v4 = vec_madd(v2, v3, v4); \ + } + + while (len--) { + v0 = vec_xor(v0, v0); + v4 = vec_xor(v4, v4); + + MULT( 0, 0); + MULT( 256, 64); + MULT( 512, 128); + MULT( 768, 192); + MULT(1024, 256); + MULT(1280, 320); + MULT(1536, 384); + MULT(1792, 448); + + vec_st(v0, 0, sum1a); + vec_st(v4, 0, sum2a); + sum1a++; + sum2a++; + win1a++; + win2a++; + bufa++; + } +} + +static void apply_window_mp3(float *in, float *win, int *unused, float *out, + ptrdiff_t incr) +{ + LOCAL_ALIGNED_16(float, suma, [17]); + LOCAL_ALIGNED_16(float, sumb, [17]); + LOCAL_ALIGNED_16(float, sumc, [17]); + LOCAL_ALIGNED_16(float, sumd, [17]); + + float sum; + int j; + float *out2 = out + 32 * incr; + + /* copy to avoid wrap */ + memcpy(in + 512, in, 32 * sizeof(*in)); + + apply_window(in + 16, win , win + 512, suma, sumc, 16); + apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); + + SUM8(MLSS, suma[0], win + 32, in + 48); + + sumc[ 0] = 0; + sumb[16] = 0; + sumd[16] = 0; + + out[0 ] = suma[ 0]; + out += incr; + out2 -= incr; + for(j=1;j<16;j++) { + *out = suma[ j] - sumd[16-j]; + *out2 = -sumb[16-j] - sumc[ j]; + out += incr; + out2 -= incr; + } + + sum = 0; + SUM8(MLSS, sum, win + 16 + 32, in + 32); + *out = sum; +} + +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_mpadsp_init_ppc(MPADSPContext *s) +{ +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + s->apply_window_float = 
apply_window_mp3; +#endif /* HAVE_ALTIVEC */ +} diff -Nrup ffmpeg.orig/libavcodec/ppc/videodsp.c ffmpeg/libavcodec/ppc/videodsp.c --- ffmpeg.orig/libavcodec/ppc/videodsp.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/videodsp.c 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavcodec/videodsp.h" + +static void prefetch_ppc(uint8_t *mem, ptrdiff_t stride, int h) +{ + register const uint8_t *p = mem; + do { + __asm__ volatile ("dcbt 0,%0" : : "r" (p)); + p += stride; + } while(--h); +} + +av_cold void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc) +{ + ctx->prefetch = prefetch_ppc; +} diff -Nrup ffmpeg.orig/libavcodec/ppc/vorbisdsp_altivec.c ffmpeg/libavcodec/ppc/vorbisdsp_altivec.c --- ffmpeg.orig/libavcodec/ppc/vorbisdsp_altivec.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/vorbisdsp_altivec.c 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2006 Luca Barbato + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/util_altivec.h" + +#include "libavcodec/vorbisdsp.h" + +#if HAVE_ALTIVEC +static void vorbis_inverse_coupling_altivec(float *mag, float *ang, + intptr_t blocksize) +{ + int i; + vector float m, a; + vector bool int t0, t1; + const vector unsigned int v_31 = //XXX + vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); + for (i = 0; i < blocksize; i += 4) { + m = vec_ld(0, mag+i); + a = vec_ld(0, ang+i); + t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); + t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); + a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); + t0 = (vector bool int)vec_and(a, t1); + t1 = (vector bool int)vec_andc(a, t1); + a = vec_sub(m, (vector float)t1); + m = vec_add(m, (vector float)t0); + vec_stl(a, 0, ang+i); + vec_stl(m, 0, mag+i); + } +} +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_vorbisdsp_init_ppc(VorbisDSPContext *c) +{ +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec; +#endif /* HAVE_ALTIVEC */ +} diff -Nrup ffmpeg.orig/libavcodec/ppc/vp3dsp_altivec.c ffmpeg/libavcodec/ppc/vp3dsp_altivec.c --- ffmpeg.orig/libavcodec/ppc/vp3dsp_altivec.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/vp3dsp_altivec.c 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,204 @@ +/* + * Copyright (C) 2009 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/util_altivec.h" + +#include "libavcodec/vp3dsp.h" + +#if HAVE_ALTIVEC + +static const vec_s16 constants = + {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785}; +#if HAVE_BIGENDIAN +static const vec_u8 interleave_high = + {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29}; +#else +static const vec_u8 interleave_high = + {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}; +#endif + +#define IDCT_START \ + vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\ + vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\ + vec_s16 eight = vec_splat_s16(8);\ + vec_u16 four = vec_splat_u16(4);\ +\ + vec_s16 C1 = vec_splat(constants, 1);\ + vec_s16 C2 = vec_splat(constants, 2);\ + vec_s16 C3 = vec_splat(constants, 3);\ + vec_s16 C4 = vec_splat(constants, 4);\ + vec_s16 C5 = vec_splat(constants, 5);\ + vec_s16 C6 = vec_splat(constants, 6);\ + vec_s16 C7 = vec_splat(constants, 7);\ +\ + vec_s16 b0 = vec_ld(0x00, block);\ + vec_s16 b1 = vec_ld(0x10, block);\ + vec_s16 b2 = vec_ld(0x20, block);\ + vec_s16 b3 = vec_ld(0x30, block);\ + vec_s16 b4 = vec_ld(0x40, block);\ + vec_s16 b5 = vec_ld(0x50, block);\ + vec_s16 b6 = vec_ld(0x60, block);\ + vec_s16 b7 = vec_ld(0x70, block); + +// these functions do (a*C)>>16 +// things are tricky because a is signed, but C unsigned. +// M15 is used if C fits in 15 bit unsigned (C6,C7) +// M16 is used if C requires 16 bits unsigned +static inline vec_s16 M15(vec_s16 a, vec_s16 C) +{ + return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high); +} +static inline vec_s16 M16(vec_s16 a, vec_s16 C) +{ + return vec_add(a, M15(a, C)); +} + +#define IDCT_1D(ADD, SHIFT)\ + A = vec_add(M16(b1, C1), M15(b7, C7));\ + B = vec_sub(M15(b1, C7), M16(b7, C1));\ + C = vec_add(M16(b3, C3), M16(b5, C5));\ + D = vec_sub(M16(b5, C3), M16(b3, C5));\ +\ + Ad = M16(vec_sub(A, C), C4);\ + Bd = M16(vec_sub(B, D), C4);\ +\ + Cd = vec_add(A, C);\ + Dd = vec_add(B, D);\ +\ + E = ADD(M16(vec_add(b0, b4), C4));\ + F = ADD(M16(vec_sub(b0, b4), C4));\ +\ + G = vec_add(M16(b2, C2), M15(b6, C6));\ + H = vec_sub(M15(b2, C6), M16(b6, C2));\ +\ + Ed = vec_sub(E, G);\ + Gd = vec_add(E, G);\ +\ + Add = vec_add(F, Ad);\ + Bdd = vec_sub(Bd, H);\ +\ + Fd = vec_sub(F, Ad);\ + Hd = vec_add(Bd, H);\ +\ + b0 = SHIFT(vec_add(Gd, Cd));\ + b7 = SHIFT(vec_sub(Gd, Cd));\ +\ + b1 = SHIFT(vec_add(Add, Hd));\ + b2 = SHIFT(vec_sub(Add, Hd));\ +\ + b3 = SHIFT(vec_add(Ed, Dd));\ + b4 = SHIFT(vec_sub(Ed, Dd));\ +\ + b5 = SHIFT(vec_add(Fd, Bdd));\ + b6 = SHIFT(vec_sub(Fd, Bdd)); + +#define NOP(a) a +#define ADD8(a) vec_add(a, eight) +#define SHIFT4(a) vec_sra(a, four) + +static void vp3_idct_put_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[64]) +{ + vec_u8 t; + IDCT_START + + // pixels are signed; so add 128*16 in addition to the normal 8 + vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11)); + eight = vec_add(eight, v2048); + + IDCT_1D(NOP, NOP) + TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); + IDCT_1D(ADD8, SHIFT4) + +#define PUT(a)\ + t = vec_packsu(a, a);\ + vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ + vec_ste((vec_u32)t, 4, (unsigned int *)dst); + + PUT(b0) dst += stride; + PUT(b1) dst += stride; + PUT(b2) dst += stride; + 
PUT(b3) dst += stride; + PUT(b4) dst += stride; + PUT(b5) dst += stride; + PUT(b6) dst += stride; + PUT(b7) + memset(block, 0, sizeof(*block) * 64); +} + +static void vp3_idct_add_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[64]) +{ + LOAD_ZERO; + vec_u8 t, vdst; + vec_s16 vdst_16; + vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst)); + + IDCT_START + + IDCT_1D(NOP, NOP) + TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); + IDCT_1D(ADD8, SHIFT4) + +#if HAVE_BIGENDIAN +#define GET_VDST16\ + vdst = vec_ld(0, dst);\ + vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask); +#else +#define GET_VDST16\ + vdst = vec_vsx_ld(0,dst);\ + vdst_16 = (vec_s16)vec_mergeh(vdst, zero_u8v); +#endif + +#define ADD(a)\ + GET_VDST16;\ + vdst_16 = vec_adds(a, vdst_16);\ + t = vec_packsu(vdst_16, vdst_16);\ + vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ + vec_ste((vec_u32)t, 4, (unsigned int *)dst); + + ADD(b0) dst += stride; + ADD(b1) dst += stride; + ADD(b2) dst += stride; + ADD(b3) dst += stride; + ADD(b4) dst += stride; + ADD(b5) dst += stride; + ADD(b6) dst += stride; + ADD(b7) + memset(block, 0, sizeof(*block) * 64); +} + +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags) +{ +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + c->idct_put = vp3_idct_put_altivec; + c->idct_add = vp3_idct_add_altivec; +#endif +} diff -Nrup ffmpeg.orig/libavcodec/ppc/vp8dsp_altivec.c ffmpeg/libavcodec/ppc/vp8dsp_altivec.c --- ffmpeg.orig/libavcodec/ppc/vp8dsp_altivec.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavcodec/ppc/vp8dsp_altivec.c 2018-09-05 15:45:34.368754131 +0200 @@ -0,0 +1,361 @@ +/* + * VP8 compatible video decoder + * + * Copyright (C) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/util_altivec.h" + +#include "libavcodec/vp8dsp.h" + +#include "hpeldsp_altivec.h" + +#if HAVE_ALTIVEC +#define REPT4(...) 
{ __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ } + +// h subpel filter uses msum to multiply+add 4 pixel taps at once +static const vec_s8 h_subpel_filters_inner[7] = +{ + REPT4( -6, 123, 12, -1), + REPT4(-11, 108, 36, -8), + REPT4( -9, 93, 50, -6), + REPT4(-16, 77, 77, -16), + REPT4( -6, 50, 93, -9), + REPT4( -8, 36, 108, -11), + REPT4( -1, 12, 123, -6), +}; + +// for 6tap filters, these are the outer two taps +// The zeros mask off pixels 4-7 when filtering 0-3 +// and vice-versa +static const vec_s8 h_subpel_filters_outer[3] = +{ + REPT4(0, 0, 2, 1), + REPT4(0, 0, 3, 3), + REPT4(0, 0, 1, 2), +}; + +#define LOAD_H_SUBPEL_FILTER(i) \ + vec_s8 filter_inner = h_subpel_filters_inner[i]; \ + vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \ + vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2) + +#if HAVE_BIGENDIAN +#define GET_PIXHL(offset) \ + a = vec_ld((offset)-is6tap-1, src); \ + b = vec_ld((offset)-is6tap-1+15, src); \ + pixh = vec_perm(a, b, permh##offset); \ + pixl = vec_perm(a, b, perml##offset) + +#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset) +#else +#define GET_PIXHL(offset) \ + a = vec_vsx_ld((offset)-is6tap-1, src); \ + pixh = vec_perm(a, a, perm_inner); \ + pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4))) + +#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer) +#endif + +#define FILTER_H(dstv, off) \ + GET_PIXHL(off); \ + filth = vec_msum(filter_inner, pixh, c64); \ + filtl = vec_msum(filter_inner, pixl, c64); \ +\ + if (is6tap) { \ + GET_OUTER(off); \ + filth = vec_msum(filter_outerh, outer, filth); \ + filtl = vec_msum(filter_outerl, outer, filtl); \ + } \ + if (w == 4) \ + filtl = filth; /* discard pixels 4-7 */ \ + dstv = vec_packs(filth, filtl); \ + dstv = vec_sra(dstv, c7) + +static av_always_inline +void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int h, int mx, int w, int is6tap) +{ + LOAD_H_SUBPEL_FILTER(mx-1); +#if HAVE_BIGENDIAN + vec_u8 align_vec0, align_vec8, permh0, permh8; + vec_u8 perm_6tap0, perm_6tap8, perml0, perml8; + vec_u8 b; +#endif + vec_u8 filt, a, pixh, pixl, outer; + vec_s16 f16h, f16l; + vec_s32 filth, filtl; + + vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 }; + vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 }; + vec_u8 perm_inner = is6tap ? 
perm_inner6 : perm_inner4; + vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 }; + vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6)); + vec_u16 c7 = vec_splat_u16(7); + +#if HAVE_BIGENDIAN + align_vec0 = vec_lvsl( -is6tap-1, src); + align_vec8 = vec_lvsl(8-is6tap-1, src); + + permh0 = vec_perm(align_vec0, align_vec0, perm_inner); + permh8 = vec_perm(align_vec8, align_vec8, perm_inner); + perm_inner = vec_add(perm_inner, vec_splat_u8(4)); + perml0 = vec_perm(align_vec0, align_vec0, perm_inner); + perml8 = vec_perm(align_vec8, align_vec8, perm_inner); + perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer); + perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer); +#endif + + while (h --> 0) { + FILTER_H(f16h, 0); + + if (w == 16) { + FILTER_H(f16l, 8); + filt = vec_packsu(f16h, f16l); + vec_st(filt, 0, dst); + } else { + filt = vec_packsu(f16h, f16h); + vec_ste((vec_u32)filt, 0, (uint32_t*)dst); + if (w == 8) + vec_ste((vec_u32)filt, 4, (uint32_t*)dst); + } + src += src_stride; + dst += dst_stride; + } +} + +// v subpel filter does a simple vertical multiply + add +static const vec_u8 v_subpel_filters[7] = +{ + { 0, 6, 123, 12, 1, 0 }, + { 2, 11, 108, 36, 8, 1 }, + { 0, 9, 93, 50, 6, 0 }, + { 3, 16, 77, 77, 16, 3 }, + { 0, 6, 50, 93, 9, 0 }, + { 1, 8, 36, 108, 11, 2 }, + { 0, 1, 12, 123, 6, 0 }, +}; + +#define LOAD_V_SUBPEL_FILTER(i) \ + vec_u8 subpel_filter = v_subpel_filters[i]; \ + vec_u8 f0 = vec_splat(subpel_filter, 0); \ + vec_u8 f1 = vec_splat(subpel_filter, 1); \ + vec_u8 f2 = vec_splat(subpel_filter, 2); \ + vec_u8 f3 = vec_splat(subpel_filter, 3); \ + vec_u8 f4 = vec_splat(subpel_filter, 4); \ + vec_u8 f5 = vec_splat(subpel_filter, 5) + +#define FILTER_V(dstv, vec_mul) \ + s1f = (vec_s16)vec_mul(s1, f1); \ + s2f = (vec_s16)vec_mul(s2, f2); \ + s3f = (vec_s16)vec_mul(s3, f3); \ + s4f = (vec_s16)vec_mul(s4, f4); \ + s2f = vec_subs(s2f, s1f); \ + s3f = vec_subs(s3f, s4f); \ + if (is6tap) { \ + s0f = (vec_s16)vec_mul(s0, f0); \ + s5f = (vec_s16)vec_mul(s5, f5); \ + s2f = vec_adds(s2f, s0f); \ + s3f = vec_adds(s3f, s5f); \ + } \ + dstv = vec_adds(s2f, s3f); \ + dstv = vec_adds(dstv, c64); \ + dstv = vec_sra(dstv, c7) + +#if HAVE_BIGENDIAN +#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm) +#else +#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s)) +#endif + +static av_always_inline +void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, + uint8_t *src, ptrdiff_t src_stride, + int h, int my, int w, int is6tap) +{ + LOAD_V_SUBPEL_FILTER(my-1); + vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl; + vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l; + vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6)); + vec_u16 c7 = vec_splat_u16(7); + +#if HAVE_BIGENDIAN + // we want pixels 0-7 to be in the even positions and 8-15 in the odd, + // so combine this permute with the alignment permute vector + align_vech = vec_lvsl(0, src); + align_vecl = vec_sld(align_vech, align_vech, 8); + if (w ==16) + perm_vec = vec_mergeh(align_vech, align_vecl); + else + perm_vec = vec_mergeh(align_vech, align_vech); +#endif + + if (is6tap) + s0 = LOAD_HL(-2*src_stride, src, perm_vec); + s1 = LOAD_HL(-1*src_stride, src, perm_vec); + s2 = LOAD_HL( 0*src_stride, src, perm_vec); + s3 = LOAD_HL( 1*src_stride, src, perm_vec); + if (is6tap) + s4 = LOAD_HL( 2*src_stride, src, perm_vec); + + src += (2+is6tap)*src_stride; + + while (h --> 0) { + if (is6tap) + s5 = LOAD_HL(0, src, perm_vec); + else + s4 = LOAD_HL(0, 
src, perm_vec); + + FILTER_V(f16h, vec_mule); + + if (w == 16) { + FILTER_V(f16l, vec_mulo); + filt = vec_packsu(f16h, f16l); + vec_st(filt, 0, dst); + } else { + filt = vec_packsu(f16h, f16h); + if (w == 4) + filt = (vec_u8)vec_splat((vec_u32)filt, 0); + else + vec_ste((vec_u32)filt, 4, (uint32_t*)dst); + vec_ste((vec_u32)filt, 0, (uint32_t*)dst); + } + + if (is6tap) + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + if (is6tap) + s4 = s5; + + dst += dst_stride; + src += src_stride; + } +} + +#define EPEL_FUNCS(WIDTH, TAPS) \ +static av_noinline \ +void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \ +{ \ + put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \ +} \ +\ +static av_noinline \ +void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \ +{ \ + put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \ +} + +#define EPEL_HV(WIDTH, HTAPS, VTAPS) \ +static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \ +{ \ + DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \ + if (VTAPS == 6) { \ + put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \ + put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \ + } else { \ + put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \ + put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \ + } \ +} + +EPEL_FUNCS(16,6) +EPEL_FUNCS(8, 6) +EPEL_FUNCS(8, 4) +EPEL_FUNCS(4, 6) +EPEL_FUNCS(4, 4) + +EPEL_HV(16, 6,6) +EPEL_HV(8, 6,6) +EPEL_HV(8, 4,6) +EPEL_HV(8, 6,4) +EPEL_HV(8, 4,4) +EPEL_HV(4, 6,6) +EPEL_HV(4, 4,6) +EPEL_HV(4, 6,4) +EPEL_HV(4, 4,4) + +static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) +{ + register vector unsigned char perm; + int i; + register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1; + register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2; + register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2; + +#if HAVE_BIGENDIAN + perm = vec_lvsl(0, src); +#endif +// hand-unrolling the loop by 4 gains about 15% +// mininum execution time goes from 74 to 60 cycles +// it's faster than -funroll-loops, but using +// -funroll-loops w/ this is bad - 74 cycles again. 
+// all this is on a 7450, tuning for the 7450 + for (i = 0; i < h; i += 4) { + vec_st(load_with_perm_vec(0, src, perm), 0, dst); + vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst); + vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst); + vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst); + src += sstride4; + dst += dstride4; + } +} + +#endif /* HAVE_ALTIVEC */ + + +av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c) +{ +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec; + c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec; + c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec; + c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec; + + c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec; + c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec; + c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec; + c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec; + + c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec; + c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec; + c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec; + c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec; + + c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec; + c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec; + c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec; + c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec; + + c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec; + c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec; + c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec; + c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec; +#endif /* HAVE_ALTIVEC */ +} diff -Nrup ffmpeg.orig/libavutil/ppc/cpu.c ffmpeg/libavutil/ppc/cpu.c --- ffmpeg.orig/libavutil/ppc/cpu.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/cpu.c 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,162 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#ifdef __APPLE__ +#include +#elif defined(__linux__) +#include +#include +#include +#if HAVE_UNISTD_H +#include +#endif +#elif defined(__OpenBSD__) +#include +#include +#include +#elif defined(__AMIGAOS4__) +#include +#include +#include +#endif /* __APPLE__ */ + +#include "libavutil/avassert.h" +#include "libavutil/cpu.h" +#include "libavutil/cpu_internal.h" + +/** + * This function MAY rely on signal() or fork() in order to make sure AltiVec + * is present. 
+ */ +int ff_get_cpu_flags_ppc(void) +{ +#if HAVE_ALTIVEC +#ifdef __AMIGAOS4__ + ULONG result = 0; + extern struct ExecIFace *IExec; + + IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE); + if (result == VECTORTYPE_ALTIVEC) + return AV_CPU_FLAG_ALTIVEC; + return 0; +#elif defined(__APPLE__) || defined(__OpenBSD__) +#ifdef __OpenBSD__ + int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC}; +#else + int sels[2] = {CTL_HW, HW_VECTORUNIT}; +#endif + int has_vu = 0; + size_t len = sizeof(has_vu); + int err; + + err = sysctl(sels, 2, &has_vu, &len, NULL, 0); + + if (err == 0) + return has_vu ? AV_CPU_FLAG_ALTIVEC : 0; + return 0; +#elif defined(__linux__) + // The linux kernel could have the altivec support disabled + // even if the cpu has it. + int i, ret = 0; + int fd = open("/proc/self/auxv", O_RDONLY); + unsigned long buf[64] = { 0 }; + ssize_t count; + + if (fd < 0) + return 0; + + while ((count = read(fd, buf, sizeof(buf))) > 0) { + for (i = 0; i < count / sizeof(*buf); i += 2) { + if (buf[i] == AT_NULL) + goto out; + if (buf[i] == AT_HWCAP) { + if (buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC) + ret = AV_CPU_FLAG_ALTIVEC; +#ifdef PPC_FEATURE_HAS_VSX + if (buf[i + 1] & PPC_FEATURE_HAS_VSX) + ret |= AV_CPU_FLAG_VSX; +#endif +#ifdef PPC_FEATURE_ARCH_2_07 + if (buf[i + 1] & PPC_FEATURE_HAS_POWER8) + ret |= AV_CPU_FLAG_POWER8; +#endif + if (ret & AV_CPU_FLAG_VSX) + av_assert0(ret & AV_CPU_FLAG_ALTIVEC); + goto out; + } + } + } + +out: + close(fd); + return ret; +#elif CONFIG_RUNTIME_CPUDETECT && defined(__linux__) +#define PVR_G4_7400 0x000C +#define PVR_G5_970 0x0039 +#define PVR_G5_970FX 0x003C +#define PVR_G5_970MP 0x0044 +#define PVR_G5_970GX 0x0045 +#define PVR_POWER6 0x003E +#define PVR_POWER7 0x003F +#define PVR_POWER8 0x004B +#define PVR_CELL_PPU 0x0070 + int ret = 0; + int proc_ver; + // Support of mfspr PVR emulation added in Linux 2.6.17. + __asm__ volatile("mfspr %0, 287" : "=r" (proc_ver)); + proc_ver >>= 16; + if (proc_ver & 0x8000 || + proc_ver == PVR_G4_7400 || + proc_ver == PVR_G5_970 || + proc_ver == PVR_G5_970FX || + proc_ver == PVR_G5_970MP || + proc_ver == PVR_G5_970GX || + proc_ver == PVR_POWER6 || + proc_ver == PVR_POWER7 || + proc_ver == PVR_POWER8 || + proc_ver == PVR_CELL_PPU) + ret = AV_CPU_FLAG_ALTIVEC; + if (proc_ver == PVR_POWER7 || + proc_ver == PVR_POWER8) + ret |= AV_CPU_FLAG_VSX; + if (proc_ver == PVR_POWER8) + ret |= AV_CPU_FLAG_POWER8; + + return ret; +#else + // Since we were compiled for AltiVec, just assume we have it + // until someone comes up with a proper way (not involving signal hacks). + return AV_CPU_FLAG_ALTIVEC; +#endif /* __AMIGAOS4__ */ +#endif /* HAVE_ALTIVEC */ + return 0; +} + +size_t ff_get_cpu_max_align_ppc(void) +{ + int flags = av_get_cpu_flags(); + + if (flags & (AV_CPU_FLAG_ALTIVEC | + AV_CPU_FLAG_VSX | + AV_CPU_FLAG_POWER8)) + return 16; + + return 8; +} diff -Nrup ffmpeg.orig/libavutil/ppc/cpu.h ffmpeg/libavutil/ppc/cpu.h --- ffmpeg.orig/libavutil/ppc/cpu.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/cpu.h 2018-09-07 13:32:42.161848338 +0200 @@ -0,0 +1,30 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_PPC_CPU_H +#define AVUTIL_PPC_CPU_H + +#include "config.h" +#include "libavutil/cpu.h" +#include "libavutil/cpu_internal.h" + +#define PPC_ALTIVEC(flags) CPUEXT(flags, ALTIVEC) +#define PPC_VSX(flags) CPUEXT(flags, VSX) +#define PPC_POWER8(flags) CPUEXT(flags, POWER8) + +#endif /* AVUTIL_PPC_CPU_H */ diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_altivec.c ffmpeg/libavutil/ppc/float_dsp_altivec.c --- ffmpeg.orig/libavutil/ppc/float_dsp_altivec.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/float_dsp_altivec.c 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2006 Luca Barbato + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "util_altivec.h" +#include "float_dsp_altivec.h" + +void ff_vector_fmul_altivec(float *dst, const float *src0, const float *src1, + int len) +{ + int i; + vec_f d0, d1, s, zero = (vec_f)vec_splat_u32(0); + for (i = 0; i < len - 7; i += 8) { + d0 = vec_ld( 0, src0 + i); + s = vec_ld( 0, src1 + i); + d1 = vec_ld(16, src0 + i); + d0 = vec_madd(d0, s, zero); + d1 = vec_madd(d1, vec_ld(16, src1 + i), zero); + vec_st(d0, 0, dst + i); + vec_st(d1, 16, dst + i); + } +} + +void ff_vector_fmul_window_altivec(float *dst, const float *src0, + const float *src1, const float *win, int len) +{ + vec_f zero, t0, t1, s0, s1, wi, wj; + const vec_u8 reverse = vcprm(3, 2, 1, 0); + int i, j; + + dst += len; + win += len; + src0 += len; + + zero = (vec_f)vec_splat_u32(0); + + for (i = -len * 4, j = len * 4 - 16; i < 0; i += 16, j -= 16) { + s0 = vec_ld(i, src0); + s1 = vec_ld(j, src1); + wi = vec_ld(i, win); + wj = vec_ld(j, win); + + s1 = vec_perm(s1, s1, reverse); + wj = vec_perm(wj, wj, reverse); + + t0 = vec_madd(s0, wj, zero); + t0 = vec_nmsub(s1, wi, t0); + t1 = vec_madd(s0, wi, zero); + t1 = vec_madd(s1, wj, t1); + t1 = vec_perm(t1, t1, reverse); + + vec_st(t0, i, dst); + vec_st(t1, j, dst); + } +} + +void ff_vector_fmul_add_altivec(float *dst, const float *src0, + const float *src1, const float *src2, + int len) +{ + int i; + vec_f d, ss0, ss1, ss2, t0, t1, edges; + + for (i = 0; i < len - 3; i += 4) { + t0 = vec_ld(0, dst + i); + t1 = vec_ld(15, dst + i); + ss0 = vec_ld(0, src0 + i); + ss1 = vec_ld(0, src1 + i); + ss2 = vec_ld(0, src2 + i); + edges = vec_perm(t1, t0, vcprm(0, 1, 2, 3)); + d = vec_madd(ss0, ss1, ss2); + t1 = 
vec_perm(d, edges, vcprm(s0,s1,s2,s3)); + t0 = vec_perm(edges, d, vcprm(s0,s1,s2,s3)); + vec_st(t1, 15, dst + i); + vec_st(t0, 0, dst + i); + } +} + +void ff_vector_fmul_reverse_altivec(float *dst, const float *src0, + const float *src1, int len) +{ + int i; + vec_f d, s0, s1, h0, l0, s2, s3; + vec_f zero = (vec_f)vec_splat_u32(0); + + src1 += len-4; + for(i = 0; i < len - 7; i += 8) { + s1 = vec_ld(0, src1 - i); // [a,b,c,d] + s0 = vec_ld(0, src0 + i); + l0 = vec_mergel(s1, s1); // [c,c,d,d] + s3 = vec_ld(-16, src1 - i); + h0 = vec_mergeh(s1, s1); // [a,a,b,b] + s2 = vec_ld(16, src0 + i); + s1 = vec_mergeh(vec_mergel(l0, h0), // [d,b,d,b] + vec_mergeh(l0, h0)); // [c,a,c,a] + // [d,c,b,a] + l0 = vec_mergel(s3, s3); + d = vec_madd(s0, s1, zero); + h0 = vec_mergeh(s3, s3); + vec_st(d, 0, dst + i); + s3 = vec_mergeh(vec_mergel(l0, h0), + vec_mergeh(l0, h0)); + d = vec_madd(s2, s3, zero); + vec_st(d, 16, dst + i); + } +} diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_altivec.h ffmpeg/libavutil/ppc/float_dsp_altivec.h --- ffmpeg.orig/libavutil/ppc/float_dsp_altivec.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/float_dsp_altivec.h 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2006 Luca Barbato + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H +#define AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H + +void ff_vector_fmul_altivec(float *dst, const float *src0, + const float *src1, int len); + +void ff_vector_fmul_window_altivec(float *dst, const float *src0, + const float *src1, const float *win, + int len); + +void ff_vector_fmul_add_altivec(float *dst, const float *src0, + const float *src1, const float *src2, + int len); + +void ff_vector_fmul_reverse_altivec(float *dst, const float *src0, + const float *src1, int len); + +#endif /* AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H */ diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_init.c ffmpeg/libavutil/ppc/float_dsp_init.c --- ffmpeg.orig/libavutil/ppc/float_dsp_init.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/float_dsp_init.c 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2006 Luca Barbato + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/float_dsp.h" +#include "libavutil/ppc/cpu.h" +#include "float_dsp_altivec.h" +#include "float_dsp_vsx.h" + +av_cold void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int bit_exact) +{ + if (PPC_ALTIVEC(av_get_cpu_flags())) { + fdsp->vector_fmul = ff_vector_fmul_altivec; + fdsp->vector_fmul_add = ff_vector_fmul_add_altivec; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_altivec; + + if (!bit_exact) { + fdsp->vector_fmul_window = ff_vector_fmul_window_altivec; + } + } + + // The disabled function below are near identical to altivec and have + // been disabled to reduce code duplication + if (PPC_VSX(av_get_cpu_flags())) { +// fdsp->vector_fmul = ff_vector_fmul_vsx; + fdsp->vector_fmul_add = ff_vector_fmul_add_vsx; +// fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vsx; + +// if (!bit_exact) { +// fdsp->vector_fmul_window = ff_vector_fmul_window_vsx; +// } + } +} diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_vsx.c ffmpeg/libavutil/ppc/float_dsp_vsx.c --- ffmpeg.orig/libavutil/ppc/float_dsp_vsx.c 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/float_dsp_vsx.c 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2015 Luca Barbato + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "util_altivec.h" +#include "float_dsp_vsx.h" + +void ff_vector_fmul_vsx(float *dst, + const float *src0, const float *src1, + int len) +{ + int i; + vec_f d0, d1, zero = (vec_f)vec_splat_u32(0); + for (i = 0; i < len - 7; i += 8) { + d0 = vec_vsx_ld( 0, src0 + i); + d1 = vec_vsx_ld(16, src0 + i); + d0 = vec_madd(d0, vec_vsx_ld( 0, src1 + i), zero); + d1 = vec_madd(d1, vec_vsx_ld(16, src1 + i), zero); + vec_vsx_st(d0, 0, dst + i); + vec_vsx_st(d1, 16, dst + i); + } +} + +void ff_vector_fmul_window_vsx(float *dst, const float *src0, + const float *src1, const float *win, + int len) +{ + vec_f zero, t0, t1, s0, s1, wi, wj; + const vec_u8 reverse = vcprm(3, 2, 1, 0); + int i, j; + + dst += len; + win += len; + src0 += len; + + zero = (vec_f)vec_splat_u32(0); + + for (i = -len * 4, j = len * 4 - 16; i < 0; i += 16, j -= 16) { + s0 = vec_vsx_ld(i, src0); + s1 = vec_vsx_ld(j, src1); + wi = vec_vsx_ld(i, win); + wj = vec_vsx_ld(j, win); + + s1 = vec_perm(s1, s1, reverse); + wj = vec_perm(wj, wj, reverse); + + t0 = vec_madd(s0, wj, zero); + t0 = vec_nmsub(s1, wi, t0); + t1 = vec_madd(s0, wi, zero); + t1 = vec_madd(s1, wj, t1); + t1 = vec_perm(t1, t1, reverse); + + vec_vsx_st(t0, i, dst); + vec_vsx_st(t1, j, dst); + } +} + +void ff_vector_fmul_add_vsx(float *dst, const float *src0, + const float *src1, const float *src2, + int len) +{ + int i; + vec_f d, s0, s1, s2; + + for (i = 0; i < len - 3; i += 4) { + s0 = vec_vsx_ld(0, src0 + i); + s1 = vec_vsx_ld(0, src1 + i); + s2 = vec_vsx_ld(0, src2 + i); + d = vec_madd(s0, s1, s2); + vec_vsx_st(d, 0, dst + i); + } +} + +void ff_vector_fmul_reverse_vsx(float *dst, const float *src0, + const float *src1, int len) +{ + int i; + vec_f d, s0, s1, h0, l0, s2, s3; + vec_f zero = (vec_f)vec_splat_u32(0); + + src1 += len - 4; + for (i = 0; i < len - 7; i += 8) { + s1 = vec_vsx_ld(0, src1 - i); // [a,b,c,d] + s0 = vec_vsx_ld(0, src0 + i); + l0 = vec_mergel(s1, s1); // [c,c,d,d] + s3 = vec_vsx_ld(-16, src1 - i); + h0 = vec_mergeh(s1, s1); // [a,a,b,b] + s2 = vec_vsx_ld(16, src0 + i); + s1 = vec_mergeh(vec_mergel(l0, h0), // [d,b,d,b] + vec_mergeh(l0, h0)); // [c,a,c,a] + // [d,c,b,a] + l0 = vec_mergel(s3, s3); + d = vec_madd(s0, s1, zero); + h0 = vec_mergeh(s3, s3); + vec_vsx_st(d, 0, dst + i); + s3 = vec_mergeh(vec_mergel(l0, h0), + vec_mergeh(l0, h0)); + d = vec_madd(s2, s3, zero); + vec_vsx_st(d, 16, dst + i); + } +} diff -Nrup ffmpeg.orig/libavutil/ppc/float_dsp_vsx.h ffmpeg/libavutil/ppc/float_dsp_vsx.h --- ffmpeg.orig/libavutil/ppc/float_dsp_vsx.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/float_dsp_vsx.h 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2015 Luca Barbato + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_PPC_FLOAT_DSP_VSX_H +#define AVUTIL_PPC_FLOAT_DSP_VSX_H + +void ff_vector_fmul_vsx(float *dst, const float *src0, + const float *src1, int len); + +void ff_vector_fmul_window_vsx(float *dst, const float *src0, + const float *src1, const float *win, + int len); + +void ff_vector_fmul_add_vsx(float *dst, const float *src0, + const float *src1, const float *src2, + int len); + +void ff_vector_fmul_reverse_vsx(float *dst, const float *src0, + const float *src1, int len); + +#endif /* AVUTIL_PPC_FLOAT_DSP_VSX_H */ diff -Nrup ffmpeg.orig/libavutil/ppc/intreadwrite.h ffmpeg/libavutil/ppc/intreadwrite.h --- ffmpeg.orig/libavutil/ppc/intreadwrite.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/intreadwrite.h 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_PPC_INTREADWRITE_H +#define AVUTIL_PPC_INTREADWRITE_H + +#include +#include "config.h" + +#if HAVE_XFORM_ASM + +#if HAVE_BIGENDIAN +#define AV_RL16 av_read_bswap16 +#define AV_WL16 av_write_bswap16 +#define AV_RL32 av_read_bswap32 +#define AV_WL32 av_write_bswap32 +#define AV_RL64 av_read_bswap64 +#define AV_WL64 av_write_bswap64 + +#else +#define AV_RB16 av_read_bswap16 +#define AV_WB16 av_write_bswap16 +#define AV_RB32 av_read_bswap32 +#define AV_WB32 av_write_bswap32 +#define AV_RB64 av_read_bswap64 +#define AV_WB64 av_write_bswap64 + +#endif + +static av_always_inline uint16_t av_read_bswap16(const void *p) +{ + uint16_t v; + __asm__ ("lhbrx %0, %y1" : "=r"(v) : "Z"(*(const uint16_t*)p)); + return v; +} + +static av_always_inline void av_write_bswap16(void *p, uint16_t v) +{ + __asm__ ("sthbrx %1, %y0" : "=Z"(*(uint16_t*)p) : "r"(v)); +} + +static av_always_inline uint32_t av_read_bswap32(const void *p) +{ + uint32_t v; + __asm__ ("lwbrx %0, %y1" : "=r"(v) : "Z"(*(const uint32_t*)p)); + return v; +} + +static av_always_inline void av_write_bswap32(void *p, uint32_t v) +{ + __asm__ ("stwbrx %1, %y0" : "=Z"(*(uint32_t*)p) : "r"(v)); +} + +#if HAVE_LDBRX + +static av_always_inline uint64_t av_read_bswap64(const void *p) +{ + uint64_t v; + __asm__ ("ldbrx %0, %y1" : "=r"(v) : "Z"(*(const uint64_t*)p)); + return v; +} + +static av_always_inline void av_write_bswap64(void *p, uint64_t v) +{ + __asm__ ("stdbrx %1, %y0" : "=Z"(*(uint64_t*)p) : "r"(v)); +} + +#else + +static av_always_inline uint64_t av_read_bswap64(const void *p) +{ + union { uint64_t v; uint32_t hl[2]; } v; + __asm__ ("lwbrx %0, %y2 \n\t" + "lwbrx %1, %y3 \n\t" + : "=&r"(v.hl[1]), "=r"(v.hl[0]) + : 
"Z"(*(const uint32_t*)p), "Z"(*((const uint32_t*)p+1))); + return v.v; +} + +static av_always_inline void av_write_bswap64(void *p, uint64_t v) +{ + union { uint64_t v; uint32_t hl[2]; } vv = { v }; + __asm__ ("stwbrx %2, %y0 \n\t" + "stwbrx %3, %y1 \n\t" + : "=Z"(*(uint32_t*)p), "=Z"(*((uint32_t*)p+1)) + : "r"(vv.hl[1]), "r"(vv.hl[0])); +} + +#endif /* HAVE_LDBRX */ + +#endif /* HAVE_XFORM_ASM */ + +#endif /* AVUTIL_PPC_INTREADWRITE_H */ diff -Nrup ffmpeg.orig/libavutil/ppc/timer.h ffmpeg/libavutil/ppc/timer.h --- ffmpeg.orig/libavutil/ppc/timer.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/timer.h 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2005 Luca Barbato + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_PPC_TIMER_H +#define AVUTIL_PPC_TIMER_H + +#include + +#include "config.h" + +#define AV_READ_TIME read_time + +static inline uint64_t read_time(void) +{ + uint32_t tbu, tbl, temp; + + /* from section 2.2.1 of the 32-bit PowerPC PEM */ + __asm__ volatile( + "mftbu %2\n" + "mftb %0\n" + "mftbu %1\n" + "cmpw %2,%1\n" + "bne $-0x10\n" + : "=r"(tbl), "=r"(tbu), "=r"(temp) + : + : "cc"); + + return (((uint64_t)tbu)<<32) | (uint64_t)tbl; +} + +#endif /* AVUTIL_PPC_TIMER_H */ diff -Nrup ffmpeg.orig/libavutil/ppc/util_altivec.h ffmpeg/libavutil/ppc/util_altivec.h --- ffmpeg.orig/libavutil/ppc/util_altivec.h 1970-01-01 01:00:00.000000000 +0100 +++ ffmpeg/libavutil/ppc/util_altivec.h 2018-09-05 15:45:34.938766223 +0200 @@ -0,0 +1,195 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Contains misc utility macros and inline functions + */ + +#ifndef AVUTIL_PPC_UTIL_ALTIVEC_H +#define AVUTIL_PPC_UTIL_ALTIVEC_H + +#include + +#include "config.h" + +/*********************************************************************** + * Vector types + **********************************************************************/ +#define vec_u8 vector unsigned char +#define vec_s8 vector signed char +#define vec_u16 vector unsigned short +#define vec_s16 vector signed short +#define vec_u32 vector unsigned int +#define vec_s32 vector signed int +#define vec_f vector float + +/*********************************************************************** + * Null vector + **********************************************************************/ +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 ) + +#define zero_u8v (vec_u8) zerov +#define zero_s8v (vec_s8) zerov +#define zero_u16v (vec_u16) zerov +#define zero_s16v (vec_s16) zerov +#define zero_u32v (vec_u32) zerov +#define zero_s32v (vec_s32) zerov + +#if HAVE_ALTIVEC +#include + +// used to build registers permutation vectors (vcprm) +// the 's' are for words in the _s_econd vector +#define WORD_0 0x00,0x01,0x02,0x03 +#define WORD_1 0x04,0x05,0x06,0x07 +#define WORD_2 0x08,0x09,0x0a,0x0b +#define WORD_3 0x0c,0x0d,0x0e,0x0f +#define WORD_s0 0x10,0x11,0x12,0x13 +#define WORD_s1 0x14,0x15,0x16,0x17 +#define WORD_s2 0x18,0x19,0x1a,0x1b +#define WORD_s3 0x1c,0x1d,0x1e,0x1f +#define vcprm(a,b,c,d) (const vec_u8){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} + +#define SWP_W2S0 0x02,0x03,0x00,0x01 +#define SWP_W2S1 0x06,0x07,0x04,0x05 +#define SWP_W2S2 0x0a,0x0b,0x08,0x09 +#define SWP_W2S3 0x0e,0x0f,0x0c,0x0d +#define SWP_W2Ss0 0x12,0x13,0x10,0x11 +#define SWP_W2Ss1 0x16,0x17,0x14,0x15 +#define SWP_W2Ss2 0x1a,0x1b,0x18,0x19 +#define SWP_W2Ss3 0x1e,0x1f,0x1c,0x1d +#define vcswapi2s(a,b,c,d) (const vector unsigned char){SWP_W2S ## a, SWP_W2S ## b, SWP_W2S ## c, SWP_W2S ## d} + +#define vcswapc() \ + (const vector unsigned char){0x0f,0x0e,0x0d,0x0c,0x0b,0x0a,0x09,0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00} + + +// Transpose 8x8 matrix of 16-bit elements (in-place) +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ +do { \ + vec_s16 A1, B1, C1, D1, E1, F1, G1, H1; \ + vec_s16 A2, B2, C2, D2, E2, F2, G2, H2; \ + \ + A1 = vec_mergeh (a, e); \ + B1 = vec_mergel (a, e); \ + C1 = vec_mergeh (b, f); \ + D1 = vec_mergel (b, f); \ + E1 = vec_mergeh (c, g); \ + F1 = vec_mergel (c, g); \ + G1 = vec_mergeh (d, h); \ + H1 = vec_mergel (d, h); \ + \ + A2 = vec_mergeh (A1, E1); \ + B2 = vec_mergel (A1, E1); \ + C2 = vec_mergeh (B1, F1); \ + D2 = vec_mergel (B1, F1); \ + E2 = vec_mergeh (C1, G1); \ + F2 = vec_mergel (C1, G1); \ + G2 = vec_mergeh (D1, H1); \ + H2 = vec_mergel (D1, H1); \ + \ + a = vec_mergeh (A2, E2); \ + b = vec_mergel (A2, E2); \ + c = vec_mergeh (B2, F2); \ + d = vec_mergel (B2, F2); \ + e = vec_mergeh (C2, G2); \ + f = vec_mergel (C2, G2); \ + g = vec_mergeh (D2, H2); \ + h = vec_mergel (D2, H2); \ +} while (0) + + +#if HAVE_BIGENDIAN +#define VEC_LD(offset,b) \ + vec_perm(vec_ld(offset, b), vec_ld((offset)+15, b), vec_lvsl(offset, b)) +#else +#define VEC_LD(offset,b) \ + vec_vsx_ld(offset, b) +#endif + +/** @brief loads unaligned vector @a *src with offset @a offset + and returns it */ +#if HAVE_BIGENDIAN +static 
inline vec_u8 unaligned_load(int offset, const uint8_t *src) +{ + register vec_u8 first = vec_ld(offset, src); + register vec_u8 second = vec_ld(offset + 15, src); + register vec_u8 mask = vec_lvsl(offset, src); + return vec_perm(first, second, mask); +} +static inline vec_u8 load_with_perm_vec(int offset, const uint8_t *src, vec_u8 perm_vec) +{ + vec_u8 a = vec_ld(offset, src); + vec_u8 b = vec_ld(offset + 15, src); + return vec_perm(a, b, perm_vec); +} +#else +#define unaligned_load(a,b) VEC_LD(a,b) +#define load_with_perm_vec(a,b,c) VEC_LD(a,b) +#endif + + +/** + * loads vector known misalignment + * @param perm_vec the align permute vector to combine the two loads from lvsl + */ + +#define vec_unaligned_load(b) VEC_LD(0, b) + +#if HAVE_BIGENDIAN +#define VEC_MERGEH(a, b) vec_mergeh(a, b) +#define VEC_MERGEL(a, b) vec_mergel(a, b) +#else +#define VEC_MERGEH(a, b) vec_mergeh(b, a) +#define VEC_MERGEL(a, b) vec_mergel(b, a) +#endif + +#if HAVE_BIGENDIAN +#define VEC_ST(a,b,c) vec_st(a,b,c) +#else +#define VEC_ST(a,b,c) vec_vsx_st(a,b,c) +#endif + +#if HAVE_BIGENDIAN +#define VEC_SPLAT16(a,b) vec_splat((vec_s16)(a), b) +#else +#define VEC_SPLAT16(a,b) vec_splat((vec_s16)(vec_perm(a, a, vcswapi2s(0,1,2,3))), b) +#endif + +#if HAVE_BIGENDIAN +#define VEC_SLD16(a,b,c) vec_sld(a, b, c) +#else +#define VEC_SLD16(a,b,c) vec_sld(b, a, c) +#endif + +#endif /* HAVE_ALTIVEC */ + +#if HAVE_VSX +#if HAVE_BIGENDIAN +#define vsx_ld_u8_s16(off, p) \ + ((vec_s16)vec_mergeh((vec_u8)vec_splat_u8(0), \ + (vec_u8)vec_vsx_ld((off), (p)))) +#else +#define vsx_ld_u8_s16(off, p) \ + ((vec_s16)vec_mergeh((vec_u8)vec_vsx_ld((off), (p)), \ + (vec_u8)vec_splat_u8(0))) +#endif /* HAVE_BIGENDIAN */ +#endif /* HAVE_VSX */ + +#endif /* AVUTIL_PPC_UTIL_ALTIVEC_H */
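
For reference only (not part of the patch): the *_xy2 half-pel functions in the hpeldsp hunk above — put_pixels8_xy2_altivec, put_no_rnd_pixels8_xy2_altivec and their 16-pixel counterparts — all compute a 2x2 box average per output pixel; the rounding variants add the vctwo constant before the shift and the no_rnd variants add vcone. A minimal scalar sketch of that formula, assuming only standard C (the function name is illustrative, not an FFmpeg symbol):

/* Scalar reference for the xy2 half-pel averaging implemented with AltiVec
 * above: rnd is 2 for the rounding variants (vctwo) and 1 for the no_rnd
 * variants (vcone). Illustrative only. */
#include <stddef.h>
#include <stdint.h>

static void put_pixels8_xy2_scalar(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h, int rnd)
{
    for (int i = 0; i < h; i++) {
        /* average the 2x2 neighbourhood around each half-pel position */
        for (int x = 0; x < 8; x++)
            block[x] = (pixels[x]             + pixels[x + 1] +
                        pixels[x + line_size] + pixels[x + line_size + 1] +
                        rnd) >> 2;
        block  += line_size;
        pixels += line_size;
    }
}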