
For #1659, #307, add x86 asm for ffmpeg for rtc

This commit is contained in:
winlin 2020-03-22 17:14:07 +08:00
parent 4308f238c0
commit 37c84eccc0
28 changed files with 8441 additions and 50 deletions

@@ -14,4 +14,5 @@ ffbuild/.config
libavutil/lib.version
libavcodec/libavcodec.version
libavutil/libavutil.version
libswresample/libswresample.version
libavutil/ffversion.h

@@ -0,0 +1,60 @@
# subsystems
OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_init_aarch64.o \
aarch64/sbrdsp_init_aarch64.o
OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_init.o
OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_init_aarch64.o
OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_10bpp_aarch64.o \
aarch64/vp9dsp_init_12bpp_aarch64.o \
aarch64/vp9dsp_init_aarch64.o
# ARMv8 optimizations
# subsystems
ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
# NEON optimizations
# subsystems
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
aarch64/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \
aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9itxfm_neon.o \
aarch64/vp9lpf_16bpp_neon.o \
aarch64/vp9lpf_neon.o \
aarch64/vp9mc_16bpp_neon.o \
aarch64/vp9mc_neon.o

@@ -0,0 +1,199 @@
OBJS += x86/constants.o
# subsystems
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
x86/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
x86/mpegvideoencdsp_init.o
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
x86/sbrdsp_init.o
OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
x86/vp9dsp_init_10bpp.o \
x86/vp9dsp_init_12bpp.o \
x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o
# decoders/encoders
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
# subsystems
X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
x86/ac3dsp_downmix.o
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o
X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
x86/h264_deblock_10bit.o \
x86/h264_idct.o \
x86/h264_idct_10bit.o \
x86/h264_weight.o \
x86/h264_weight_10bit.o
X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o \
x86/fpel.o \
x86/qpel.o
X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
x86/fpel.o \
x86/qpel.o
X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
x86/vc1dsp_mc.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \
x86/simple_idct.o
X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
x86/vp8dsp_loopfilter.o
# decoders/encoders
X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
x86/sbrdsp.o
X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
x86/dirac_dwt.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
ifdef CONFIG_GPL
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
endif
X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
x86/hevc_mc.o \
x86/hevc_sao.o \
x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o
X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
x86/vp9itxfm_16bpp.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
x86/vp9mc.o \
x86/vp9mc_16bpp.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o

@@ -0,0 +1,86 @@
;******************************************************************************
;* SIMD optimized AAC encoder DSP functions
;*
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
float_abs_mask: times 4 dd 0x7fffffff
SECTION .text
;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
mova m2, [float_abs_mask]
shl sizeq, 2
add inq, sizeq
add outq, sizeq
neg sizeq
.loop:
andps m0, m2, [inq+sizeq]
sqrtps m1, m0
mulps m0, m1
sqrtps m0, m0
mova [outq+sizeq], m0
add sizeq, mmsize
jl .loop
RET
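
For reference, a scalar C sketch of what this loop computes; the function name is illustrative, not part of the patch. |x|^(3/4) is obtained as sqrt(|x| * sqrt(|x|)), mirroring the two sqrtps and the mulps above:

#include <math.h>

static void abs_pow34_scalar(float *out, const float *in, int size)
{
    for (int i = 0; i < size; i++) {
        float a = fabsf(in[i]);       /* andps with float_abs_mask */
        out[i] = sqrtf(a * sqrtf(a)); /* sqrtps, mulps, sqrtps */
    }
}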
;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
; int size, int is_signed, int maxval, const float Q34,
; const float rounding)
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
%if UNIX64 == 0
movss m0, Q34m
movss m1, roundingm
cvtsi2ss m3, dword maxvalm
%else
cvtsi2ss m3, maxvald
%endif
shufps m0, m0, 0
shufps m1, m1, 0
shufps m3, m3, 0
shl is_signedd, 31
movd m4, is_signedd
shufps m4, m4, 0
shl sized, 2
add inq, sizeq
add outq, sizeq
add scaledq, sizeq
neg sizeq
.loop:
mulps m2, m0, [scaledq+sizeq]
addps m2, m1
minps m2, m3
andps m5, m4, [inq+sizeq]
orps m2, m5
cvttps2dq m2, m2
mova [outq+sizeq], m2
add sizeq, mmsize
jl .loop
RET
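
A scalar sketch of the same quantization, assuming the semantics visible in the assembly (clamp to maxval, truncate like cvttps2dq, fold in the sign of in[i] when is_signed is set); the helper name is illustrative:

static void quantize_bands_scalar(int *out, const float *in, const float *scaled,
                                  int size, int is_signed, int maxval,
                                  float Q34, float rounding)
{
    for (int i = 0; i < size; i++) {
        float qc = scaled[i] * Q34 + rounding;
        if (qc > (float)maxval)
            qc = (float)maxval;       /* minps with broadcast maxval */
        int tmp = (int)qc;            /* cvttps2dq truncates toward zero */
        if (is_signed && in[i] < 0.0f)
            tmp = -tmp;               /* orps with the extracted sign bit */
        out[i] = tmp;
    }
}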

@@ -0,0 +1,487 @@
;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
SECTION .text
;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
shl nd, 3
add srcq, nq
neg nq
align 16
.loop:
movaps m0, [srcq+nq]
movaps m1, [srcq+nq+mmsize]
mulps m0, m0
mulps m1, m1
HADDPS m0, m1, m2
addps m0, [dstq]
movaps [dstq], m0
add dstq, mmsize
add nq, mmsize*2
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3
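
In scalar form this accumulates the squared magnitude of each complex sample (illustrative name):

static void ps_add_squares_scalar(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}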
;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
; float *src1, int n);
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
shl nd, 3
add src1q, nq
add dstq, nq
neg nq
align 16
.loop:
movu m0, [src1q+nq]
movu m1, [src1q+nq+mmsize]
mova m2, [src2q]
mova m3, m2
unpcklps m2, m2
unpckhps m3, m3
mulps m0, m2
mulps m1, m3
mova [dstq+nq], m0
mova [dstq+nq+mmsize], m1
add src2q, mmsize
add nq, mmsize*2
jl .loop
REP_RET
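
The scalar equivalent scales each complex pair in src0 by the matching real gain in src1 (the cglobal line names the arguments src1/src2, while the prototype above calls them src0/src1); illustrative name:

static void ps_mul_pair_single_scalar(float (*dst)[2], float (*src0)[2],
                                      const float *src1, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src0[i][0] * src1[i];
        dst[i][1] = src0[i][1] * src1[i];
    }
}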
;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [h_stepq]
unpcklps m4, m0, m0
unpckhps m0, m0
unpcklps m5, m1, m1
unpckhps m1, m1
shl nd, 3
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m4, m5
addps m0, m1
movddup m2, [lq+nq]
movddup m3, [rq+nq]
mulps m2, m4
mulps m3, m0
addps m2, m3
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
REP_RET
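
A scalar sketch of the interpolation: four mixing gains stepped once per sample, mixing the current (l, r) pair into new (l, r) values, matching the h/h_step handling above (illustrative name):

static void ps_stereo_interpolate_scalar(float (*l)[2], float (*r)[2],
                                         float h[2][4], float h_step[2][4],
                                         int len)
{
    float h0 = h[0][0], h1 = h[0][1], h2 = h[0][2], h3 = h[0][3];
    float hs0 = h_step[0][0], hs1 = h_step[0][1];
    float hs2 = h_step[0][2], hs3 = h_step[0][3];

    for (int n = 0; n < len; n++) {
        float l_re = l[n][0], l_im = l[n][1];
        float r_re = r[n][0], r_im = r[n][1];
        h0 += hs0; h1 += hs1; h2 += hs2; h3 += hs3;  /* addps before use, as above */
        l[n][0] = h0 * l_re + h2 * r_re;
        l[n][1] = h0 * l_im + h2 * r_im;
        r[n][0] = h1 * l_re + h3 * r_re;
        r[n][1] = h1 * l_im + h3 * r_im;
    }
}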
;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [hq+mmsize]
%if ARCH_X86_64
movaps m8, [h_stepq]
movaps m9, [h_stepq+mmsize]
%define H_STEP0 m8
%define H_STEP1 m9
%else
%define H_STEP0 [h_stepq]
%define H_STEP1 [h_stepq+mmsize]
%endif
shl nd, 3
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m0, H_STEP0
addps m1, H_STEP1
movddup m2, [lq+nq]
movddup m3, [rq+nq]
shufps m4, m2, m2, q2301
shufps m5, m3, m3, q2301
unpcklps m6, m0, m0
unpckhps m7, m0, m0
mulps m2, m6
mulps m3, m7
unpcklps m6, m1, m1
unpckhps m7, m1, m1
mulps m4, m6
mulps m5, m7
addps m2, m3
addsubps m2, m4
addsubps m2, m5
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
REP_RET
;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;**********************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
movsxdifnidn iq, id
mov lend, 32 << 3
lea inq, [inq+iq*4]
mov tmpd, id
shl tmpd, 8
add outq, tmpq
mov tmpd, 64
sub tmpd, id
mov id, tmpd
test id, 1
jne .loop4
test id, 2
jne .loop8
align 16
.loop16:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop16:
movaps m0, [in0q]
movaps m1, [in1q]
movaps m2, [in0q+lenq]
movaps m3, [in1q+lenq]
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [outq], m0
movaps [outq+lenq], m1
movaps [outq+lenq*2], m2
movaps [outq+3*32*2*4], m3
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add inq, 16
add outq, 3*32*2*4
sub id, 4
jg .loop16
RET
align 16
.loop8:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop8:
movlps m0, [in0q]
movlps m1, [in1q]
movhps m0, [in0q+lenq]
movhps m1, [in1q+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
movaps [outq], m0
movaps [outq+lenq], m1
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add inq, 8
add outq, lenq
sub id, 2
jg .loop16
RET
align 16
.loop4:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop4:
movss m0, [in0q]
movss m1, [in1q]
movss m2, [in0q+lenq]
movss m3, [in1q+lenq]
movlhps m0, m1
movlhps m2, m3
shufps m0, m2, q2020
movaps [outq], m0
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add inq, 4
sub id, 1
test id, 2
jne .loop8
cmp id, 4
jge .loop16
RET
;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;***********************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
movsxdifnidn iq, id
mov lend, 32 << 3
lea outq, [outq+iq*4]
mov tmpd, id
shl tmpd, 8
add inq, tmpq
mov tmpd, 64
sub tmpd, id
mov id, tmpd
test id, 1
jne .loop4
test id, 2
jne .loop8
align 16
.loop16:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop16:
movaps m0, [inq]
movaps m1, [inq+lenq]
movaps m2, [inq+lenq*2]
movaps m3, [inq+3*32*2*4]
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [out0q], m0
movaps [out1q], m1
movaps [out0q+lenq], m2
movaps [out1q+lenq], m3
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add outq, 16
add inq, 3*32*2*4
sub id, 4
jg .loop16
RET
align 16
.loop8:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop8:
movaps m0, [inq]
movaps m1, [inq+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
MOVH [out0q], m0
MOVH [out1q], m1
movhps [out0q+lenq], m0
movhps [out1q+lenq], m1
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add outq, 8
add inq, lenq
sub id, 2
jg .loop16
RET
align 16
.loop4:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop4:
movaps m0, [inq]
movss [out0q], m0
%if cpuflag(sse4)
extractps [out1q], m0, 1
extractps [out0q+lenq], m0, 2
extractps [out1q+lenq], m0, 3
%else
movhlps m1, m0
movss [out0q+lenq], m1
shufps m0, m0, 0xb1
movss [out1q], m0
movhlps m1, m0
movss [out1q+lenq], m1
%endif
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add outq, 4
sub id, 1
test id, 2
jne .loop8
cmp id, 4
jge .loop16
RET
%endmacro
INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
; ptrdiff_t stride, int n);
;*******************************************************************
%macro PS_HYBRID_ANALYSIS_LOOP 3
movu %1, [inq+mmsize*%3]
movu m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
pshufd %2, %1, q2301
pshufd m4, m1, q0123
pshufd m1, m1, q1032
pshufd m2, [filterq+nq+mmsize*%3], q2301
addsubps %2, m4
addsubps %1, m1
%else
mova m2, [filterq+nq+mmsize*%3]
mova %2, %1
mova m4, m1
shufps %2, %2, q2301
shufps m4, m4, q0123
shufps m1, m1, q1032
shufps m2, m2, q2301
xorps m4, m7
xorps m1, m7
subps %2, m4
subps %1, m1
%endif
mulps %2, m2
mulps %1, m2
%if %3
addps m3, %2
addps m0, %1
%endif
%endmacro
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
shl strideq, 3
shl nd, 6
add filterq, nq
neg nq
mova m7, [ps_p1m1p1m1]
align 16
.loop:
PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
%if cpuflag(sse3)
pshufd m3, m3, q2301
xorps m0, m7
hsubps m3, m0
pshufd m1, m3, q0020
pshufd m3, m3, q0031
addps m1, m3
movsd m2, [inq+6*8]
%else
mova m1, m3
mova m2, m0
shufps m1, m1, q2301
shufps m2, m2, q2301
subps m1, m3
addps m2, m0
unpcklps m3, m1, m2
unpckhps m1, m2
addps m1, m3
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
movss m3, [filterq+nq+8*6]
SPLATD m3
mulps m2, m3
addps m1, m2
MOVH [outq], m1
add outq, strideq
add nq, 64
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS
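
For orientation, a scalar sketch of the hybrid analysis filter being vectorized: a 13-tap complex FIR folded around its middle tap (tap 6, whose imaginary part is zero), which is why the loop above handles inq+6*8 separately. Illustrative name, indexing as in FFmpeg's C template:

#include <stddef.h>

static void ps_hybrid_analysis_scalar(float (*out)[2], float (*in)[2],
                                      const float (*filter)[8][2],
                                      ptrdiff_t stride, int n)
{
    for (int i = 0; i < n; i++) {
        float sum_re = filter[i][6][0] * in[6][0];
        float sum_im = filter[i][6][0] * in[6][1];

        for (int j = 0; j < 6; j++) {
            float in0_re = in[j][0],      in0_im = in[j][1];
            float in1_re = in[12 - j][0], in1_im = in[12 - j][1];

            sum_re += filter[i][j][0] * (in0_re + in1_re) -
                      filter[i][j][1] * (in0_im - in1_im);
            sum_im += filter[i][j][0] * (in0_im + in1_im) +
                      filter[i][j][1] * (in0_re - in1_re);
        }
        out[i * stride][0] = sum_re;
        out[i * stride][1] = sum_im;
    }
}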

@@ -0,0 +1,385 @@
;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "config.asm"
%include "libavutil/x86/x86util.asm"
%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif
SECTION_RODATA 64
const_float_abs_mask: times 8 dd 0x7fffffff
const_align_abs_edge: times 8 dd 0
const_float_0_5: times 8 dd 0.5
const_float_1: times 8 dd 1.0
const_float_sign_mask: times 8 dd 0x80000000
const_int32_offsets:
%rep 8
dd $-const_int32_offsets
%endrep
SECTION .text
;
; Set up a high register to be used
; for holding memory constants
;
; %1 - mov instruction to use
; %2 - the register to be used, assumes it is >= mm8
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32 bit arch or
; "addps m0, m8" on 64 bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
%define mm_%3 %2
%{1} %2, [%3] ; movaps m8, [const_name]
%else
%define mm_%3 [%3]
%endif
%endmacro
;
; Set the Position Independent Code
; base address of a constant
;
; %1 - instruction to use (e.g. lea)
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if text relocations are used
%macro SET_PIC_BASE 3; reg, const_label
%ifdef PIC
%{1} %2, [%3] ; lea r5, [rip+const]
%define pic_base_%3 %2
%else
%define pic_base_%3 %3
%endif
%endmacro
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
pxor m1, m1 ; max_idx
xorps m3, m3 ; p_max
xor r4d, r4d
align 16
%%distortion_search:
movd xm2, dword r4d ; movd zero extends
%ifidn %1,add
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
%if USE_APPROXIMATION == 1
xorps m0, m0
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
%endif
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
%if USE_APPROXIMATION == 1
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
%endif
%else
movaps m5, [tmpY + r4] ; m5 = y[i]
xorps m0, m0 ; m0 = 0;
cmpps m0, m0, m5, 1 ; m0 = (0<y)
subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
subps m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
andps m5, m0 ; (0<y)?m5:0
%endif
%if USE_APPROXIMATION == 1
rsqrtps m4, m4
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
mulps m5, m5
divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
VPBROADCASTD m2, xm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later)
cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
maxps m3, m5 ; m3=max(p_max,p)
; maxps here is faster than blendvps, despite blend having lower latency.
pand m2, m0 ; This version seems faster than sse41 pblendvb
pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4
add r4d, mmsize
cmp r4d, Nd
jb %%distortion_search
por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing
%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)
vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[1x128] : p[0x128]
%endif
; Merge parallel maximums round 4 (2 vs 2)
; m3=p[3210]
movhlps xm5, xm3 ; m5=p[xx32]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
pshufd xm2, xm1, q3232
BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[3,2] : p[1,0]
; Merge parallel maximums final round (1 vs 1)
shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )
pshufd xm2, xm1, q1111
PBLENDVB xm1, xm2, xm0
movd dword r4d, xm1 ; zero extends to the rest of r4q
VBROADCASTSS m3, [tmpX + r4]
%{1}ps m7, m3 ; Sxy += X[max_idx]
VBROADCASTSS m5, [tmpY + r4]
%{1}ps m6, m5 ; Syy += Y[max_idx]
; We have to update a single element in Y[i]
; However writing 4 bytes and then doing 16 byte load in the inner loop
; could cause a stall due to breaking write forwarding.
VPBROADCASTD m1, xm1
pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it
and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load
movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0]
andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
%{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0;
%endmacro
;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal
;
%ifdef PIC
%define num_pic_regs 1
%else
%define num_pic_regs 0
%endif
;
; Pyramid Vector Quantization Search implementation
;
; float * inX - Unaligned (SIMD) access, it will be overread,
; but extra data is masked away.
; int32 * outY - Should be aligned and padded buffer.
; It is used as temp buffer.
; uint32 K - Number of pulses to have after quantizations.
; uint32 N - Number of vector elements. Must be 0 < N < 256
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq
movaps m0, [const_float_abs_mask]
shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
mov r4d, Nd
neg r4d
and r4d, mmsize-1
SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
movups m2, [pic_base_const_align_abs_edge + r4 - mmsize]
add Nd, r4d ; N = align(N, mmsize)
lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
movups m1, [inXq + r4]
andps m1, m2
movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] )
align 16
%%loop_abs_sum:
sub r4d, mmsize
jc %%end_loop_abs_sum
movups m2, [inXq + r4]
andps m2, m0
movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
addps m1, m2 ; Sx += abs(X[i])
jmp %%loop_abs_sum
align 16
%%end_loop_abs_sum:
HSUMPS m1, m2 ; m1 = Sx
xorps m0, m0
comiss xm0, xm1 ;
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
%if USE_APPROXIMATION == 1
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
%else
divss xm0, xm1 ; b = K/Sx
; b = K/max_x
%endif
VBROADCASTSS m0, xm0
lea r4d, [Nd - mmsize]
pxor m5, m5 ; Sy ( Sum of abs( y[i]) )
xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
align 16
%%loop_guess:
movaps m1, [tmpX + r4] ; m1 = X[i]
mulps m2, m0, m1 ; m2 = res*X[i]
cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
paddd m5, m2 ; Sy += yt
cvtdq2ps m2, m2 ; yt = (float)yt
mulps m1, m2 ; m1 = X[i]*yt
movaps [tmpY + r4], m2 ; y[i] = m2
addps m7, m1 ; Sxy += m1;
mulps m2, m2 ; m2 = yt*yt
addps m6, m2 ; Syy += m2
sub r4d, mmsize
jnc %%loop_guess
HSUMPS m6, m1 ; Syy_norm
HADDD m5, m4 ; pulses
movd dword r4d, xm5 ; zero extends to the rest of r4q
sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
jz %%finish ; K - pulses == 0
SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1
SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in distortion parameter calculations.
; Saves pre- and post-calculation to correct Y[] values.
; Same precision, since the float mantissa is normalized.
; The SQRT approximation does differ.
HSUMPS m7, m0 ; Sxy_norm
mulps m6, mm_const_float_0_5
jc %%remove_pulses_loop ; K - pulses < 0
align 16 ; K - pulses > 0
%%add_pulses_loop:
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
sub Kd, 1
jnz %%add_pulses_loop
addps m6, m6 ; Syy*=2
jmp %%finish
align 16
%%remove_pulses_loop:
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
add Kd, 1
jnz %%remove_pulses_loop
addps m6, m6 ; Syy*=2
align 16
%%finish:
lea r4d, [Nd - mmsize]
movaps m2, [const_float_sign_mask]
align 16
%%restore_sign_loop:
movaps m0, [tmpY + r4] ; m0 = Y[i]
movups m1, [inXq + r4] ; m1 = X[i]
andps m1, m2 ; m1 = sign(X[i])
orps m0, m1 ; m0 = Y[i]*sign
cvtps2dq m3, m0 ; m3 = (int)m0
movaps [outYq + r4], m3
sub r4d, mmsize
jnc %%restore_sign_loop
%%return:
%if ARCH_X86_64 == 0 ; sbrdsp
movss r0m, xm6 ; return (float)Syy_norm
fld dword r0m
%else
movaps m0, m6 ; return (float)Syy_norm
%endif
RET
align 16
%%zero_input:
lea r4d, [Nd - mmsize]
xorps m0, m0
%%zero_loop:
movaps [outYq + r4], m0
sub r4d, mmsize
jnc %%zero_loop
movaps m6, [const_float_1]
jmp %%return
%endmacro
; if 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full precision code only about 2% slower.
; Opus also uses the rsqrt approximation in its intrinsics code.
%define USE_APPROXIMATION 1
INIT_XMM sse2
PVQ_FAST_SEARCH _approx
INIT_XMM sse4
PVQ_FAST_SEARCH _approx
%define USE_APPROXIMATION 0
INIT_XMM avx
PVQ_FAST_SEARCH _exact
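
To make the control flow above easier to follow, here is a rough scalar sketch of the same greedy PVQ search: an initial projection of K pulses onto |X|, then one pulse per iteration at the position maximizing Sxy^2/Syy. It is illustrative only: the names are invented, the remove-pulses branch taken when the initial guess overshoots K is analogous and omitted, and the final sign restoration corresponds to %%restore_sign_loop.

#include <math.h>

static float pvq_search_sketch(const float *X, int *y, int K, int N)
{
    float Sx = 0.0f, Sxy = 0.0f, Syy = 0.0f;
    int pulses = 0;

    for (int i = 0; i < N; i++)
        Sx += fabsf(X[i]);
    /* initial guess: distribute K pulses proportionally to |X[i]| */
    for (int i = 0; i < N; i++) {
        y[i] = (int)lrintf(K / Sx * fabsf(X[i]));
        pulses += y[i];
        Sxy += fabsf(X[i]) * y[i];
        Syy += (float)y[i] * y[i];
    }
    while (pulses < K) {                  /* %%add_pulses_loop */
        int best = 0;
        float p_max = -1.0f;
        for (int i = 0; i < N; i++) {     /* %%distortion_search */
            float Sxy_new = Sxy + fabsf(X[i]);
            float Syy_new = Syy + 2.0f * y[i] + 1.0f;
            float p = Sxy_new * Sxy_new / Syy_new;
            if (p > p_max) { p_max = p; best = i; }
        }
        Sxy += fabsf(X[best]);
        Syy += 2.0f * y[best] + 1.0f;
        y[best]++;
        pulses++;
    }
    for (int i = 0; i < N; i++)           /* restore signs from X */
        if (X[i] < 0.0f) y[i] = -y[i];
    return Syy;
}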

File diff suppressed because it is too large.

@@ -0,0 +1,221 @@
;******************************************************************************
;* SIMD optimized non-power-of-two MDCT functions
;*
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
sign_adjust_r: times 4 dd 0x80000000, 0x00000000
sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
SECTION .text
%if ARCH_X86_64
;*****************************************************************************************
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
;*****************************************************************************************
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
VBROADCASTSD m0, [inq + %1] ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
movsd xm1, [inq + 1*16 + 8 + %1] ; in[ 3].re, in[ 3].im, 0, 0
movsd xm4, [inq + 6*16 + 0 + %1] ; in[12].re, in[12].im, 0, 0
movhps xm1, [inq + 3*16 + 0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
movhps xm4, [inq + 4*16 + 8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
subps xm2, xm1, xm4 ; t[2].im, t[2].re, t[3].im, t[3].re
addps xm1, xm4 ; t[0].re, t[0].im, t[1].re, t[1].im
movhlps %2, xm1 ; t[0].re, t[1].re, t[0].im, t[1].im
addps %2, xm1
addps %2, xm0 ; DC[0].re, DC[0].im, junk...
movlhps %2, %2 ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
shufps xm3, xm1, xm2, q0110 ; t[0].re, t[0].im, t[2].re, t[2].im
shufps xm1, xm2, q2332 ; t[1].re, t[1].im, t[3].re, t[3].im
mulps xm%3, xm1, xm5
mulps xm4, xm3, xm6
mulps xm1, xm6
xorps xm1, xm7
mulps xm3, xm5
addsubps xm3, xm1 ; t[0].re, t[0].im, t[2].re, t[2].im
subps xm%3, xm4 ; t[4].re, t[4].im, t[5].re, t[5].im
movhlps xm2, xm%3, xm3 ; t[2].re, t[2].im, t[5].re, t[5].im
movlhps xm3, xm%3 ; t[0].re, t[0].im, t[4].re, t[4].im
xorps xm2, xm7
addps xm%3, xm2, xm3
subps xm3, xm2
shufps xm3, xm3, q1032
vinsertf128 m%3, m%3, xm3, 1 ; All ACs (tmp[1] through to tmp[4])
addps m%3, m%3, m0 ; Finally offset with DCs
%endmacro
%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
mulps xm0, xm9, [exptabq + %1 + 16*0]
mulps xm1, xm10, [exptabq + %1 + 16*1]
haddps xm0, xm1
movhlps xm1, xm0 ; t[0].re, t[1].re, t[0].im, t[1].im
addps xm0, xm1
addps xm0, xm8
movsd [outq], xm0
%endmacro
%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
mulps m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
addps m0, m0, m2
addps m1, m1, m3
addps m0, m0, m11
shufps m1, m1, m1, q2301
addps m0, m0, m1
vextractf128 xm1, m0, 1
movlps [outq + strideq*1], xm0
movhps [outq + strideq*2], xm0
movlps [outq + stride3q], xm1
movhps [outq + strideq*4], xm1
%endmacro
INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
shl strideq, 3
movaps xm5, [exptabq + 480 + 16*0]
movaps xm6, [exptabq + 480 + 16*1]
movaps xm7, [sign_adjust_5]
FFT5 0, xm8, 11
FFT5 8, xm9, 12
FFT5 16, xm10, 13
%define stride3q inq
lea stride3q, [strideq + strideq*2]
lea stride5q, [strideq + strideq*4]
BUTTERFLIES_DC (8*6 + 4*0)*2*4
BUTTERFLIES_AC (8*0 + 0*0)*2*4
add outq, stride5q
BUTTERFLIES_DC (8*6 + 4*1)*2*4
BUTTERFLIES_AC (8*2 + 0*0)*2*4
add outq, stride5q
BUTTERFLIES_DC (8*6 + 4*2)*2*4
BUTTERFLIES_AC (8*4 + 0*0)*2*4
RET
%endif ; ARCH_X86_64
;*******************************************************************************************************
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
;*******************************************************************************************************
%macro LUT_LOAD_4D 3
mov r4d, [lutq + %3q*4 + 0]
movsd xmm%1, [inq + r4q*8]
mov r4d, [lutq + %3q*4 + 4]
movhps xmm%1, [inq + r4q*8]
%if cpuflag(avx2)
mov r4d, [lutq + %3q*4 + 8]
movsd %2, [inq + r4q*8]
mov r4d, [lutq + %3q*4 + 12]
movhps %2, [inq + r4q*8]
vinsertf128 %1, %1, %2, 1
%endif
%endmacro
%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n
xor offset_nq, offset_nq
lea offset_pq, [len8q*2 - %1]
movaps m7, [sign_adjust_r]
%if cpuflag(avx2)
movaps m8, [perm_pos]
movaps m9, [perm_neg]
%endif
.loop:
movups m0, [expq + offset_pq*8] ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
movups m1, [expq + offset_nq*8] ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im
LUT_LOAD_4D m3, xm4, offset_p ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
LUT_LOAD_4D m4, xm5, offset_n ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im
mulps m5, m3, m0 ; in[p].reim * exp[p].reim
mulps m6, m4, m1 ; in[n].reim * exp[n].reim
xorps m5, m7 ; in[p].re *= -1, in[p].im *= 1
xorps m6, m7 ; in[n].re *= -1, in[n].im *= 1
shufps m3, m3, m3, q2301 ; in[p].imre
shufps m4, m4, m4, q2301 ; in[n].imre
mulps m3, m0 ; in[p].imre * exp[p].reim
mulps m4, m1 ; in[n].imre * exp[n].reim
haddps m3, m6 ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
haddps m5, m4 ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im
%if cpuflag(avx2)
vpermps m3, m9, m3 ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
vpermps m5, m8, m5 ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
%else
shufps m3, m3, m3, q0312
shufps m5, m5, m5, q2130
%endif
movups [outq + offset_nq*8], m3
movups [outq + offset_pq*8], m5
sub offset_pq, %1
add offset_nq, %1
cmp offset_nq, offset_pq
jle .loop
REP_RET
%endmacro
INIT_XMM sse3
POSTROTATE_FN 2
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif

@@ -0,0 +1,548 @@
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_mask3 dd 0, 0, 0, 1<<31
ps_noise0 times 2 dd 1.0, 0.0,
ps_noise2 times 2 dd -1.0, 0.0
ps_noise13 dd 0.0, 1.0, 0.0, -1.0
dd 0.0, -1.0, 0.0, 1.0
dd 0.0, 1.0, 0.0, -1.0
cextern sbr_noise_table
cextern ps_neg
SECTION .text
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
mov r2d, r1d
xorps m0, m0
xorps m1, m1
sar r2, 3
jz .prepare
.loop:
movu m2, [r0 + 0]
movu m3, [r0 + 16]
movu m4, [r0 + 32]
movu m5, [r0 + 48]
mulps m2, m2
mulps m3, m3
mulps m4, m4
mulps m5, m5
addps m0, m2
addps m1, m3
addps m0, m4
addps m1, m5
add r0, 64
dec r2
jnz .loop
.prepare:
and r1, 7
sar r1, 1
jz .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
movu m2, [r0]
add r0, 16
mulps m2, m2
dec r1
addps m0, m2
jnz .endloop
.end:
addps m0, m1
movhlps m2, m0
addps m0, m2
movss m1, m0
shufps m0, m0, 1
addss m0, m1
%if ARCH_X86_64 == 0
movss r0m, m0
fld dword r0m
%endif
RET
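
Scalar equivalent: the sum of squared real and imaginary parts over n complex values, returned in st0 on x86-32 and xmm0 on x86-64 (illustrative name):

static float sbr_sum_square_scalar(float (*x)[2], int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
    return sum;
}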
%define STEP 40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
mov r5, r3
and r3, 0xFC
lea r2, [r2 + r3*4]
lea r0, [r0 + r3*8]
neg r3
jz .loop1
.loop4:
movlps m0, [r2 + 4*r3 + 0]
movlps m1, [r2 + 4*r3 + 8]
movlps m2, [r1 + 0*STEP]
movlps m3, [r1 + 2*STEP]
movhps m2, [r1 + 1*STEP]
movhps m3, [r1 + 3*STEP]
unpcklps m0, m0
unpcklps m1, m1
mulps m0, m2
mulps m1, m3
movu [r0 + 8*r3 + 0], m0
movu [r0 + 8*r3 + 16], m1
add r1, 4*STEP
add r3, 4
jnz .loop4
and r5, 3 ; number of single element loops
jz .end
.loop1: ; elements 0 and 1 can be computed at the same time
movss m0, [r2]
movlps m2, [r1]
unpcklps m0, m0
mulps m2, m0
movlps [r0], m2
add r0, 8
add r2, 4
add r1, STEP
dec r5
jnz .loop1
.end:
RET
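
In scalar form, each complex X_high sample in column ixh is scaled by a real gain; STEP above is the 40-element row stride of X_high in bytes. A sketch with an illustrative name (ixh declared int here for simplicity):

static void sbr_hf_g_filt_scalar(float (*Y)[2], const float (*X_high)[40][2],
                                 const float *g_filt, int m_max, int ixh)
{
    for (int m = 0; m < m_max; m++) {
        Y[m][0] = X_high[m][ixh][0] * g_filt[m];
        Y[m][1] = X_high[m][ixh][1] * g_filt[m];
    }
}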
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
; const float alpha0[2], const float alpha1[2],
; float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
movss bw, BWm
%endif
movlps m2, [alpha1q]
movlps m1, [alpha0q]
shufps bw, bw, 0
mulps m2, bw ; (a1[0] a1[1])*bw
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
; start and end 6th and 7th args on stack
mov r2d, Sm
mov r3d, Em
DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
movsxd startq, startd
movsxd endq, endd
%endif
sub startq, endq ; neg num of loops
lea X_highq, [X_highq + endq*2*4]
lea X_lowq, [X_lowq + endq*2*4 - 2*2*4]
shl startq, 3 ; offset from num loops
mova m0, [X_lowq + startq]
shufps m3, m3, q1111
shufps m4, m4, q1111
xorps m3, [ps_mask]
shufps m1, m1, q0000
shufps m2, m2, q0000
xorps m4, [ps_mask]
.loop2:
movu m7, [X_lowq + startq + 8] ; BbCc
mova m6, m0
mova m5, m7
shufps m0, m0, q2301 ; aAbB
shufps m7, m7, q2301 ; bBcC
mulps m0, m4
mulps m7, m3
mulps m6, m2
mulps m5, m1
addps m7, m0
mova m0, [X_lowq + startq + 16] ; CcDd
addps m7, m0
addps m6, m5
addps m7, m6
mova [X_highq + startq], m7
add startq, 16
jnz .loop2
RET
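
The high-frequency generator is a complex second-order linear predictor: each output sample is X_low[i] plus bw-weighted alpha taps applied to the two previous samples. A scalar sketch (illustrative name), with the bw and bw*bw scaling done once up front as in the register setup above:

static void sbr_hf_gen_scalar(float (*X_high)[2], const float (*X_low)[2],
                              const float alpha0[2], const float alpha1[2],
                              float bw, int start, int end)
{
    float a0 = alpha0[0] * bw,      a1 = alpha0[1] * bw;
    float b0 = alpha1[0] * bw * bw, b1 = alpha1[1] * bw * bw;

    for (int i = start; i < end; i++) {
        X_high[i][0] = X_low[i - 2][0] * b0 - X_low[i - 2][1] * b1 +
                       X_low[i - 1][0] * a0 - X_low[i - 1][1] * a1 +
                       X_low[i][0];
        X_high[i][1] = X_low[i - 2][1] * b0 + X_low[i - 2][0] * b1 +
                       X_low[i - 1][1] * a0 + X_low[i - 1][0] * a1 +
                       X_low[i][1];
    }
}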
cglobal sbr_sum64x5, 1,2,4,z
lea r1q, [zq+ 256]
.loop:
mova m0, [zq+ 0]
mova m2, [zq+ 16]
mova m1, [zq+ 256]
mova m3, [zq+ 272]
addps m0, [zq+ 512]
addps m2, [zq+ 528]
addps m1, [zq+ 768]
addps m3, [zq+ 784]
addps m0, [zq+1024]
addps m2, [zq+1040]
addps m0, m1
addps m2, m3
mova [zq], m0
mova [zq+16], m2
add zq, 32
cmp zq, r1q
jne .loop
REP_RET
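
Scalar equivalent (the byte offsets 256/512/768/1024 above are 64/128/192/256 floats):

static void sbr_sum64x5_scalar(float *z)
{
    for (int k = 0; k < 64; k++)
        z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
}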
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
lea r2q, [zq + (64-4)*4]
mova m3, [ps_neg]
.loop:
mova m1, [zq]
xorps m0, m3, [r2q]
shufps m0, m0, m0, q0123
unpcklps m2, m0, m1
unpckhps m0, m0, m1
mova [Wq + 0], m2
mova [Wq + 16], m0
add Wq, 32
sub r2q, 16
add zq, 16
cmp zq, r2q
jl .loop
REP_RET
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
lea r1q, [zq+256]
.loop:
mova m0, [zq+ 0]
mova m1, [zq+16]
mova m2, [zq+32]
mova m3, [zq+48]
xorps m0, [ps_mask2]
xorps m1, [ps_mask2]
xorps m2, [ps_mask2]
xorps m3, [ps_mask2]
mova [zq+ 0], m0
mova [zq+16], m1
mova [zq+32], m2
mova [zq+48], m3
add zq, 64
cmp zq, r1q
jne .loop
REP_RET
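
ps_mask2 is {0, 1<<31} per pair, so the XORs flip the sign bit of every odd-indexed float, i.e. the imaginary parts:

static void sbr_neg_odd_64_scalar(float *x)
{
    for (int i = 1; i < 64; i += 2)
        x[i] = -x[i];
}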
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
mova m4, [src0q+cq+mmsize]
mova m5, [src1q+mmsize]
%if cpuflag(sse2)
pshufd m2, m0, q0123
pshufd m3, m1, q0123
pshufd m6, m4, q0123
pshufd m7, m5, q0123
%else
shufps m2, m0, m0, q0123
shufps m3, m1, m1, q0123
shufps m6, m4, m4, q0123
shufps m7, m5, m5, q0123
%endif
addps m5, m2
subps m0, m7
addps m1, m6
subps m4, m3
mova [vrevq], m1
mova [vrevq+mmsize], m5
mova [vq+cq], m0
mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
mov r3q, OFFSET
lea r1q, [zq + (32+1)*4]
lea r2q, [zq + 64*4]
mova m5, [ps_neg]
.loop:
movu m0, [r1q]
movu m2, [r1q + mmsize]
movu m1, [zq + r3q + 4 + mmsize]
movu m3, [zq + r3q + 4]
pxor m2, m5
pxor m0, m5
pshufd m2, m2, q0123
pshufd m0, m0, q0123
SBUTTERFLY dq, 2, 3, 4
SBUTTERFLY dq, 0, 1, 4
mova [r2q + 2*r3q + 0*mmsize], m2
mova [r2q + 2*r3q + 1*mmsize], m3
mova [r2q + 2*r3q + 2*mmsize], m0
mova [r2q + 2*r3q + 3*mmsize], m1
add r1q, 2*mmsize
sub r3q, 2*mmsize
jge .loop
movq m2, [zq]
movq [r2q], m2
REP_RET
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
%macro LOAD_NST 1
%ifdef PIC
lea NOISE_TABLE, [%1]
mova m0, [kxq + NOISE_TABLE]
%else
mova m0, [kxq + %1]
%endif
%endmacro
INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise0]
jmp apply_noise_main
; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13
jmp apply_noise_main
; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise2]
jmp apply_noise_main
; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13+16
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
mov kxd, m_maxm
DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
movsxdifnidn noiseq, noised
dec noiseq
shl countd, 2
%ifdef PIC
lea NOISE_TABLE, [sbr_noise_table]
%endif
lea Yq, [Yq + 2*countq]
add s_mq, countq
add q_filtq, countq
shl noiseq, 3
pxor m5, m5
neg countq
.loop:
mova m1, [q_filtq + countq]
movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
add noiseq, 2*mmsize
and noiseq, 0x1ff<<3
punpckhdq m2, m1, m1
punpckldq m1, m1
mulps m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
mova m3, [s_mq + countq]
; TODO: replace by a vpermd in AVX2
punpckhdq m4, m3, m3
punpckldq m3, m3
pcmpeqd m6, m3, m5 ; m6 == 0
pcmpeqd m7, m4, m5 ; m7 == 0
mulps m3, m0 ; s_m[m] * phi_sign
mulps m4, m0 ; s_m[m] * phi_sign
pand m1, m6
pand m2, m7
movu m6, [Yq + 2*countq]
movu m7, [Yq + 2*countq + mmsize]
addps m3, m1
addps m4, m2
addps m6, m3
addps m7, m4
movu [Yq + 2*countq], m6
movu [Yq + 2*countq + mmsize], m7
add countq, mmsize
jl .loop
RET
INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT 32*4
%define OFFSET 32*4
mov cq, -COUNT
lea vrevq, [vq + OFFSET + COUNT]
add vq, OFFSET-mmsize
add srcq, 2*COUNT
mova m3, [ps_neg]
.loop:
mova m0, [srcq + 2*cq + 0*mmsize]
mova m1, [srcq + 2*cq + 1*mmsize]
shufps m2, m0, m1, q2020
shufps m1, m0, q1313
xorps m2, m3
mova [vq], m1
mova [vrevq + cq], m2
sub vq, mmsize
add cq, mmsize
jl .loop
REP_RET
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
mov cntq, 37*8
add xq, cntq
neg cntq
%if cpuflag(sse3)
%define MOVH movsd
movddup m5, [xq+cntq]
%else
%define MOVH movlps
movlps m5, [xq+cntq]
movlhps m5, m5
%endif
MOVH m7, [xq+cntq+8 ]
MOVH m1, [xq+cntq+16]
shufps m7, m7, q0110
shufps m1, m1, q0110
mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
movaps [rsp ], m3
movaps [rsp+16], m4
add cntq, 8
MOVH m2, [xq+cntq+16]
movlhps m7, m7
shufps m2, m2, q0110
mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
mulps m4, m7, m2
mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
align 16
.loop:
add cntq, 8
MOVH m0, [xq+cntq+16]
movlhps m1, m1
shufps m0, m0, q0110
mulps m3, m1, m2
mulps m4, m1, m0
mulps m1, m1
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
add cntq, 8
MOVH m1, [xq+cntq+16]
movlhps m2, m2
shufps m1, m1, q0110
mulps m3, m2, m0
mulps m4, m2, m1
mulps m2, m2
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
add cntq, 8
MOVH m2, [xq+cntq+16]
movlhps m0, m0
shufps m2, m2, q0110
mulps m3, m0, m1
mulps m4, m0, m2
mulps m0, m0
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
jl .loop
movlhps m1, m1
mulps m2, m1
mulps m1, m1
addps m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
xorps m2, [ps_mask3]
xorps m5, [ps_mask3]
xorps m6, [ps_mask3]
HADDPS m2, m5, m3
HADDPS m7, m6, m4
%if cpuflag(sse3)
movshdup m0, m1
%else
movss m0, m1
shufps m1, m1, q0001
%endif
addss m1, m0
movaps [phiq ], m2
movhps [phiq+0x18], m7
movss [phiq+0x28], m7
movss [phiq+0x10], m1
RET
%endmacro
INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE

@@ -0,0 +1,4 @@
OBJS += aarch64/cpu.o \
        aarch64/float_dsp_init.o

NEON-OBJS += aarch64/float_dsp_neon.o

@@ -0,0 +1,8 @@
OBJS += arm/cpu.o \
        arm/float_dsp_init_arm.o

VFP-OBJS += arm/float_dsp_init_vfp.o \
            arm/float_dsp_vfp.o

NEON-OBJS += arm/float_dsp_init_neon.o \
             arm/float_dsp_neon.o

@@ -1,5 +0,0 @@
/* Automatically generated by version.sh, do not manually edit! */
#ifndef AVUTIL_FFVERSION_H
#define AVUTIL_FFVERSION_H
#define FFMPEG_VERSION ""
#endif /* AVUTIL_FFVERSION_H */

@@ -0,0 +1,18 @@
OBJS += x86/cpu.o \
        x86/fixed_dsp_init.o \
        x86/float_dsp_init.o \
        x86/imgutils_init.o \
        x86/lls_init.o

OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o

EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o

X86ASM-OBJS += x86/cpuid.o \
               $(EMMS_OBJS__yes_) \
               x86/fixed_dsp.o \
               x86/float_dsp.o \
               x86/imgutils.o \
               x86/lls.o

X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o

@@ -0,0 +1,91 @@
;*****************************************************************************
;* Copyright (C) 2005-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_cpuid, 5,7
push rbx
push r4
push r3
push r2
push r1
mov eax, r0d
xor ecx, ecx
cpuid
pop r4
mov [r4], eax
pop r4
mov [r4], ebx
pop r4
mov [r4], ecx
pop r4
mov [r4], edx
pop rbx
RET
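
Hypothetical usage from C, matching the prototype in the comment above (CPUID is executed with EAX = index and ECX = 0, and the four result registers are stored through the pointers):

void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx);

static int cpu_has_sse2(void)
{
    int eax, ebx, ecx, edx;
    ff_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); /* leaf 1: feature flags */
    return (edx >> 26) & 1;                  /* EDX bit 26 = SSE2 */
}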
;-----------------------------------------------------------------------------
; void ff_cpu_xgetbv(int op, int *eax, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
push r2
push r1
mov ecx, r0d
xgetbv
pop r4
mov [r4], eax
pop r4
mov [r4], edx
RET
%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; int ff_cpu_cpuid_test(void)
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal cpu_cpuid_test
pushfd
push ebx
push ebp
push esi
push edi
pushfd
pop eax
mov ebx, eax
xor eax, 0x200000
push eax
popfd
pushfd
pop eax
xor eax, ebx
pop edi
pop esi
pop ebp
pop ebx
popfd
ret
%endif

@@ -0,0 +1,48 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void ff_butterflies_fixed(int *src0, int *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal butterflies_fixed, 3,3,3, src0, src1, len
shl lend, 2
add src0q, lenq
add src1q, lenq
neg lenq
align 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src1q + lenq]
mova m2, m0
paddd m0, m1
psubd m2, m1
mova [src0q + lenq], m0
mova [src1q + lenq], m2
add lenq, mmsize
jl .loop
RET
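
As a reading aid, a scalar C sketch of what the SSE2 loop above computes. The _ref name is ours, and unsigned arithmetic is used so overflow wraps the same way paddd/psubd do, without signed-overflow undefined behavior:

static void butterflies_fixed_ref(int *src0, int *src1, int len)
{
    for (int i = 0; i < len; i++) {
        unsigned a = (unsigned)src0[i];   /* wraparound semantics, like paddd */
        unsigned b = (unsigned)src1[i];
        src0[i] = (int)(a + b);
        src1[i] = (int)(a - b);
    }
}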


@ -0,0 +1,484 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
SECTION .text
;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
lea lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
mova m0, [src0q + lenq + (a+0)*mmsize]
mova m1, [src0q + lenq + (a+1)*mmsize]
mulps m0, m0, [src1q + lenq + (a+0)*mmsize]
mulps m1, m1, [src1q + lenq + (a+1)*mmsize]
mova [dstq + lenq + (a+0)*mmsize], m0
mova [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep
sub lenq, 64
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif
;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
lea lend, [lenq*8 - mmsize*4]
ALIGN 16
.loop:
movaps m0, [src0q + lenq + 0*mmsize]
movaps m1, [src0q + lenq + 1*mmsize]
movaps m2, [src0q + lenq + 2*mmsize]
movaps m3, [src0q + lenq + 3*mmsize]
mulpd m0, m0, [src1q + lenq + 0*mmsize]
mulpd m1, m1, [src1q + lenq + 1*mmsize]
mulpd m2, m2, [src1q + lenq + 2*mmsize]
mulpd m3, m3, [src1q + lenq + 3*mmsize]
movaps [dstq + lenq + 0*mmsize], m0
movaps [dstq + lenq + 1*mmsize], m1
movaps [dstq + lenq + 2*mmsize], m2
movaps [dstq + lenq + 3*mmsize], m3
sub lenq, mmsize*4
jge .loop
RET
%endmacro
INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSS m0, mulm
%else
%if WIN64
SWAP 0, 2
%endif
shufps xm0, xm0, 0
%if cpuflag(avx)
vinsertf128 m0, m0, xm0, 1
%endif
%endif
lea lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
mova m1, [dstq+lenq]
mova m2, [dstq+lenq+1*mmsize]
fmaddps m1, m0, [srcq+lenq], m1
fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
mulps m1, m0, [srcq+lenq]
mulps m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
mulps m3, m0, [srcq+lenq+2*mmsize]
mulps m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
addps m1, m1, [dstq+lenq]
addps m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
addps m3, m3, [dstq+lenq+2*mmsize]
addps m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
mova [dstq+lenq], m1
mova [dstq+lenq+1*mmsize], m2
%if mmsize < 32
mova [dstq+lenq+2*mmsize], m3
mova [dstq+lenq+3*mmsize], m4
%endif ; mmsize
sub lenq, 64
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
movss m0, mulm
%elif WIN64
SWAP 0, 2
%endif
shufps m0, m0, 0
lea lenq, [lend*4-mmsize]
.loop:
mova m1, [srcq+lenq]
mulps m1, m0
mova [dstq+lenq], m1
sub lenq, mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMUL_SCALAR
;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
; int len)
;------------------------------------------------------------------------------
%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
mov lenq, lenaddrm
VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
SWAP 0, 2
%endif
movlhps xm0, xm0
%if cpuflag(avx)
vinsertf128 m0, m0, xm0, 1
%endif
%endif
lea lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
movaps m1, [dstq+lenq]
movaps m2, [dstq+lenq+1*mmsize]
movaps m3, [dstq+lenq+2*mmsize]
movaps m4, [dstq+lenq+3*mmsize]
fmaddpd m1, m0, [srcq+lenq], m1
fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2
fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3
fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
mulpd m1, m0, [srcq+lenq]
mulpd m2, m0, [srcq+lenq+1*mmsize]
mulpd m3, m0, [srcq+lenq+2*mmsize]
mulpd m4, m0, [srcq+lenq+3*mmsize]
addpd m1, m1, [dstq+lenq]
addpd m2, m2, [dstq+lenq+1*mmsize]
addpd m3, m3, [dstq+lenq+2*mmsize]
addpd m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
movaps [dstq+lenq], m1
movaps [dstq+lenq+1*mmsize], m2
movaps [dstq+lenq+2*mmsize], m3
movaps [dstq+lenq+3*mmsize], m4
sub lenq, mmsize*4
jge .loop
REP_RET
%endmacro
INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif
;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
; int len)
;------------------------------------------------------------------------------
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
mov lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSD m0, mulm
%else
%if WIN64
SWAP 0, 2
%endif
movlhps xm0, xm0
%if cpuflag(avx)
vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
lea lenq, [lend*8-2*mmsize]
.loop:
mulpd m1, m0, [srcq+lenq ]
mulpd m2, m0, [srcq+lenq+mmsize]
movaps [dstq+lenq ], m1
movaps [dstq+lenq+mmsize], m2
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif
;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
; const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
shl lend, 2
lea len1q, [lenq - mmsize]
add src0q, lenq
add dstq, lenq
add winq, lenq
neg lenq
.loop:
mova m0, [winq + lenq]
mova m4, [src0q + lenq]
%if cpuflag(sse)
mova m1, [winq + len1q]
mova m5, [src1q + len1q]
shufps m1, m1, 0x1b
shufps m5, m5, 0x1b
mova m2, m0
mova m3, m1
mulps m2, m4
mulps m3, m5
mulps m1, m4
mulps m0, m5
addps m2, m3
subps m1, m0
shufps m2, m2, 0x1b
%else
pswapd m1, [winq + len1q]
pswapd m5, [src1q + len1q]
mova m2, m0
mova m3, m1
pfmul m2, m4
pfmul m3, m5
pfmul m1, m4
pfmul m0, m5
pfadd m2, m3
pfsub m1, m0
pswapd m2, m2
%endif
mova [dstq + lenq], m1
mova [dstq + len1q], m2
sub len1q, mmsize
add lenq, mmsize
jl .loop
%if mmsize == 8
femms
%endif
REP_RET
%endmacro
INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
; const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
mova m2, [src2q + lenq]
mova m3, [src2q + lenq + mmsize]
fmaddps m0, m0, [src1q + lenq], m2
fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
addps m0, m0, [src2q + lenq]
addps m1, m1, [src2q + lenq + mmsize]
%endif
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
; int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
movaps m2, [pd_reverse]
%endif
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
vpermps m0, m2, [src1q]
vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
vmovaps xmm0, [src1q + 16]
vinsertf128 m0, m0, [src1q], 1
vshufps m0, m0, m0, q0123
vmovaps xmm1, [src1q + mmsize + 16]
vinsertf128 m1, m1, [src1q + mmsize], 1
vshufps m1, m1, m1, q0123
%else
mova m0, [src1q]
mova m1, [src1q + mmsize]
shufps m0, m0, q0123
shufps m1, m1, q0123
%endif
mulps m0, m0, [src0q + lenq + mmsize]
mulps m1, m1, [src0q + lenq]
movaps [dstq + lenq + mmsize], m0
movaps [dstq + lenq], m1
add src1q, 2*mmsize
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
shl offsetd, 2
add v1q, offsetq
add v2q, offsetq
neg offsetq
xorps xmm0, xmm0
.loop:
movaps xmm1, [v1q+offsetq]
mulps xmm1, [v2q+offsetq]
addps xmm0, xmm1
add offsetq, 16
js .loop
movhlps xmm1, xmm0
addps xmm0, xmm1
movss xmm1, xmm0
shufps xmm0, xmm0, 1
addss xmm0, xmm1
%if ARCH_X86_64 == 0
movss r0m, xmm0
fld dword r0m
%endif
RET
;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
shl lend, 2
add src0q, lenq
add src1q, lenq
neg lenq
.loop:
mova m0, [src0q + lenq]
mova m1, [src1q + lenq]
subps m2, m0, m1
addps m0, m0, m1
mova [src1q + lenq], m2
mova [src0q + lenq], m0
add lenq, mmsize
jl .loop
REP_RET
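
To summarize the file, scalar C sketches of two representative kernels above (the _ref names are ours). Like the asm, they assume len is a multiple of the vector width, so no tail handling is shown; also note the asm keeps several partial sums and reduces at the end, so floating-point rounding can differ slightly from these left-to-right loops:

static void vector_fmul_ref(float *dst, const float *src0,
                            const float *src1, int len)
{
    /* element-wise product, the job of the VECTOR_FMUL loop */
    for (int i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i];
}

static float scalarproduct_float_ref(const float *v1, const float *v2, int len)
{
    /* dot product, the job of scalarproduct_float */
    float sum = 0.0f;
    for (int i = 0; i < len; i++)
        sum += v1[i] * v2[i];
    return sum;
}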


@ -0,0 +1,53 @@
;*****************************************************************************
;* Copyright 2016 Anton Khirnov
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
INIT_XMM sse4
cglobal image_copy_plane_uc_from, 6, 7, 4, dst, dst_linesize, src, src_linesize, bw, height, rowpos
add dstq, bwq
add srcq, bwq
neg bwq
.row_start:
mov rowposq, bwq
.loop:
movntdqa m0, [srcq + rowposq + 0 * mmsize]
movntdqa m1, [srcq + rowposq + 1 * mmsize]
movntdqa m2, [srcq + rowposq + 2 * mmsize]
movntdqa m3, [srcq + rowposq + 3 * mmsize]
mova [dstq + rowposq + 0 * mmsize], m0
mova [dstq + rowposq + 1 * mmsize], m1
mova [dstq + rowposq + 2 * mmsize], m2
mova [dstq + rowposq + 3 * mmsize], m3
add rowposq, 4 * mmsize
jnz .loop
add srcq, src_linesizeq
add dstq, dst_linesizeq
dec heightd
jnz .row_start
RET
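
Functionally, the routine above is a strided plane copy; the point of movntdqa is fast streaming reads from uncacheable (e.g. GPU-mapped) memory. A plain-C equivalent with our own naming and none of the alignment requirements would be:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void copy_plane_ref(uint8_t *dst, ptrdiff_t dst_linesize,
                           const uint8_t *src, ptrdiff_t src_linesize,
                           ptrdiff_t bytewidth, int height)
{
    for (int y = 0; y < height; y++) {
        memcpy(dst, src, bytewidth);   /* one row of bytewidth bytes */
        dst += dst_linesize;
        src += src_linesize;
    }
}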


@ -0,0 +1,290 @@
;******************************************************************************
;* linear least squares model
;*
;* Copyright (c) 2013 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION .text
%define MAX_VARS 32
%define MAX_VARS_ALIGN (MAX_VARS+4)
%define COVAR_STRIDE MAX_VARS_ALIGN*8
%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]
struc LLSModel
.covariance: resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
.coeff: resq MAX_VARS*MAX_VARS
.variance: resq MAX_VARS
.indep_count: resd 1
endstruc
%macro ADDPD_MEM 2
%if cpuflag(avx)
vaddpd %2, %2, %1
%else
addpd %2, %1
%endif
mova %1, %2
%endmacro
INIT_XMM sse2
%define movdqa movaps
cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
%define covarq ctxq
mov id, [ctxq + LLSModel.indep_count]
lea varq, [varq + iq*8]
neg iq
mov covar2q, covarq
.loopi:
; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
mova m1, [varq + iq*8]
mova m3, [varq + iq*8 + 16]
pshufd m4, m1, q1010
pshufd m5, m1, q3232
pshufd m6, m3, q1010
pshufd m7, m3, q3232
mulpd m0, m1, m4
mulpd m1, m1, m5
lea covarq, [covar2q + 16]
ADDPD_MEM COVAR(-2,0), m0
ADDPD_MEM COVAR(-2,1), m1
lea jq, [iq + 2]
cmp jd, -2
jg .skip4x4
.loop4x4:
; Compute all 16 pairwise products of a 4x4 block
mulpd m0, m4, m3
mulpd m1, m5, m3
mulpd m2, m6, m3
mulpd m3, m3, m7
ADDPD_MEM COVAR(0,0), m0
ADDPD_MEM COVAR(0,1), m1
ADDPD_MEM COVAR(0,2), m2
ADDPD_MEM COVAR(0,3), m3
mova m3, [varq + jq*8 + 16]
mulpd m0, m4, m3
mulpd m1, m5, m3
mulpd m2, m6, m3
mulpd m3, m3, m7
ADDPD_MEM COVAR(2,0), m0
ADDPD_MEM COVAR(2,1), m1
ADDPD_MEM COVAR(2,2), m2
ADDPD_MEM COVAR(2,3), m3
mova m3, [varq + jq*8 + 32]
add covarq, 32
add jq, 4
cmp jd, -2
jle .loop4x4
.skip4x4:
test jd, jd
jg .skip2x4
mulpd m4, m3
mulpd m5, m3
mulpd m6, m3
mulpd m7, m3
ADDPD_MEM COVAR(0,0), m4
ADDPD_MEM COVAR(0,1), m5
ADDPD_MEM COVAR(0,2), m6
ADDPD_MEM COVAR(0,3), m7
.skip2x4:
add iq, 4
add covar2q, 4*COVAR_STRIDE+32
cmp id, -2
jle .loopi
test id, id
jg .ret
mov jq, iq
%define covarq covar2q
.loop2x1:
movsd m0, [varq + iq*8]
movlhps m0, m0
mulpd m0, [varq + jq*8]
ADDPD_MEM COVAR(0,0), m0
inc iq
add covarq, COVAR_STRIDE
test id, id
jle .loop2x1
.ret:
REP_RET
%macro UPDATE_LLS 0
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
%define covarq ctxq
mov countd, [ctxq + LLSModel.indep_count]
lea count2d, [countq-2]
xor id, id
.loopi:
; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
mova ymm1, [varq + iq*8]
vbroadcastsd ymm4, [varq + iq*8]
vbroadcastsd ymm5, [varq + iq*8 + 8]
vbroadcastsd ymm6, [varq + iq*8 + 16]
vbroadcastsd ymm7, [varq + iq*8 + 24]
vextractf128 xmm3, ymm1, 1
%if cpuflag(fma3)
mova ymm0, COVAR(iq ,0)
mova xmm2, COVAR(iq+2,2)
fmaddpd ymm0, ymm1, ymm4, ymm0
fmaddpd xmm2, xmm3, xmm6, xmm2
fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
mova COVAR(iq ,0), ymm0
mova COVAR(iq ,1), ymm1
mova COVAR(iq+2,2), xmm2
mova COVAR(iq+2,3), xmm3
%else
vmulpd ymm0, ymm1, ymm4
vmulpd ymm1, ymm1, ymm5
vmulpd xmm2, xmm3, xmm6
vmulpd xmm3, xmm3, xmm7
ADDPD_MEM COVAR(iq ,0), ymm0
ADDPD_MEM COVAR(iq ,1), ymm1
ADDPD_MEM COVAR(iq+2,2), xmm2
ADDPD_MEM COVAR(iq+2,3), xmm3
%endif ; cpuflag(fma3)
lea jd, [iq + 4]
cmp jd, count2d
jg .skip4x4
.loop4x4:
; Compute all 16 pairwise products of a 4x4 block
mova ymm3, [varq + jq*8]
%if cpuflag(fma3)
mova ymm0, COVAR(jq, 0)
mova ymm1, COVAR(jq, 1)
mova ymm2, COVAR(jq, 2)
fmaddpd ymm0, ymm3, ymm4, ymm0
fmaddpd ymm1, ymm3, ymm5, ymm1
fmaddpd ymm2, ymm3, ymm6, ymm2
fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
mova COVAR(jq, 0), ymm0
mova COVAR(jq, 1), ymm1
mova COVAR(jq, 2), ymm2
mova COVAR(jq, 3), ymm3
%else
vmulpd ymm0, ymm3, ymm4
vmulpd ymm1, ymm3, ymm5
vmulpd ymm2, ymm3, ymm6
vmulpd ymm3, ymm3, ymm7
ADDPD_MEM COVAR(jq,0), ymm0
ADDPD_MEM COVAR(jq,1), ymm1
ADDPD_MEM COVAR(jq,2), ymm2
ADDPD_MEM COVAR(jq,3), ymm3
%endif ; cpuflag(fma3)
add jd, 4
cmp jd, count2d
jle .loop4x4
.skip4x4:
cmp jd, countd
jg .skip2x4
mova xmm3, [varq + jq*8]
%if cpuflag(fma3)
mova xmm0, COVAR(jq, 0)
mova xmm1, COVAR(jq, 1)
mova xmm2, COVAR(jq, 2)
fmaddpd xmm0, xmm3, xmm4, xmm0
fmaddpd xmm1, xmm3, xmm5, xmm1
fmaddpd xmm2, xmm3, xmm6, xmm2
fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
mova COVAR(jq, 0), xmm0
mova COVAR(jq, 1), xmm1
mova COVAR(jq, 2), xmm2
mova COVAR(jq, 3), xmm3
%else
vmulpd xmm0, xmm3, xmm4
vmulpd xmm1, xmm3, xmm5
vmulpd xmm2, xmm3, xmm6
vmulpd xmm3, xmm3, xmm7
ADDPD_MEM COVAR(jq,0), xmm0
ADDPD_MEM COVAR(jq,1), xmm1
ADDPD_MEM COVAR(jq,2), xmm2
ADDPD_MEM COVAR(jq,3), xmm3
%endif ; cpuflag(fma3)
.skip2x4:
add id, 4
add covarq, 4*COVAR_STRIDE
cmp id, count2d
jle .loopi
cmp id, countd
jg .ret
mov jd, id
.loop2x1:
vmovddup xmm0, [varq + iq*8]
%if cpuflag(fma3)
mova xmm1, [varq + jq*8]
fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
mova COVAR(jq,0), xmm0
%else
vmulpd xmm0, [varq + jq*8]
ADDPD_MEM COVAR(jq,0), xmm0
%endif ; cpuflag(fma3)
inc id
add covarq, COVAR_STRIDE
cmp id, countd
jle .loop2x1
.ret:
REP_RET
%endmacro ; UPDATE_LLS
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
UPDATE_LLS
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
UPDATE_LLS
%endif
INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
; This function is often called on the same buffer as update_lls, but with
; an offset, so the two cannot both be aligned.
; Load in halves rather than with movu to avoid store-forwarding stalls, since
; the input was initialized immediately before this call using scalar math.
%define coefsq ctxq
mov id, orderd
imul orderd, MAX_VARS
lea coefsq, [ctxq + LLSModel.coeff + orderq*8]
movsd m0, [varq]
movhpd m0, [varq + 8]
mulpd m0, [coefsq]
lea coefsq, [coefsq + iq*8]
lea varq, [varq + iq*8]
neg iq
add iq, 2
.loop:
movsd m1, [varq + iq*8]
movhpd m1, [varq + iq*8 + 8]
mulpd m1, [coefsq + iq*8]
addpd m0, m1
add iq, 2
jl .loop
jg .skip1
movsd m1, [varq + iq*8]
mulsd m1, [coefsq + iq*8]
addpd m0, m1
.skip1:
movhlps m1, m0
addsd m0, m1
%if ARCH_X86_32
movsd r0m, m0
fld qword r0m
%endif
RET
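
For reference, a scalar C sketch of evaluate_lls. The struct mirrors the struc at the top of this file and evaluate_lls_ref is our illustrative name; it computes the model prediction as a dot product of the input variables with the coefficients for the requested order:

#define MAX_VARS       32
#define MAX_VARS_ALIGN (MAX_VARS + 4)

typedef struct LLSModel {
    double covariance[MAX_VARS_ALIGN][MAX_VARS_ALIGN];
    double coeff[MAX_VARS][MAX_VARS];
    double variance[MAX_VARS];
    int    indep_count;
} LLSModel;

static double evaluate_lls_ref(const LLSModel *m, const double *var, int order)
{
    double out = 0.0;
    for (int i = 0; i <= order; i++)   /* order is inclusive, as in the asm */
        out += var[i] * m->coeff[order][i];
    return out;
}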

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,7 @@
OBJS += aarch64/audio_convert_init.o \
aarch64/resample_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
NEON-OBJS += aarch64/audio_convert_neon.o \
aarch64/resample.o


@ -0,0 +1,8 @@
OBJS += arm/audio_convert_init.o \
arm/resample_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
NEON-OBJS += arm/audio_convert_neon.o \
arm/resample.o


@ -0,0 +1,9 @@
X86ASM-OBJS += x86/audio_convert.o\
x86/rematrix.o\
x86/resample.o\
OBJS += x86/audio_convert_init.o\
x86/rematrix_init.o\
x86/resample_init.o\
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o


@ -0,0 +1,739 @@
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
flt2pm31: times 8 dd 4.6566129e-10
flt2p31 : times 8 dd 2147483648.0
flt2p15 : times 8 dd 32768.0
word_unpack_shuf : db 0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15
SECTION .text
; to, from, a/u, log2_outsize, log2_insize, conversion op, init op
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
mov src2q , [srcq+gprsize]
mov srcq , [srcq]
mov dstq , [dstq]
%ifidn %3, a
test dstq, mmsize-1
jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
test src2q, mmsize-1
jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
lea srcq , [srcq + (1<<%5)*lenq]
lea src2q, [src2q + (1<<%5)*lenq]
lea dstq , [dstq + (2<<%4)*lenq]
neg lenq
%7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
mov%3 m0, [ srcq +(1<<%5)*lenq]
mova m1, m0
mov%3 m2, [ src2q+(1<<%5)*lenq]
%if %5 == 1
punpcklwd m0, m2
punpckhwd m1, m2
%else
punpckldq m0, m2
punpckhdq m1, m2
%endif
%6 m0,m1,m2,m3,m4,m5
%else
mov%3 m0, [ srcq +(1<<%5)*lenq]
mov%3 m1, [mmsize + srcq +(1<<%5)*lenq]
mov%3 m2, [ src2q+(1<<%5)*lenq]
mov%3 m3, [mmsize + src2q+(1<<%5)*lenq]
%6 m0,m1,m2,m3,m4,m5
mova m2, m0
punpcklwd m0, m1
punpckhwd m2, m1
SWAP 1,2
%endif
mov%3 [ dstq+(2<<%4)*lenq], m0
mov%3 [ mmsize + dstq+(2<<%4)*lenq], m1
%if %4 > %5
mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
add lenq, 4*mmsize/(2<<%4)
%else
add lenq, 2*mmsize/(2<<%4)
%endif
jl .next
REP_RET
%endmacro
%macro UNPACK_2CH 5-7
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
mov dst2q , [dstq+gprsize]
mov srcq , [srcq]
mov dstq , [dstq]
%ifidn %3, a
test dstq, mmsize-1
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
test dst2q, mmsize-1
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
lea srcq , [srcq + (2<<%5)*lenq]
lea dstq , [dstq + (1<<%4)*lenq]
lea dst2q, [dst2q + (1<<%4)*lenq]
neg lenq
%7 m0,m1,m2,m3,m4,m5
mova m6, [word_unpack_shuf]
.next:
mov%3 m0, [ srcq +(2<<%5)*lenq]
mov%3 m2, [ mmsize + srcq +(2<<%5)*lenq]
%if %5 == 1
%ifidn SUFFIX, _ssse3
pshufb m0, m6
mova m1, m0
pshufb m2, m6
punpcklqdq m0,m2
punpckhqdq m1,m2
%else
mova m1, m0
punpcklwd m0,m2
punpckhwd m1,m2
mova m2, m0
punpcklwd m0,m1
punpckhwd m2,m1
mova m1, m0
punpcklwd m0,m2
punpckhwd m1,m2
%endif
%else
mova m1, m0
shufps m0, m2, 10001000b
shufps m1, m2, 11011101b
%endif
%if %4 < %5
mov%3 m2, [2*mmsize + srcq +(2<<%5)*lenq]
mova m3, m2
mov%3 m4, [3*mmsize + srcq +(2<<%5)*lenq]
shufps m2, m4, 10001000b
shufps m3, m4, 11011101b
SWAP 1,2
%endif
%6 m0,m1,m2,m3,m4,m5
mov%3 [ dstq+(1<<%4)*lenq], m0
%if %4 > %5
mov%3 [ dst2q+(1<<%4)*lenq], m2
mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1
mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3
add lenq, 2*mmsize/(1<<%4)
%else
mov%3 [ dst2q+(1<<%4)*lenq], m1
add lenq, mmsize/(1<<%4)
%endif
jl .next
REP_RET
%endmacro
%macro CONV 5-7
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
mov srcq , [srcq]
mov dstq , [dstq]
%ifidn %3, a
test dstq, mmsize-1
jne %2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne %2_to_%1_u_int %+ SUFFIX
%else
%2_to_%1_u_int %+ SUFFIX:
%endif
lea srcq , [srcq + (1<<%5)*lenq]
lea dstq , [dstq + (1<<%4)*lenq]
neg lenq
%7 m0,m1,m2,m3,m4,m5
.next:
mov%3 m0, [ srcq +(1<<%5)*lenq]
mov%3 m1, [ mmsize + srcq +(1<<%5)*lenq]
%if %4 < %5
mov%3 m2, [2*mmsize + srcq +(1<<%5)*lenq]
mov%3 m3, [3*mmsize + srcq +(1<<%5)*lenq]
%endif
%6 m0,m1,m2,m3,m4,m5
mov%3 [ dstq+(1<<%4)*lenq], m0
mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1
%if %4 > %5
mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2
mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3
add lenq, 4*mmsize/(1<<%4)
%else
add lenq, 2*mmsize/(1<<%4)
%endif
jl .next
%if mmsize == 8
emms
RET
%else
REP_RET
%endif
%endmacro
%macro PACK_6CH 8
cglobal pack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
mov dstq, [dstq]
%ifidn %3, a
test dstq, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src1q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src2q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src3q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src4q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src5q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
%8 x,x,x,x,m7,x
.loop:
mov%3 m0, [srcq ]
mov%3 m1, [srcq+src1q]
mov%3 m2, [srcq+src2q]
mov%3 m3, [srcq+src3q]
mov%3 m4, [srcq+src4q]
mov%3 m5, [srcq+src5q]
%if cpuflag(sse)
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
%if cpuflag(avx)
blendps m6, m4, m0, 1100b
%else
movaps m6, m4
shufps m4, m0, q3210
SWAP 4,6
%endif
movlhps m0, m2
movhlps m4, m2
%if cpuflag(avx)
blendps m2, m5, m1, 1100b
%else
movaps m2, m5
shufps m5, m1, q3210
SWAP 2,5
%endif
movlhps m1, m3
movhlps m5, m3
%7 m0,m6,x,x,m7,m3
%7 m4,m1,x,x,m7,m3
%7 m2,m5,x,x,m7,m3
mov %+ %3 %+ ps [dstq ], m0
mov %+ %3 %+ ps [dstq+16], m6
mov %+ %3 %+ ps [dstq+32], m4
mov %+ %3 %+ ps [dstq+48], m1
mov %+ %3 %+ ps [dstq+64], m2
mov %+ %3 %+ ps [dstq+80], m5
%else ; mmx
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
SBUTTERFLY dq, 4, 5, 6
movq [dstq ], m0
movq [dstq+ 8], m2
movq [dstq+16], m4
movq [dstq+24], m1
movq [dstq+32], m3
movq [dstq+40], m5
%endif
add srcq, mmsize
add dstq, mmsize*6
sub lend, mmsize/4
jg .loop
%if mmsize == 8
emms
RET
%else
REP_RET
%endif
%endmacro
%macro UNPACK_6CH 8
cglobal unpack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, dst1, dst2, dst3, dst4, dst5, len
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
mov dst1q, [dstq+1*gprsize]
mov dst2q, [dstq+2*gprsize]
mov dst3q, [dstq+3*gprsize]
mov dst4q, [dstq+4*gprsize]
mov dst5q, [dstq+5*gprsize]
mov dstq, [dstq]
mov srcq, [srcq]
%ifidn %3, a
test dstq, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst1q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst2q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst3q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst4q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst5q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
sub dst1q, dstq
sub dst2q, dstq
sub dst3q, dstq
sub dst4q, dstq
sub dst5q, dstq
%8 x,x,x,x,m7,x
.loop:
mov%3 m0, [srcq ]
mov%3 m1, [srcq+16]
mov%3 m2, [srcq+32]
mov%3 m3, [srcq+48]
mov%3 m4, [srcq+64]
mov%3 m5, [srcq+80]
SBUTTERFLYPS 0, 3, 6
SBUTTERFLYPS 1, 4, 6
SBUTTERFLYPS 2, 5, 6
SBUTTERFLYPS 0, 4, 6
SBUTTERFLYPS 3, 2, 6
SBUTTERFLYPS 1, 5, 6
SWAP 1, 4
SWAP 2, 3
%7 m0,m1,x,x,m7,m6
%7 m2,m3,x,x,m7,m6
%7 m4,m5,x,x,m7,m6
mov %+ %3 %+ ps [dstq ], m0
mov %+ %3 %+ ps [dstq+dst1q], m1
mov %+ %3 %+ ps [dstq+dst2q], m2
mov %+ %3 %+ ps [dstq+dst3q], m3
mov %+ %3 %+ ps [dstq+dst4q], m4
mov %+ %3 %+ ps [dstq+dst5q], m5
add srcq, mmsize*6
add dstq, mmsize
sub lend, mmsize/4
jg .loop
REP_RET
%endmacro
%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)
%macro PACK_8CH 8
cglobal pack_8ch_%2_to_%1_%3, 2, PACK_8CH_GPRS, %6, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
mov dstq, [dstq]
%if ARCH_X86_32
DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
%define lend dword r2m
%define src1q r0q
%define src1m dword [rsp+32]
%if HAVE_ALIGNED_STACK == 0
DEFINE_ARGS dst, src, src2, src3, src5, src6
%define src4q r0q
%define src4m dword [rsp+36]
%endif
%define src7q r0q
%define src7m dword [rsp+40]
mov dstm, dstq
%endif
mov src7q, [srcq+7*gprsize]
mov src6q, [srcq+6*gprsize]
%if ARCH_X86_32
mov src7m, src7q
%endif
mov src5q, [srcq+5*gprsize]
mov src4q, [srcq+4*gprsize]
mov src3q, [srcq+3*gprsize]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
mov src4m, src4q
%endif
mov src2q, [srcq+2*gprsize]
mov src1q, [srcq+1*gprsize]
mov srcq, [srcq]
%ifidn %3, a
%if ARCH_X86_32
test dstmp, mmsize-1
%else
test dstq, mmsize-1
%endif
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src1q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src2q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src3q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
test src4m, mmsize-1
%else
test src4q, mmsize-1
%endif
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src5q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src6q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32
test src7m, mmsize-1
%else
test src7q, mmsize-1
%endif
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_8ch_%2_to_%1_u_int %+ SUFFIX:
%endif
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
%if ARCH_X86_64 || HAVE_ALIGNED_STACK
sub src4q, srcq
%else
sub src4m, srcq
%endif
sub src5q, srcq
sub src6q, srcq
%if ARCH_X86_64
sub src7q, srcq
%else
mov src1m, src1q
sub src7m, srcq
%endif
%if ARCH_X86_64
%8 x,x,x,x,m9,x
%elifidn %1, int32
%define m9 [flt2p31]
%else
%define m9 [flt2pm31]
%endif
.loop:
mov%3 m0, [srcq ]
mov%3 m1, [srcq+src1q]
mov%3 m2, [srcq+src2q]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
mov src4q, src4m
%endif
mov%3 m3, [srcq+src3q]
mov%3 m4, [srcq+src4q]
mov%3 m5, [srcq+src5q]
%if ARCH_X86_32
mov src7q, src7m
%endif
mov%3 m6, [srcq+src6q]
mov%3 m7, [srcq+src7q]
%if ARCH_X86_64
TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%7 m0,m1,x,x,m9,m8
%7 m2,m3,x,x,m9,m8
%7 m4,m5,x,x,m9,m8
%7 m6,m7,x,x,m9,m8
mov%3 [dstq], m0
%else
mov dstq, dstm
TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1
%7 m0,m1,x,x,m9,m2
mova m2, [rsp]
mov%3 [dstq], m0
%7 m2,m3,x,x,m9,m0
%7 m4,m5,x,x,m9,m0
%7 m6,m7,x,x,m9,m0
%endif
mov%3 [dstq+16], m1
mov%3 [dstq+32], m2
mov%3 [dstq+48], m3
mov%3 [dstq+64], m4
mov%3 [dstq+80], m5
mov%3 [dstq+96], m6
mov%3 [dstq+112], m7
add srcq, mmsize
add dstq, mmsize*8
%if ARCH_X86_32
mov dstm, dstq
mov src1q, src1m
%endif
sub lend, mmsize/4
jg .loop
REP_RET
%endmacro
%macro INT16_TO_INT32_N 6
pxor m2, m2
pxor m3, m3
punpcklwd m2, m1
punpckhwd m3, m1
SWAP 4,0
pxor m0, m0
pxor m1, m1
punpcklwd m0, m4
punpckhwd m1, m4
%endmacro
%macro INT32_TO_INT16_N 6
psrad m0, 16
psrad m1, 16
psrad m2, 16
psrad m3, 16
packssdw m0, m1
packssdw m2, m3
SWAP 1,2
%endmacro
%macro INT32_TO_FLOAT_INIT 6
mova %5, [flt2pm31]
%endmacro
%macro INT32_TO_FLOAT_N 6
cvtdq2ps %1, %1
cvtdq2ps %2, %2
mulps %1, %1, %5
mulps %2, %2, %5
%endmacro
%macro FLOAT_TO_INT32_INIT 6
mova %5, [flt2p31]
%endmacro
%macro FLOAT_TO_INT32_N 6
mulps %1, %5
mulps %2, %5
cvtps2dq %6, %1
cmpps %1, %1, %5, 5
paddd %1, %6
cvtps2dq %6, %2
cmpps %2, %2, %5, 5
paddd %2, %6
%endmacro
%macro INT16_TO_FLOAT_INIT 6
mova m5, [flt2pm31]
%endmacro
%macro INT16_TO_FLOAT_N 6
INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m0, m0, m5
mulps m1, m1, m5
mulps m2, m2, m5
mulps m3, m3, m5
%endmacro
%macro FLOAT_TO_INT16_INIT 6
mova m5, [flt2p15]
%endmacro
%macro FLOAT_TO_INT16_N 6
mulps m0, m5
mulps m1, m5
mulps m2, m5
mulps m3, m5
cvtps2dq m0, m0
cvtps2dq m1, m1
packssdw m0, m1
cvtps2dq m1, m2
cvtps2dq m3, m3
packssdw m1, m3
%endmacro
%macro NOP_N 0-6
%endmacro
INIT_MMX mmx
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_6CH float, float, u, 2, 2, 0, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, 0, NOP_N, NOP_N
INIT_XMM sse
PACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N
UNPACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N
INIT_XMM sse2
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N
PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N
UNPACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N
PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N
PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
INIT_YMM avx
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
%endif
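
The constants at the top of the file document the scaling: flt2p15 (32768.0) for the float to int16 path, flt2p31/flt2pm31 for the int32 paths. A scalar sketch of float to int16 with our own naming; the SIMD code gets the same saturation for free from cvtps2dq plus packssdw:

#include <math.h>
#include <stdint.h>

static int16_t float_to_int16_ref(float v)
{
    float scaled = v * 32768.0f;             /* the flt2p15 constant */
    if (scaled >=  32767.0f) return  32767;  /* packssdw saturates high... */
    if (scaled <= -32768.0f) return -32768;  /* ...and low */
    return (int16_t)lrintf(scaled);          /* round to nearest, like cvtps2dq */
}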


@ -0,0 +1,250 @@
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
dw1: times 8 dd 1
w1 : times 16 dw 1
SECTION .text
%macro MIX2_FLT 1
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
test in1q, mmsize-1
jne mix_2_1_float_u_int %+ SUFFIX
test in2q, mmsize-1
jne mix_2_1_float_u_int %+ SUFFIX
test outq, mmsize-1
jne mix_2_1_float_u_int %+ SUFFIX
%else
mix_2_1_float_u_int %+ SUFFIX:
%endif
VBROADCASTSS m4, [coeffpq + 4*index1q]
VBROADCASTSS m5, [coeffpq + 4*index2q]
shl lend , 2
add in1q , lenq
add in2q , lenq
add outq , lenq
neg lenq
.next:
%ifidn %1, a
mulps m0, m4, [in1q + lenq ]
mulps m1, m5, [in2q + lenq ]
mulps m2, m4, [in1q + lenq + mmsize]
mulps m3, m5, [in2q + lenq + mmsize]
%else
movu m0, [in1q + lenq ]
movu m1, [in2q + lenq ]
movu m2, [in1q + lenq + mmsize]
movu m3, [in2q + lenq + mmsize]
mulps m0, m0, m4
mulps m1, m1, m5
mulps m2, m2, m4
mulps m3, m3, m5
%endif
addps m0, m0, m1
addps m2, m2, m3
mov%1 [outq + lenq ], m0
mov%1 [outq + lenq + mmsize], m2
add lenq, mmsize*2
jl .next
REP_RET
%endmacro
%macro MIX1_FLT 1
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
%ifidn %1, a
test inq, mmsize-1
jne mix_1_1_float_u_int %+ SUFFIX
test outq, mmsize-1
jne mix_1_1_float_u_int %+ SUFFIX
%else
mix_1_1_float_u_int %+ SUFFIX:
%endif
VBROADCASTSS m2, [coeffpq + 4*indexq]
shl lenq , 2
add inq , lenq
add outq , lenq
neg lenq
.next:
%ifidn %1, a
mulps m0, m2, [inq + lenq ]
mulps m1, m2, [inq + lenq + mmsize]
%else
movu m0, [inq + lenq ]
movu m1, [inq + lenq + mmsize]
mulps m0, m0, m2
mulps m1, m1, m2
%endif
mov%1 [outq + lenq ], m0
mov%1 [outq + lenq + mmsize], m1
add lenq, mmsize*2
jl .next
REP_RET
%endmacro
%macro MIX1_INT16 1
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
%ifidn %1, a
test inq, mmsize-1
jne mix_1_1_int16_u_int %+ SUFFIX
test outq, mmsize-1
jne mix_1_1_int16_u_int %+ SUFFIX
%else
mix_1_1_int16_u_int %+ SUFFIX:
%endif
movd m4, [coeffpq + 4*indexq]
SPLATW m5, m4
psllq m4, 32
psrlq m4, 48
mova m0, [w1]
psllw m0, m4
psrlw m0, 1
punpcklwd m5, m0
add lenq , lenq
add inq , lenq
add outq , lenq
neg lenq
.next:
mov%1 m0, [inq + lenq ]
mov%1 m2, [inq + lenq + mmsize]
mova m1, m0
mova m3, m2
punpcklwd m0, [w1]
punpckhwd m1, [w1]
punpcklwd m2, [w1]
punpckhwd m3, [w1]
pmaddwd m0, m5
pmaddwd m1, m5
pmaddwd m2, m5
pmaddwd m3, m5
psrad m0, m4
psrad m1, m4
psrad m2, m4
psrad m3, m4
packssdw m0, m1
packssdw m2, m3
mov%1 [outq + lenq ], m0
mov%1 [outq + lenq + mmsize], m2
add lenq, mmsize*2
jl .next
%if mmsize == 8
emms
RET
%else
REP_RET
%endif
%endmacro
%macro MIX2_INT16 1
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
test in1q, mmsize-1
jne mix_2_1_int16_u_int %+ SUFFIX
test in2q, mmsize-1
jne mix_2_1_int16_u_int %+ SUFFIX
test outq, mmsize-1
jne mix_2_1_int16_u_int %+ SUFFIX
%else
mix_2_1_int16_u_int %+ SUFFIX:
%endif
movd m4, [coeffpq + 4*index1q]
movd m6, [coeffpq + 4*index2q]
SPLATW m5, m4
SPLATW m6, m6
psllq m4, 32
psrlq m4, 48
mova m7, [dw1]
pslld m7, m4
psrld m7, 1
punpcklwd m5, m6
add lend , lend
add in1q , lenq
add in2q , lenq
add outq , lenq
neg lenq
.next:
mov%1 m0, [in1q + lenq ]
mov%1 m2, [in2q + lenq ]
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
mov%1 m2, [in1q + lenq + mmsize]
mov%1 m6, [in2q + lenq + mmsize]
mova m3, m2
punpcklwd m2, m6
punpckhwd m3, m6
pmaddwd m0, m5
pmaddwd m1, m5
pmaddwd m2, m5
pmaddwd m3, m5
paddd m0, m7
paddd m1, m7
paddd m2, m7
paddd m3, m7
psrad m0, m4
psrad m1, m4
psrad m2, m4
psrad m3, m4
packssdw m0, m1
packssdw m2, m3
mov%1 [outq + lenq ], m0
mov%1 [outq + lenq + mmsize], m2
add lenq, mmsize*2
jl .next
%if mmsize == 8
emms
RET
%else
REP_RET
%endif
%endmacro
INIT_MMX mmx
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a
INIT_XMM sse
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
INIT_XMM sse2
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
%endif
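
In scalar terms, the MIX2_FLT kernels above compute a weighted two-channel sum. A C sketch with our own naming, following the argument order of the cglobal declaration (the asm broadcasts the two coefficients into m4/m5 once, outside the loop):

static void mix_2_1_float_ref(float *out, const float *in1, const float *in2,
                              const float *coeffp, int index1, int index2,
                              int len)
{
    const float c1 = coeffp[index1];
    const float c2 = coeffp[index2];
    for (int i = 0; i < len; i++)
        out[i] = c1 * in1[i] + c2 * in2[i];
}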


@ -0,0 +1,619 @@
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif
struc ResampleContext
.av_class: pointer 1
.filter_bank: pointer 1
.filter_length: resd 1
.filter_alloc: resd 1
.ideal_dst_incr: resd 1
.dst_incr: resd 1
.dst_incr_div: resd 1
.dst_incr_mod: resd 1
.index: resd 1
.frac: resd 1
.src_incr: resd 1
.compensation_distance: resd 1
.phase_count: resd 1
; there are a few more fields here, but we only care about the first few
endstruc
SECTION_RODATA
pf_1: dd 1.0
pdbl_1: dq 1.0
pd_0x4000: dd 0x4000
SECTION .text
; FIXME remove unneeded variables (index_incr, phase_mask)
%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
; int resample_common_$format(ResampleContext *ctx, $format *dst,
; const $format *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_count, index, frac, \
dst_incr_mod, size, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
phase_mask, dst_end, filter_bank
; use red-zone for variable storage
%define ctx_stackq [rsp-0x8]
%define src_stackq [rsp-0x10]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x14]
%endif
; load as many variables in registers as possible; for the rest, store
; on stack so that we have 'ctx' available as one extra register
mov sized, r3d
%if UNIX64
mov update_context_stackd, r4d
%endif
mov indexd, [ctxq+ResampleContext.index]
mov fracd, [ctxq+ResampleContext.frac]
mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
mov src_incrd, [ctxq+ResampleContext.src_incr]
mov ctx_stackq, ctxq
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
shl min_filter_len_x4d, %3
lea dst_endq, [dstq+sizeq*%2]
%if UNIX64
mov ecx, [ctxq+ResampleContext.phase_count]
mov edi, [ctxq+ResampleContext.filter_alloc]
DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
mov R9d, [ctxq+ResampleContext.filter_alloc]
mov ecx, [ctxq+ResampleContext.phase_count]
DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%endif
neg min_filter_len_x4q
sub filter_bankq, min_filter_len_x4q
sub srcq, min_filter_len_x4q
mov src_stackq, srcq
%else ; x86-32
cglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \
index, min_filter_length_x4, filter_bank
; push temp variables to stack
%define ctx_stackq r0mp
%define src_stackq r2mp
%define update_context_stackd r4m
mov dstq, r1mp
mov r3, r3mp
lea r3, [dstq+r3*%2]
PUSH dword [ctxq+ResampleContext.dst_incr_div]
PUSH dword [ctxq+ResampleContext.dst_incr_mod]
PUSH dword [ctxq+ResampleContext.filter_alloc]
PUSH r3
PUSH dword [ctxq+ResampleContext.phase_count] ; unneeded replacement for phase_mask
PUSH dword [ctxq+ResampleContext.src_incr]
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
shl min_filter_length_x4d, %3
mov fracd, [ctxq+ResampleContext.frac]
neg min_filter_length_x4q
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
sub r2mp, min_filter_length_x4q
sub filter_bankq, min_filter_length_x4q
PUSH min_filter_length_x4q
PUSH filter_bankq
mov phase_countd, [ctxq+ResampleContext.phase_count]
DEFINE_ARGS src, phase_count, dst, frac, index, min_filter_count_x4, filter
%define filter_bankq dword [rsp+0x0]
%define min_filter_length_x4q dword [rsp+0x4]
%define src_incrd dword [rsp+0x8]
%define phase_maskd dword [rsp+0xc]
%define dst_endq dword [rsp+0x10]
%define filter_allocd dword [rsp+0x14]
%define dst_incr_modd dword [rsp+0x18]
%define dst_incr_divd dword [rsp+0x1c]
mov srcq, r2mp
%endif
.loop:
mov filterd, filter_allocd
imul filterd, indexd
%if ARCH_X86_64
mov min_filter_count_x4q, min_filter_len_x4q
lea filterq, [filter_bankq+filterq*%2]
%else ; x86-32
mov min_filter_count_x4q, filter_bankq
lea filterq, [min_filter_count_x4q+filterq*%2]
mov min_filter_count_x4q, min_filter_length_x4q
%endif
%ifidn %1, int16
movd m0, [pd_0x4000]
%else ; float/double
xorps m0, m0, m0
%endif
align 16
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
%endif
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
addp%4 m0, m0, m1
%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
%ifidn %1, int16
HADDD m0, m1
psrad m0, 15
add fracd, dst_incr_modd
packssdw m0, m0
add indexd, dst_incr_divd
movd [dstq], m0
%else ; float/double
; horizontal sum & store
%if mmsize == 32
vextractf128 xm1, m0, 0x1
addp%4 xm0, xm1
%endif
movhlps xm1, xm0
%ifidn %1, float
addps xm0, xm1
shufps xm1, xm0, xm0, q0001
%endif
add fracd, dst_incr_modd
addp%4 xm0, xm1
add indexd, dst_incr_divd
movs%4 [dstq], xm0
%endif
cmp fracd, src_incrd
jl .skip
sub fracd, src_incrd
inc indexd
%if UNIX64
DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS src, phase_count, dst, frac, index, index_incr
%endif
.skip:
add dstq, %2
cmp indexd, phase_countd
jb .index_skip
.index_while:
sub indexd, phase_countd
lea srcq, [srcq+%2]
cmp indexd, phase_countd
jnb .index_while
.index_skip:
cmp dstq, dst_endq
jne .loop
%if ARCH_X86_64
DEFINE_ARGS ctx, dst, src, phase_count, index, frac
%else ; x86-32
DEFINE_ARGS src, ctx, update_context, frac, index
%endif
cmp dword update_context_stackd, 0
jz .skip_store
; strictly speaking, the function should always return the consumed
; number of bytes; however, we only use the value if update_context
; is true, so let's just leave it uninitialized otherwise
mov ctxq, ctx_stackq
movifnidn rax, srcq
mov [ctxq+ResampleContext.frac ], fracd
sub rax, src_stackq
mov [ctxq+ResampleContext.index], indexd
shr rax, %3
.skip_store:
%if ARCH_X86_32
ADD rsp, 0x20
%endif
RET
; int resample_linear_$format(ResampleContext *ctx, float *dst,
; const float *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
%if UNIX64
cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_count, index, frac, \
size, dst_incr_mod, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
src, dst_end, filter_bank
mov srcq, r2mp
%else ; win64
cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_count, index, frac, \
size, dst_incr_mod, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
dst, dst_end, filter_bank
mov dstq, r1mp
%endif
; use red-zone for variable storage
%define ctx_stackq [rsp-0x8]
%define src_stackq [rsp-0x10]
%define phase_mask_stackd [rsp-0x14]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x18]
%endif
; load as many variables in registers as possible; for the rest, store
; on stack so that we have 'ctx' available as one extra register
mov sized, r3d
%if UNIX64
mov update_context_stackd, r4d
%endif
mov indexd, [ctxq+ResampleContext.index]
mov fracd, [ctxq+ResampleContext.frac]
mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
mov src_incrd, [ctxq+ResampleContext.src_incr]
mov ctx_stackq, ctxq
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
%ifidn %1, int16
movd m4, [pd_0x4000]
%else ; float/double
cvtsi2s%4 xm0, src_incrd
movs%4 xm4, [%5]
divs%4 xm4, xm0
%endif
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
shl min_filter_len_x4d, %3
lea dst_endq, [dstq+sizeq*%2]
%if UNIX64
mov ecx, [ctxq+ResampleContext.phase_count]
mov edi, [ctxq+ResampleContext.filter_alloc]
DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, filter1, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
mov R9d, [ctxq+ResampleContext.filter_alloc]
mov ecx, [ctxq+ResampleContext.phase_count]
DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, filter1, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%endif
neg min_filter_len_x4q
sub filter_bankq, min_filter_len_x4q
sub srcq, min_filter_len_x4q
mov src_stackq, srcq
%else ; x86-32
cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
frac, index, dst, filter_bank
; push temp variables to stack
%define ctx_stackq r0mp
%define src_stackq r2mp
%define update_context_stackd r4m
mov dstq, r1mp
mov r3, r3mp
lea r3, [dstq+r3*%2]
PUSH dword [ctxq+ResampleContext.dst_incr_div]
PUSH r3
mov r3, dword [ctxq+ResampleContext.filter_alloc]
PUSH dword [ctxq+ResampleContext.dst_incr_mod]
PUSH r3
shl r3, %3
PUSH r3
mov r3, dword [ctxq+ResampleContext.src_incr]
PUSH dword [ctxq+ResampleContext.phase_count] ; unneeded replacement of phase_mask
PUSH r3d
%ifidn %1, int16
movd m4, [pd_0x4000]
%else ; float/double
cvtsi2s%4 xm0, r3d
movs%4 xm4, [%5]
divs%4 xm4, xm0
%endif
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
shl min_filter_length_x4d, %3
mov fracd, [ctxq+ResampleContext.frac]
neg min_filter_length_x4q
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
sub r2mp, min_filter_length_x4q
sub filter_bankq, min_filter_length_x4q
PUSH min_filter_length_x4q
PUSH filter_bankq
PUSH dword [ctxq+ResampleContext.phase_count]
DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
%define phase_count_stackd dword [rsp+0x0]
%define filter_bankq dword [rsp+0x4]
%define min_filter_length_x4q dword [rsp+0x8]
%define src_incrd dword [rsp+0xc]
%define phase_mask_stackd dword [rsp+0x10]
%define filter_alloc_x4q dword [rsp+0x14]
%define filter_allocd dword [rsp+0x18]
%define dst_incr_modd dword [rsp+0x1c]
%define dst_endq dword [rsp+0x20]
%define dst_incr_divd dword [rsp+0x24]
mov srcq, r2mp
%endif
.loop:
mov filter1d, filter_allocd
imul filter1d, indexd
%if ARCH_X86_64
mov min_filter_count_x4q, min_filter_len_x4q
lea filter1q, [filter_bankq+filter1q*%2]
lea filter2q, [filter1q+filter_allocq*%2]
%else ; x86-32
mov min_filter_count_x4q, filter_bankq
lea filter1q, [min_filter_count_x4q+filter1q*%2]
mov min_filter_count_x4q, min_filter_length_x4q
mov filter2q, filter1q
add filter2q, filter_alloc_x4q
%endif
%ifidn %1, int16
mova m0, m4
mova m2, m4
%else ; float/double
xorps m0, m0, m0
xorps m2, m2, m2
%endif
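; inner FIR loop: m0 accumulates src*filter1 and m2 accumulates
; src*filter2; the offset register counts up from -filter_len_bytes to 0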
align 16
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2
vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
pmaddwd m1, [filter1q+min_filter_count_x4q*1]
paddd m2, m3
paddd m0, m1
%endif ; cpuflag
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
addp%4 m2, m2, m3
addp%4 m0, m0, m1
%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
%ifidn %1, int16
%if mmsize == 16
%if cpuflag(xop)
vphadddq m2, m2
vphadddq m0, m0
%endif
pshufd m3, m2, q0032
pshufd m1, m0, q0032
paddd m2, m3
paddd m0, m1
%endif
%if notcpuflag(xop)
PSHUFLW m3, m2, q0032
PSHUFLW m1, m0, q0032
paddd m2, m3
paddd m0, m1
%endif
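; m2 = v2 - v1, the phase difference used for the linear interpolation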
psubd m2, m0
; moving the sum through a GPR here is probably a really bad idea on Atom
; and other machines with a long GPR<->XMM transfer latency; however, it
; makes the final saturating clip a lot simpler...
movd eax, m2
add indexd, dst_incr_divd
imul fracd
idiv src_incrd
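; eax = (v2 - v1) * frac / src_incr (imul widens into edx:eax for idiv)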
movd m1, eax
add fracd, dst_incr_modd
paddd m0, m1
psrad m0, 15
packssdw m0, m0
movd [dstq], m0
; note that for imul/idiv, I need to move filter to edx/eax for each:
; - 32bit: eax=r0[filter1], edx=r2[filter2]
; - win64: eax=r6[filter1], edx=r1[todo]
; - unix64: eax=r6[filter1], edx=r2[todo]
%else ; float/double
; val += (v2 - val) * (FELEML) frac / c->src_incr;
%if mmsize == 32
vextractf128 xm1, m0, 0x1
vextractf128 xm3, m2, 0x1
addp%4 xm0, xm1
addp%4 xm2, xm3
%endif
cvtsi2s%4 xm1, fracd
subp%4 xm2, xm0
mulp%4 xm1, xm4
shufp%4 xm1, xm1, q0000
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 xm0, xm2, xm1, xm0
%else
mulp%4 xm2, xm1
addp%4 xm0, xm2
%endif ; cpuflag
; horizontal sum & store
movhlps xm1, xm0
%ifidn %1, float
addps xm0, xm1
shufps xm1, xm0, xm0, q0001
%endif
add fracd, dst_incr_modd
addp%4 xm0, xm1
add indexd, dst_incr_divd
movs%4 [dstq], xm0
%endif
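; frac and index were advanced above; fold an overflow of frac past one
; full source step into index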
cmp fracd, src_incrd
jl .skip
sub fracd, src_incrd
inc indexd
%if UNIX64
DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS filter1, phase_count, index_incr, frac, index, dst, src
%endif
.skip:
%if ARCH_X86_32
mov phase_countd, phase_count_stackd
%endif
add dstq, %2
cmp indexd, phase_countd
jb .index_skip
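; index has run past phase_count: wrap it back into range, consuming one
; input sample per wrap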
.index_while:
sub indexd, phase_countd
lea srcq, [srcq+%2]
cmp indexd, phase_countd
jnb .index_while
.index_skip:
cmp dstq, dst_endq
jne .loop
%if UNIX64
DEFINE_ARGS ctx, dst, filter2, phase_count, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS ctx, filter2, src, phase_count, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
%endif
cmp dword update_context_stackd, 0
jz .skip_store
; strictly speaking, the function should always return the number of
; source samples consumed; however, the caller only uses the value when
; update_context is true, so just leave it uninitialized otherwise
mov ctxq, ctx_stackq
movifnidn rax, srcq
mov [ctxq+ResampleContext.frac ], fracd
sub rax, src_stackq
mov [ctxq+ResampleContext.index], indexd
shr rax, %3
.skip_store:
%if ARCH_X86_32
ADD rsp, 0x28
%endif
RET
%endmacro
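; RESAMPLE_FNS arguments: sample type, bytes per sample, log2(bytes per
; sample), and for float/double the scalar insn suffix and a 1.0 constant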
INIT_XMM sse
RESAMPLE_FNS float, 4, 2, s, pf_1
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if ARCH_X86_32
INIT_MMX mmxext
RESAMPLE_FNS int16, 2, 1
%endif
INIT_XMM sse2
RESAMPLE_FNS int16, 2, 1
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
RESAMPLE_FNS int16, 2, 1
%endif
INIT_XMM sse2
RESAMPLE_FNS double, 8, 3, d, pdbl_1
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FNS double, 8, 3, d, pdbl_1
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS double, 8, 3, d, pdbl_1
%endif
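For orientation, the following scalar C is a rough equivalent of what the resample_linear kernels above compute per output sample. It is a sketch under assumed names (LinCtx and its fields mirror swresample's ResampleContext but are illustrative), not FFmpeg's actual C implementation:

typedef struct {
    int index, frac;                  /* position: whole phases + fraction */
    int src_incr, dst_incr_div, dst_incr_mod;
    int filter_alloc, filter_length, phase_count;
    const float *filter_bank;
} LinCtx;

/* Returns the number of source samples consumed. */
int resample_linear_float_c(LinCtx *c, float *dst, int dst_size,
                            const float *src)
{
    const float *src0 = src;
    for (int i = 0; i < dst_size; i++) {
        const float *filter1 = c->filter_bank + c->index * c->filter_alloc;
        const float *filter2 = filter1 + c->filter_alloc; /* next phase */
        float v1 = 0.0f, v2 = 0.0f;
        for (int j = 0; j < c->filter_length; j++) {      /* .inner_loop */
            v1 += src[j] * filter1[j];
            v2 += src[j] * filter2[j];
        }
        /* val += (v2 - val) * frac / src_incr, as in the float path above */
        dst[i] = v1 + (v2 - v1) * (float)c->frac / c->src_incr;
        c->frac  += c->dst_incr_mod;
        c->index += c->dst_incr_div;
        if (c->frac >= c->src_incr) {    /* carry frac overflow into index */
            c->frac -= c->src_incr;
            c->index++;
        }
        while (c->index >= c->phase_count) { /* .index_while: wrap index */
            c->index -= c->phase_count;
            src++;                           /* consume one input sample */
        }
    }
    return (int)(src - src0);
}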


@@ -429,48 +429,14 @@ if [ $SRS_EXPORT_LIBRTMP_PROJECT = NO ]; then
ln -sf ../3rdparty/ffmpeg-4.2-fit && cd ffmpeg-4.2-fit &&
PKG_CONFIG_PATH=$ABS_OBJS/opus/lib/pkgconfig ./configure \
--prefix=`pwd`/_release \
--pkg-config-flags="--static" \
--extra-libs=-lpthread \
--extra-libs=-lm \
--disable-programs \
--disable-doc \
--disable-htmlpages \
--disable-manpages \
--disable-podpages \
--disable-txtpages \
--disable-avdevice \
--disable-avformat \
--disable-swscale \
--disable-postproc \
--disable-avfilter \
--disable-network \
--disable-dct \
--disable-dwt \
--disable-error-resilience \
--disable-lsp \
--disable-lzo \
--disable-faan \
--disable-pixelutils \
--disable-hwaccels \
--disable-devices \
--disable-audiotoolbox \
--disable-videotoolbox \
--disable-appkit \
--disable-coreimage \
--disable-avfoundation \
--disable-securetransport \
--disable-iconv \
--disable-lzma \
--disable-sdl2 \
--disable-everything \
--enable-decoder=aac \
--enable-decoder=aac_fixed \
--enable-decoder=aac_latm \
--enable-decoder=libopus \
--enable-encoder=aac \
--enable-encoder=opus \
--enable-encoder=libopus \
--enable-libopus &&
--pkg-config-flags="--static" --extra-libs=-lpthread --extra-libs=-lm \
--disable-programs --disable-doc --disable-htmlpages --disable-manpages --disable-podpages --disable-txtpages \
--disable-avdevice --disable-avformat --disable-swscale --disable-postproc --disable-avfilter --disable-network \
--disable-dct --disable-dwt --disable-error-resilience --disable-lsp --disable-lzo --disable-faan --disable-pixelutils \
--disable-hwaccels --disable-devices --disable-audiotoolbox --disable-videotoolbox --disable-appkit --disable-coreimage \
--disable-avfoundation --disable-securetransport --disable-iconv --disable-lzma --disable-sdl2 --disable-everything \
--enable-decoder=aac --enable-decoder=aac_fixed --enable-decoder=aac_latm --enable-decoder=libopus --enable-encoder=aac \
--enable-encoder=opus --enable-encoder=libopus --enable-libopus &&
make ${SRS_JOBS} && make install
cd .. && rm -rf ffmpeg && ln -sf ffmpeg-4.2-fit/_release ffmpeg
)

trunk/configure

@@ -151,7 +151,8 @@ if [[ $SRS_SHARED_ST == YES ]]; then LibSTfile="-lst"; fi
# srtp
LibSrtpRoot="${SRS_OBJS_DIR}/srtp2/include"; LibSrtpFile="${SRS_OBJS_DIR}/srtp2/lib/libsrtp2.a"
# ffmpeg
LibFfmpegRoot="${SRS_OBJS_DIR}/ffmpeg/include"; LibFfmpegFile="${SRS_OBJS_DIR}/ffmpeg/lib/libavcodec.a ${SRS_OBJS_DIR}/ffmpeg/lib/libswresample.a ${SRS_OBJS_DIR}/ffmpeg/lib/libavutil.a ${SRS_OBJS_DIR}/ffmpeg/lib/libopus.a -lpthread"
LibFfmpegRoot="${SRS_OBJS_DIR}/ffmpeg/include"; LibFfmpegFile="${SRS_OBJS_DIR}/ffmpeg/lib/libavcodec.a ${SRS_OBJS_DIR}/ffmpeg/lib/libswresample.a ${SRS_OBJS_DIR}/ffmpeg/lib/libavutil.a -lpthread"
LibFfmpegRoot="${LibFfmpegRoot} ${SRS_OBJS_DIR}/opus/include"; LibFfmpegFile="${LibFfmpegFile} ${SRS_OBJS_DIR}/opus/lib/libopus.a"
# openssl-1.1.0e, for the RTMP complex handshake.
LibSSLRoot="";LibSSLfile=""
if [[ $SRS_SSL == YES && $SRS_USE_SYS_SSL == NO ]]; then
@@ -173,7 +174,7 @@ fi
# the link options, always use static link
SrsLinkOptions="-ldl";
if [[ $SRS_SRT == YES ]]; then
SrsLinkOptions="${SrsLinkOptions} -pthread";
SrsLinkOptions="${SrsLinkOptions} -lpthread";
fi
if [[ $SRS_SSL == YES && $SRS_USE_SYS_SSL == YES ]]; then
SrsLinkOptions="${SrsLinkOptions} -lssl -lcrypto";