1
0
Fork 0
mirror of https://github.com/ossrs/srs.git synced 2025-03-09 15:49:59 +00:00

For #1659, #307, add x86 asm for ffmpeg for rtc

This commit is contained in:
winlin 2020-03-22 17:14:07 +08:00
parent 4308f238c0
commit 37c84eccc0
28 changed files with 8441 additions and 50 deletions

View file

@ -0,0 +1,60 @@
# Object lists for the AArch64 port. Each assignment appends to a variable
# whose name is completed by the value of the corresponding CONFIG_* switch.

# subsystems
OBJS-$(CONFIG_FFT)                     += aarch64/fft_init_aarch64.o
OBJS-$(CONFIG_FMTCONVERT)              += aarch64/fmtconvert_init.o
OBJS-$(CONFIG_H264CHROMA)              += aarch64/h264chroma_init_aarch64.o
OBJS-$(CONFIG_H264DSP)                 += aarch64/h264dsp_init_aarch64.o
OBJS-$(CONFIG_H264PRED)                += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL)                += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP)                 += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP)            += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST)       += aarch64/neontest.o
OBJS-$(CONFIG_VIDEODSP)                += aarch64/videodsp_init.o
OBJS-$(CONFIG_VP8DSP)                  += aarch64/vp8dsp_init_aarch64.o

# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER)             += aarch64/aacpsdsp_init_aarch64.o \
                                          aarch64/sbrdsp_init_aarch64.o
OBJS-$(CONFIG_DCA_DECODER)             += aarch64/synth_filter_init.o
OBJS-$(CONFIG_OPUS_DECODER)            += aarch64/opusdsp_init.o
OBJS-$(CONFIG_RV40_DECODER)            += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1DSP)                  += aarch64/vc1dsp_init_aarch64.o
OBJS-$(CONFIG_VORBIS_DECODER)          += aarch64/vorbisdsp_init.o
OBJS-$(CONFIG_VP9_DECODER)             += aarch64/vp9dsp_init_10bpp_aarch64.o \
                                          aarch64/vp9dsp_init_12bpp_aarch64.o \
                                          aarch64/vp9dsp_init_aarch64.o

# ARMv8 optimizations
# subsystems
ARMV8-OBJS-$(CONFIG_VIDEODSP)          += aarch64/videodsp.o

# NEON optimizations
# subsystems
NEON-OBJS-$(CONFIG_AAC_DECODER)        += aarch64/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_FFT)                += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT)         += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA)         += aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP)            += aarch64/h264dsp_neon.o \
                                          aarch64/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264PRED)           += aarch64/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL)           += aarch64/h264qpel_neon.o \
                                          aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP)            += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP)            += aarch64/idctdsp_init_aarch64.o \
                                          aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT)               += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP)       += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP)             += aarch64/vp8dsp_neon.o

# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER)        += aarch64/aacpsdsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER)        += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_OPUS_DECODER)       += aarch64/opusdsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += aarch64/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER)        += aarch64/vp9itxfm_16bpp_neon.o \
                                          aarch64/vp9itxfm_neon.o \
                                          aarch64/vp9lpf_16bpp_neon.o \
                                          aarch64/vp9lpf_neon.o \
                                          aarch64/vp9mc_16bpp_neon.o \
                                          aarch64/vp9mc_neon.o

View file

@ -0,0 +1,199 @@
# Object that is always appended to OBJS by this fragment.
# Deliberately a single, self-terminated line: a trailing backslash here would
# splice whatever line follows into this assignment.
OBJS += x86/constants.o
# subsystems
# Init/glue objects for the x86 subsystems, one entry per CONFIG_* switch.
OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER)           += x86/diracdsp_init.o \
                                          x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP)                 += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT)              += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LLVIDENCDSP)             += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC)                     += x86/lpc.o
OBJS-$(CONFIG_MDCT15)                  += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o \
                                          x86/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC)            += x86/mpegvideoenc.o \
                                          x86/mpegvideoencdsp_init.o
OBJS-$(CONFIG_PIXBLOCKDSP)             += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP)                 += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP)                 += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP)                  += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
# decoders/encoders
# Init/glue objects pulled in by individual codecs.
OBJS-$(CONFIG_AAC_DECODER)             += x86/aacpsdsp_init.o \
                                          x86/sbrdsp_init.o
OBJS-$(CONFIG_AAC_ENCODER)             += x86/aacencdsp_init.o
OBJS-$(CONFIG_ADPCM_G722_DECODER)      += x86/g722dsp_init.o
OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
OBJS-$(CONFIG_OPUS_DECODER)            += x86/opusdsp_init.o
OBJS-$(CONFIG_OPUS_ENCODER)            += x86/celt_pvq_init.o
OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_LSCR_DECODER)            += x86/pngdsp_init.o
OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER)            += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER)             += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER)             += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
OBJS-$(CONFIG_TTA_ENCODER)             += x86/ttaencdsp_init.o
OBJS-$(CONFIG_UTVIDEO_DECODER)         += x86/utvideodsp_init.o
OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER)             += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o \
                                          x86/vp9dsp_init_10bpp.o \
                                          x86/vp9dsp_init_12bpp.o \
                                          x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
# GCC inline assembly optimizations

# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP)             += x86/fdct.o
MMX-OBJS-$(CONFIG_VC1DSP)              += x86/vc1dsp_mmx.o

# decoders/encoders
MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
# subsystems
# Stand-alone assembler objects (X86ASM-OBJS-*) for the x86 subsystems.
X86ASM-OBJS-$(CONFIG_AC3DSP)              += x86/ac3dsp.o \
                                             x86/ac3dsp_downmix.o
X86ASM-OBJS-$(CONFIG_AUDIODSP)            += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP)            += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP)            += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT)                 += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT)                 += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT)          += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP)             += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA)          += x86/h264_chromamc.o \
                                             x86/h264_chromamc_10bit.o
X86ASM-OBJS-$(CONFIG_H264DSP)             += x86/h264_deblock.o \
                                             x86/h264_deblock_10bit.o \
                                             x86/h264_idct.o \
                                             x86/h264_idct_10bit.o \
                                             x86/h264_weight.o \
                                             x86/h264_weight_10bit.o
X86ASM-OBJS-$(CONFIG_H264PRED)            += x86/h264_intrapred.o \
                                             x86/h264_intrapred_10bit.o
X86ASM-OBJS-$(CONFIG_H264QPEL)            += x86/h264_qpel_8bit.o \
                                             x86/h264_qpel_10bit.o \
                                             x86/fpel.o \
                                             x86/qpel.o
X86ASM-OBJS-$(CONFIG_HPELDSP)             += x86/fpel.o \
                                             x86/hpeldsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP)          += x86/huffyuvdsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP)       += x86/huffyuvencdsp.o
X86ASM-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP)            += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP)            += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP)         += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_MDCT15)              += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP)              += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP)        += x86/imdct36.o
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC)        += x86/mpegvideoencdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_DECODER)        += x86/opusdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER)        += x86/celt_pvq_search.o
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP)         += x86/pixblockdsp.o
X86ASM-OBJS-$(CONFIG_QPELDSP)             += x86/qpeldsp.o \
                                             x86/fpel.o \
                                             x86/qpel.o
X86ASM-OBJS-$(CONFIG_RV34DSP)             += x86/rv34dsp.o
X86ASM-OBJS-$(CONFIG_VC1DSP)              += x86/vc1dsp_loopfilter.o \
                                             x86/vc1dsp_mc.o
X86ASM-OBJS-$(CONFIG_IDCTDSP)             += x86/simple_idct10.o \
                                             x86/simple_idct.o
X86ASM-OBJS-$(CONFIG_VIDEODSP)            += x86/videodsp.o
X86ASM-OBJS-$(CONFIG_VP3DSP)              += x86/vp3dsp.o
X86ASM-OBJS-$(CONFIG_VP8DSP)              += x86/vp8dsp.o \
                                             x86/vp8dsp_loopfilter.o
# decoders/encoders
# Stand-alone assembler objects (X86ASM-OBJS-*) pulled in by individual codecs.
X86ASM-OBJS-$(CONFIG_AAC_DECODER)         += x86/aacpsdsp.o \
                                             x86/sbrdsp.o
X86ASM-OBJS-$(CONFIG_AAC_ENCODER)         += x86/aacencdsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER)  += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER)  += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER)        += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER)        += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER)        += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER)         += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)       += x86/diracdsp.o \
                                             x86/dirac_dwt.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)       += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER)         += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER)        += x86/flacdsp.o
# The FLAC encoder object below is only listed when CONFIG_GPL is defined.
ifdef CONFIG_GPL
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER)        += x86/flac_dsp_gpl.o
endif
X86ASM-OBJS-$(CONFIG_HEVC_DECODER)        += x86/hevc_add_res.o \
                                             x86/hevc_deblock.o \
                                             x86/hevc_idct.o \
                                             x86/hevc_mc.o \
                                             x86/hevc_sao.o \
                                             x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER)    += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER)        += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER)         += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER)       += x86/xvididct.o
X86ASM-OBJS-$(CONFIG_PNG_DECODER)         += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER)      += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER)        += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER)         += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER)        += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER)         += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER)      += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_TTA_DECODER)         += x86/ttadsp.o
X86ASM-OBJS-$(CONFIG_TTA_ENCODER)         += x86/ttaencdsp.o
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER)     += x86/utvideodsp.o
X86ASM-OBJS-$(CONFIG_V210_ENCODER)        += x86/v210enc.o
X86ASM-OBJS-$(CONFIG_V210_DECODER)        += x86/v210.o
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER)      += x86/vorbisdsp.o
X86ASM-OBJS-$(CONFIG_VP3_DECODER)         += x86/hpeldsp_vp3.o
X86ASM-OBJS-$(CONFIG_VP6_DECODER)         += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER)         += x86/vp9intrapred.o \
                                             x86/vp9intrapred_16bpp.o \
                                             x86/vp9itxfm.o \
                                             x86/vp9itxfm_16bpp.o \
                                             x86/vp9lpf.o \
                                             x86/vp9lpf_16bpp.o \
                                             x86/vp9mc.o \
                                             x86/vp9mc_16bpp.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER)        += x86/vp8dsp.o

View file

@ -0,0 +1,86 @@
;******************************************************************************
;* SIMD optimized AAC encoder DSP functions
;*
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Four dwords with every bit set except bit 31. abs_pow34 loads this vector
; and ANDs it with its input.
float_abs_mask: times 4 dd 0x7fffffff
SECTION .text
;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;
; out[i] = sqrt(|in[i]| * sqrt(|in[i]|)), i.e. |in[i]|^(3/4).
;
; One full vector (mmsize bytes) is handled per iteration and the counter
; is only tested after the body, so at least one vector is always read
; and written.
; The store uses mova and the load is a memory operand of andps, so both
; buffers are presumably expected to be mmsize-aligned - confirm with the
; callers.
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
    mova      m2, [float_abs_mask]
    ; size is a 32-bit int. Shift its dword view so that, on x86-64, the
    ; 32-bit write clears the upper half of sizeq instead of carrying
    ; whatever the caller left there into the pointer arithmetic below.
    ; aac_quantize_bands scales its size argument the same way.
    shl       sized, 2                  ; element count -> byte count
    add       inq, sizeq                ; point both buffers at their end ...
    add       outq, sizeq
    neg       sizeq                     ; ... and index upwards from -bytes
.loop:
    andps     m0, m2, [inq+sizeq]       ; m0 = |in|
    sqrtps    m1, m0                    ; m1 = sqrt(|in|)
    mulps     m0, m1                    ; m0 = |in| * sqrt(|in|)
    sqrtps    m0, m0                    ; m0 = sqrt(|in| * sqrt(|in|))
    mova      [outq+sizeq], m0
    add       sizeq, mmsize
    jl .loop                            ; until the negative offset reaches 0
    RET
;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
; int size, int is_signed, int maxval, const float Q34,
; const float rounding)
;
; Per element: min(scaled[i] * Q34 + rounding, maxval), with the sign bit of
; in[i] OR-ed in when bit 0 of is_signed is set, then converted to int with
; truncation and stored to out[i].
; One full vector is handled per iteration and the counter is only tested
; after the body, so at least one vector is always processed.
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
; Outside UNIX64, Q34, rounding and maxval are fetched through their
; argument-slot names. On UNIX64 m0/m1 are used as they are - presumably they
; already hold Q34 and rounding as the first two float arguments (confirm
; against the calling convention handling in x86inc).
%if UNIX64 == 0
movss m0, Q34m
movss m1, roundingm
cvtsi2ss m3, dword maxvalm
%else
cvtsi2ss m3, maxvald
%endif
; Broadcast lane 0 of each scalar to all four lanes.
shufps m0, m0, 0
shufps m1, m1, 0
shufps m3, m3, 0
; Move bit 0 of is_signed into the sign-bit position and broadcast it:
; m4 selects the sign bit of in[] when is_signed is odd, nothing otherwise.
shl is_signedd, 31
movd m4, is_signedd
shufps m4, m4, 0
; size *= 4 (bytes); the pointers are moved to the end of their buffers and
; indexed with a negative offset that counts up to zero.
shl sized, 2
add inq, sizeq
add outq, sizeq
add scaledq, sizeq
neg sizeq
.loop:
mulps m2, m0, [scaledq+sizeq] ; m2 = scaled * Q34
addps m2, m1 ; m2 += rounding
minps m2, m3 ; clamp to maxval
andps m5, m4, [inq+sizeq] ; m5 = sign bit of in (or 0 when unsigned)
orps m2, m5 ; apply that sign bit
cvttps2dq m2, m2 ; float -> int32, truncating
mova [outq+sizeq], m2
add sizeq, mmsize
jl .loop
RET

View file

@ -0,0 +1,487 @@
;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Sign-bit mask covering dwords 1 and 3 only. ps_hybrid_analysis loads it into
; m7 and XORs it into vectors to flip the sign of those two lanes.
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
SECTION .text
;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;
; Each iteration squares two vectors of src element-wise, combines them with
; HADDPS and adds the result to one vector of dst - presumably
; dst[i] += src[i][0]^2 + src[i][1]^2 (HADDPS comes from x86util.asm; confirm
; its exact semantics there).
; The loop body runs before the counter is tested.
;
; %1 = number of SIMD registers requested from cglobal.
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
shl nd, 3 ; n *= 8: size in bytes of n float pairs
add srcq, nq ; src -> end of input, addressed with a negative offset
neg nq
align 16
.loop:
movaps m0, [srcq+nq]
movaps m1, [srcq+nq+mmsize]
mulps m0, m0 ; square every component
mulps m1, m1
HADDPS m0, m1, m2 ; m2 is passed as the third (scratch) operand
addps m0, [dstq] ; accumulate onto the existing dst values
movaps [dstq], m0
add dstq, mmsize
add nq, mmsize*2 ; two source vectors consumed per dst vector
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3
;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
; float *src1, int n);
;
; Note: the register names below are offset by one from the prototype above:
; src1q is the float-pair input (src0) and src2q is the scalar input (src1).
;
; Multiplies both components of every input pair by the matching scalar:
; dst[i][0..1] = pair[i][0..1] * scalar[i]. Four pairs per iteration; the
; loop body runs before the counter is tested.
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
shl nd, 3 ; n *= 8: size in bytes of n float pairs
add src1q, nq ; pair input and dst are addressed from their end
add dstq, nq
neg nq
align 16
.loop:
movu m0, [src1q+nq] ; pair input is read with unaligned loads
movu m1, [src1q+nq+mmsize]
mova m2, [src2q] ; four scalars s0..s3
mova m3, m2
unpcklps m2, m2 ; m2 = s0 s0 s1 s1
unpckhps m3, m3 ; m3 = s2 s2 s3 s3
mulps m0, m2
mulps m1, m3
mova [dstq+nq], m0
mova [dstq+nq+mmsize], m1
add src2q, mmsize
add nq, mmsize*2
jl .loop
REP_RET
;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;
; Only the first four floats of h and h_step are read here. Per sample the
; coefficients are first advanced by h_step, then (in place):
;   l' = h0 * l + h2 * r
;   r' = h1 * l + h3 * r
; applied to both floats of the pair. One pair of l and r per iteration; the
; loop body runs before the counter is tested.
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [h_stepq]
unpcklps m4, m0, m0 ; m4 = h0 h0 h1 h1
unpckhps m0, m0 ; m0 = h2 h2 h3 h3
unpcklps m5, m1, m1 ; m5/m1: h_step laid out the same way
unpckhps m1, m1
shl nd, 3 ; n *= 8: size in bytes of n float pairs
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m4, m5 ; h += h_step (before use)
addps m0, m1
movddup m2, [lq+nq] ; m2 = l0 l1 l0 l1
movddup m3, [rq+nq] ; m3 = r0 r1 r0 r1
mulps m2, m4
mulps m3, m0
addps m2, m3
movsd [lq+nq], m2 ; low half -> new l
movhps [rq+nq], m2 ; high half -> new r
add nq, 8
jl .loop
REP_RET
;***************************************************************************
;void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;
; Variant of ps_stereo_interpolate that reads both rows of h and h_step
; (offsets 0 and mmsize). Row 0 scales l and r directly; row 1 scales copies
; of l and r whose two floats have been swapped, and those products are merged
; with addsubps - presumably a complex multiply with h[0] as real and h[1] as
; imaginary coefficients (confirm against the C reference).
; One pair of l and r per iteration; the loop body runs before the counter is
; tested.
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
movaps m0, [hq] ; m0 = h row 0
movaps m1, [hq+mmsize] ; m1 = h row 1
; With ARCH_X86_64 the two step vectors are kept in m8/m9; otherwise they are
; used as memory operands inside the loop.
%if ARCH_X86_64
movaps m8, [h_stepq]
movaps m9, [h_stepq+mmsize]
%define H_STEP0 m8
%define H_STEP1 m9
%else
%define H_STEP0 [h_stepq]
%define H_STEP1 [h_stepq+mmsize]
%endif
shl nd, 3 ; n *= 8: size in bytes of n float pairs
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m0, H_STEP0 ; advance both coefficient rows before use
addps m1, H_STEP1
movddup m2, [lq+nq] ; m2 = l0 l1 l0 l1
movddup m3, [rq+nq] ; m3 = r0 r1 r0 r1
shufps m4, m2, m2, q2301 ; m4 = l1 l0 l1 l0
shufps m5, m3, m3, q2301 ; m5 = r1 r0 r1 r0
unpcklps m6, m0, m0 ; row 0: h0 h0 h1 h1
unpckhps m7, m0, m0 ; row 0: h2 h2 h3 h3
mulps m2, m6
mulps m3, m7
unpcklps m6, m1, m1 ; row 1, same layout
unpckhps m7, m1, m1
mulps m4, m6
mulps m5, m7
addps m2, m3
addsubps m2, m4 ; subtract in even lanes, add in odd lanes
addsubps m2, m5
movsd [lq+nq], m2 ; low half -> new l
movhps [rq+nq], m2 ; high half -> new r
add nq, 8
jl .loop
REP_RET
;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;
; Only three arguments are loaded; the len register is overwritten with the
; constant 32 << 3 (256 bytes) and the caller's len value is never read.
;
; Setup: in += i*4 bytes, out += i*256 bytes, then i = 64 - i is the number
; of remaining columns. Columns are consumed one at a time (.loop4) while the
; count is odd, two at a time (.loop8) while bit 1 is set, and four at a time
; (.loop16) afterwards.
; Every outer iteration walks 256 bytes of out in mmsize steps, reading from
; two source pointers that are 38*64*4 bytes apart and advance by 2*len per
; step.
;**********************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
movsxdifnidn iq, id
mov lend, 32 << 3 ; len = 256 bytes
lea inq, [inq+iq*4]
mov tmpd, id
shl tmpd, 8 ; i * 256
add outq, tmpq
mov tmpd, 64
sub tmpd, id
mov id, tmpd ; i = 64 - i
test id, 1
jne .loop4
test id, 2
jne .loop8
align 16
; Four columns per pass.
.loop16:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q ; in1 = in0 + 38*64*4 bytes
mov tmpd, lend
.inner_loop16:
movaps m0, [in0q]
movaps m1, [in1q]
movaps m2, [in0q+lenq]
movaps m3, [in1q+lenq]
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [outq], m0
movaps [outq+lenq], m1
movaps [outq+lenq*2], m2
movaps [outq+3*32*2*4], m3 ; 3*32*2*4 = 3 * 256 bytes
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add inq, 16
add outq, 3*32*2*4 ; skip the three extra rows written above
sub id, 4
jg .loop16
RET
align 16
; Two columns per pass.
.loop8:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop8:
movlps m0, [in0q]
movlps m1, [in1q]
movhps m0, [in0q+lenq]
movhps m1, [in1q+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
movaps [outq], m0
movaps [outq+lenq], m1
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add inq, 8
add outq, lenq ; skip the extra row written above
sub id, 2
jg .loop16 ; bits 0 and 1 of the count are clear from here on
RET
align 16
; One column per pass.
.loop4:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop4:
movss m0, [in0q]
movss m1, [in1q]
movss m2, [in0q+lenq]
movss m3, [in1q+lenq]
movlhps m0, m1
movlhps m2, m3
shufps m0, m2, q2020 ; gather the four scalars into one vector
movaps [outq], m0
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add inq, 4
sub id, 1
test id, 2
jne .loop8
cmp id, 4
jge .loop16
RET
;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;
; Same loop structure as ps_hybrid_analysis_ileave with the data flowing the
; other way: vectors are read linearly from in and scattered to two output
; pointers that are 38*64*4 bytes apart.
; Only three arguments are loaded; the len register is overwritten with the
; constant 32 << 3 (256 bytes) and the caller's len value is never read.
;
; Setup: out += i*4 bytes, in += i*256 bytes, then i = 64 - i is the number
; of remaining columns, consumed 1 (.loop4), 2 (.loop8) or 4 (.loop16) at a
; time.
; Instantiated below for sse and sse4; the sse4 build uses movsd/extractps
; where the sse build uses movlps and shuffles.
;***********************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
movsxdifnidn iq, id
mov lend, 32 << 3 ; len = 256 bytes
lea outq, [outq+iq*4]
mov tmpd, id
shl tmpd, 8 ; i * 256
add inq, tmpq
mov tmpd, 64
sub tmpd, id
mov id, tmpd ; i = 64 - i
test id, 1
jne .loop4
test id, 2
jne .loop8
align 16
; Four columns per pass.
.loop16:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q ; out1 = out0 + 38*64*4 bytes
mov tmpd, lend
.inner_loop16:
movaps m0, [inq]
movaps m1, [inq+lenq]
movaps m2, [inq+lenq*2]
movaps m3, [inq+3*32*2*4] ; 3*32*2*4 = 3 * 256 bytes
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [out0q], m0
movaps [out1q], m1
movaps [out0q+lenq], m2
movaps [out1q+lenq], m3
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add outq, 16
add inq, 3*32*2*4 ; skip the three extra rows read above
sub id, 4
jg .loop16
RET
align 16
; Two columns per pass.
.loop8:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop8:
movaps m0, [inq]
movaps m1, [inq+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
MOVH [out0q], m0
MOVH [out1q], m1
movhps [out0q+lenq], m0
movhps [out1q+lenq], m1
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add outq, 8
add inq, lenq ; skip the extra row read above
sub id, 2
jg .loop16 ; bits 0 and 1 of the count are clear from here on
RET
align 16
; One column per pass: the four lanes of m0 go to out0, out1, out0+len and
; out1+len respectively.
.loop4:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop4:
movaps m0, [inq]
movss [out0q], m0 ; lane 0
%if cpuflag(sse4)
extractps [out1q], m0, 1
extractps [out0q+lenq], m0, 2
extractps [out1q+lenq], m0, 3
%else
movhlps m1, m0 ; m1[0] = lane 2
movss [out0q+lenq], m1
shufps m0, m0, 0xb1 ; swap lanes within each pair: 1 0 3 2
movss [out1q], m0 ; original lane 1
movhlps m1, m0 ; m1[0] = original lane 3
movss [out1q+lenq], m1
%endif
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add outq, 4
sub id, 1
test id, 2
jne .loop8
cmp id, 4
jge .loop16
RET
%endmacro
INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
; ptrdiff_t stride, int n);
;*******************************************************************
; One step of the ps_hybrid_analysis inner product.
; %1, %2 - destination registers for the two partial products
; %3     - step index (0, 1 or 2); selects the input and filter vectors
;
; Loads one vector from the front of in (offset mmsize*%3) and one from the
; mirrored position (offset mmsize*(5-%3)+8), combines them and multiplies by
; the pair-swapped filter vector. For %3 != 0 the products are accumulated
; into m3 (from %2) and m0 (from %1); for %3 == 0 they are simply left in
; %1/%2.
; Uses m1, m2 and m4 as scratch; the non-SSE3 path also expects the sign mask
; in m7.
%macro PS_HYBRID_ANALYSIS_LOOP 3
movu %1, [inq+mmsize*%3]
movu m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
pshufd %2, %1, q2301
pshufd m4, m1, q0123
pshufd m1, m1, q1032
pshufd m2, [filterq+nq+mmsize*%3], q2301
addsubps %2, m4
addsubps %1, m1
%else
mova m2, [filterq+nq+mmsize*%3]
mova %2, %1
mova m4, m1
shufps %2, %2, q2301
shufps m4, m4, q0123
shufps m1, m1, q1032
shufps m2, m2, q2301
; Without addsubps: flip the sign of lanes 1 and 3 via m7, then subtract.
xorps m4, m7
xorps m1, m7
subps %2, m4
subps %1, m1
%endif
mulps %2, m2
mulps %1, m2
%if %3
addps m3, %2
addps m0, %1
%endif
%endmacro
; Emits ps_hybrid_analysis for the currently selected instruction set.
; For each of the n filters (64 bytes each) it runs three
; PS_HYBRID_ANALYSIS_LOOP steps over in, reduces the partial sums, adds the
; pair at in + 6*8 scaled by the float at filter + 8*6, and stores one float
; pair to out, which then advances by stride float pairs.
; The loop body runs before the counter is tested.
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
shl strideq, 3 ; stride *= 8: float pairs -> bytes
shl nd, 6 ; n *= 64: bytes per filter
add filterq, nq ; filter -> end, addressed with a negative offset
neg nq
mova m7, [ps_p1m1p1m1] ; sign mask for lanes 1 and 3
align 16
.loop:
PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
; Horizontal reduction of the accumulators m0 and m3 into m1.
%if cpuflag(sse3)
pshufd m3, m3, q2301
xorps m0, m7
hsubps m3, m0
pshufd m1, m3, q0020
pshufd m3, m3, q0031
addps m1, m3
movsd m2, [inq+6*8]
%else
mova m1, m3
mova m2, m0
shufps m1, m1, q2301
shufps m2, m2, q2301
subps m1, m3
addps m2, m0
unpcklps m3, m1, m2
unpckhps m1, m2
addps m1, m3
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
movss m3, [filterq+nq+8*6]
SPLATD m3
mulps m2, m3
addps m1, m2
MOVH [outq], m1 ; store one float pair
add outq, strideq
add nq, 64
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS

View file

@ -0,0 +1,385 @@
;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "config.asm"
%include "libavutil/x86/x86util.asm"
%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif
SECTION_RODATA 64
; All-but-sign-bit mask, eight dwords.
const_float_abs_mask: times 8 dd 0x7fffffff
; Eight zero dwords. PVQ_FAST_SEARCH loads an unaligned vector that ends up to
; mmsize-1 bytes into this table, i.e. a window that straddles the end of
; const_float_abs_mask and the start of these zeros, so this label must stay
; immediately after const_float_abs_mask.
const_align_abs_edge: times 8 dd 0
const_float_0_5: times 8 dd 0.5
const_float_1: times 8 dd 1.0
; Sign-bit-only mask, eight dwords.
const_float_sign_mask: times 8 dd 0x80000000
; Byte offset of each dword lane from the start of the table: 0, 4, ..., 28.
const_int32_offsets:
%rep 8
dd $-const_int32_offsets
%endrep
SECTION .text
;
; Setup High Register to be used
; for holding memory constants
;
; %1 - the move instruction used to load the constant
; %2 - the register to be used, assumes it is >= mm8
; %3 - name of the constant.
;
; Defines mm_<const_name>. Subsequent opcodes are going to use the constant
; in the form "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" when num_mmregs <= 8 (32 bit arch) or
; "addps m0, m8" when num_mmregs > 8 (64 bit arch)
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
%define mm_%3 %2
%{1} %2, [%3] ; movaps m8, [const_name]
%else
%define mm_%3 [%3]
%endif
%endmacro
;
; Set Position Independent Code
; Base address of a constant
; %1 - the instruction used to load the address (e.g. lea)
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Defines pic_base_<const_name>. Subsequent opcodes are going to use the base
; address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if text relocations are used
%macro SET_PIC_BASE 3; op, reg, const_label
%ifdef PIC
%{1} %2, [%3] ; lea r5, [rip+const]
%define pic_base_%3 %2
%else
%define pic_base_%3 %3
%endif
%endmacro
; Adds (%1 = add) or removes (%1 = sub) one pulse.
; Scans tmpY/tmpX in mmsize steps for the position with the largest score p,
; reduces the per-lane maxima to a single index, then updates Sxy (m7),
; Syy (m6) and Y[max_idx] by +/- 1.0.
; Expects tmpX, tmpY, Nd and the mm_const_* names to be defined by the
; enclosing function. Uses m0-m5 and r4 as scratch.
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
pxor m1, m1 ; max_idx
xorps m3, m3 ; p_max
xor r4d, r4d
align 16
%%distortion_search:
movd xm2, dword r4d ; movd zero extends
%ifidn %1,add
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
%if USE_APPROXIMATION == 1
xorps m0, m0
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
%endif
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
%if USE_APPROXIMATION == 1
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
%endif
%else
movaps m5, [tmpY + r4] ; m5 = y[i]
xorps m0, m0 ; m0 = 0;
cmpps m0, m0, m5, 1 ; m0 = (0<y)
subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
subps m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
andps m5, m0 ; (0<y)?m5:0
%endif
%if USE_APPROXIMATION == 1
rsqrtps m4, m4
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
mulps m5, m5
divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
VPBROADCASTD m2, xm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later)
cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
maxps m3, m5 ; m3=max(p_max,p)
; maxps here is faster than blendvps, despite blend having lower latency.
pand m2, m0 ; This version seems faster than sse41 pblendvb
pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4
add r4d, mmsize
cmp r4d, Nd
jb %%distortion_search
por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing
%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)
vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
BLENDVPS xm3, xm5, xm0 ; p = m0 ? p[1x128] : p[0x128]
PBLENDVB xm1, xm2, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
%endif
; Merge parallel maximums round 4 (2 vs 2)
; m3=p[3210]
movhlps xm5, xm3 ; m5=p[xx32]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
pshufd xm2, xm1, q3232
BLENDVPS xm3, xm5, xm0 ; p = m0 ? p[3,2] : p[1,0]
PBLENDVB xm1, xm2, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
; Merge parallel maximums final round (1 vs 1)
shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
cmpss xm0, xm3, 5 ; m0 = !(m0 < m3) = !( p[1] < p[0] )
pshufd xm2, xm1, q1111
PBLENDVB xm1, xm2, xm0 ; max_idx = m0 ? max_idx[1] : max_idx[0]
movd dword r4d, xm1 ; zero extends to the rest of r4q
VBROADCASTSS m3, [tmpX + r4]
%{1}ps m7, m3 ; Sxy +/-= X[max_idx]
VBROADCASTSS m5, [tmpY + r4]
%{1}ps m6, m5 ; Syy +/-= Y[max_idx]
; We have to update a single element in Y[i]
; However writing 4 bytes and then doing 16 byte load in the inner loop
; could cause a stall due to breaking write forwarding.
VPBROADCASTD m1, xm1
pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it
and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load
movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0]
andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
%{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0;
%endmacro
;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal
;
; num_pic_regs is 1 when PIC is defined and 0 otherwise; PVQ_FAST_SEARCH adds
; it to the general-purpose register count it passes to cglobal.
%ifdef PIC
%define num_pic_regs 1
%else
%define num_pic_regs 0
%endif
;
; Pyramid Vector Quantization Search implementation
;
; float * inX - Unaligned (SIMD) access, it will be overread,
; but extra data is masked away.
; int32 * outY - Should be aligned and padded buffer.
; It is used as temp buffer.
; uint32 K - Number of pulses to have after quantizations.
; uint32 N - Number of vector elements. Must be 0 < N < 256
;
; %1 - suffix appended to the function name (pvq_search%1).
; Reserves 256*4 bytes of stack for tmpX (the absolute values of inX); outY
; doubles as tmpY (float) until the final conversion to integers.
; Returns Syy as a float (see %%return).
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq
movaps m0, [const_float_abs_mask]
shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
mov r4d, Nd
neg r4d
and r4d, mmsize-1 ; r4 = number of padding bytes needed to reach a multiple of mmsize
SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
movups m2, [pic_base_const_align_abs_edge + r4 - mmsize] ; abs mask for the valid lanes, zero for the padding lanes
add Nd, r4d ; N = align(N, mmsize)
lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
movups m1, [inXq + r4]
andps m1, m2
movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] )
align 16
%%loop_abs_sum:
sub r4d, mmsize
jc %%end_loop_abs_sum
movups m2, [inXq + r4]
andps m2, m0
movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
addps m1, m2 ; Sx += abs(X[i])
jmp %%loop_abs_sum
align 16
%%end_loop_abs_sum:
HSUMPS m1, m2 ; m1 = Sx
xorps m0, m0
comiss xm0, xm1 ;
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
%if USE_APPROXIMATION == 1
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
%else
divss xm0, xm1 ; b = K/Sx
; b = K/max_x
%endif
VBROADCASTSS m0, xm0
lea r4d, [Nd - mmsize]
pxor m5, m5 ; Sy ( Sum of abs( y[i]) )
xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
align 16
; Initial guess: y[i] = round(K/Sx * X[i]), accumulating Sy, Syy and Sxy.
%%loop_guess:
movaps m1, [tmpX + r4] ; m1 = X[i]
mulps m2, m0, m1 ; m2 = res*X[i]
cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
paddd m5, m2 ; Sy += yt
cvtdq2ps m2, m2 ; yt = (float)yt
mulps m1, m2 ; m1 = X[i]*yt
movaps [tmpY + r4], m2 ; y[i] = m2
addps m7, m1 ; Sxy += m1;
mulps m2, m2 ; m2 = yt*yt
addps m6, m2 ; Syy += m2
sub r4d, mmsize
jnc %%loop_guess
HSUMPS m6, m1 ; Syy_norm
HADDD m5, m4 ; pulses
movd dword r4d, xm5 ; zero extends to the rest of r4q
sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
jz %%finish ; K - pulses == 0
SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1
SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in distortion parameter calculations.
; Saves pre and post-calculation to correct Y[] values.
; Same precision, since float mantissa is normalized.
; The SQRT approximation does differ.
HSUMPS m7, m0 ; Sxy_norm
mulps m6, mm_const_float_0_5
; NOTE(review): this branch consumes the carry flag produced by "sub Kd, r4d"
; above, so nothing between the two may modify EFLAGS - confirm that the
; HSUMPS and SET_HI_REG_MM_CONSTANT expansions are flag-neutral.
jc %%remove_pulses_loop ; K - pulses < 0
align 16 ; K - pulses > 0
%%add_pulses_loop:
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
sub Kd, 1
jnz %%add_pulses_loop
addps m6, m6 ; Syy*=2
jmp %%finish
align 16
%%remove_pulses_loop:
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
add Kd, 1
jnz %%remove_pulses_loop
addps m6, m6 ; Syy*=2
align 16
; Copy the sign of X[i] onto Y[i] and convert the result to int32 in outY.
%%finish:
lea r4d, [Nd - mmsize]
movaps m2, [const_float_sign_mask]
align 16
%%restore_sign_loop:
movaps m0, [tmpY + r4] ; m0 = Y[i]
movups m1, [inXq + r4] ; m1 = X[i]
andps m1, m2 ; m1 = sign(X[i])
orps m0, m1 ; m0 = Y[i]*sign
cvtps2dq m3, m0 ; m3 = (int)m0
movaps [outYq + r4], m3
sub r4d, mmsize
jnc %%restore_sign_loop
; Without ARCH_X86_64 the float result is spilled to memory and loaded with
; fld; otherwise it is left in m0.
%%return:
%if ARCH_X86_64 == 0 ; sbrdsp
movss r0m, xm6 ; return (float)Syy_norm
fld dword r0m
%else
movaps m0, m6 ; return (float)Syy_norm
%endif
RET
align 16
; Sx == 0: fill outY with zeros and return 1.0.
%%zero_input:
lea r4d, [Nd - mmsize]
xorps m0, m0
%%zero_loop:
movaps [outYq + r4], m0
sub r4d, mmsize
jnc %%zero_loop
movaps m6, [const_float_1]
jmp %%return
%endmacro
; Instantiations of PVQ_FAST_SEARCH. USE_APPROXIMATION is read while the macro
; is expanded, so it has to be (re)defined before each group.
;
; if 1, use a float op that give half precision but execute for around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; that makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
%define USE_APPROXIMATION 1
INIT_XMM sse2
PVQ_FAST_SEARCH _approx
INIT_XMM sse4
PVQ_FAST_SEARCH _approx
; Exact variant: uses mulps/divps and divss in place of the rsqrtps/rcpss
; approximations.
%define USE_APPROXIMATION 0
INIT_XMM avx
PVQ_FAST_SEARCH _exact

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,221 @@
;******************************************************************************
;* SIMD optimized non-power-of-two MDCT functions
;*
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; Constants for the FFT15 / MDCT15 kernels; the section is 32-byte aligned so
; the 32-byte tables can be read with aligned YMM loads.
SECTION_RODATA 32
; Lane-index vectors for vpermps in the AVX2 build of mdct15_postreindex:
; they reorder the haddps results into interleaved re/im order
; (perm_neg for the ascending-offset half, perm_pos for the descending one).
perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
; Sign-bit mask on every even dword; XORed into the products in
; mdct15_postreindex to negate those lanes.
sign_adjust_r: times 4 dd 0x80000000, 0x00000000
; Sign-bit mask on dwords 1 and 2; loaded into xm7 by fft15 for use in FFT5.
sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
SECTION .text
%if ARCH_X86_64
;*****************************************************************************************
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
;*****************************************************************************************
; FFT5: 5-point transform of five complex inputs spaced 3 elements apart.
; Arguments: arg 1 = byte offset added to every input address,
;            arg 2 = xmm register receiving the DC term (duplicated in both halves),
;            arg 3 = register number receiving the four AC terms (as a ymm).
; Expects: inq = input pointer, xm5/xm6 = twiddle constants, xm7 = sign mask
;          (all three are loaded by fft15 before the first use).
; Clobbers: m0-m4.
; The lane comments below name the inputs for offset 0 (in[0], in[3], in[6],
; in[9], in[12]); a non-zero offset shifts every index accordingly.
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
VBROADCASTSD m0, [inq + %1] ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
movsd xm1, [inq + 1*16 + 8 + %1] ; in[ 3].re, in[ 3].im, 0, 0
movsd xm4, [inq + 6*16 + 0 + %1] ; in[12].re, in[12].im, 0, 0
movhps xm1, [inq + 3*16 + 0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
movhps xm4, [inq + 4*16 + 8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
subps xm2, xm1, xm4 ; t[2].im, t[2].re, t[3].im, t[3].re
addps xm1, xm4 ; t[0].re, t[0].im, t[1].re, t[1].im
; DC term: sum of the two pair sums plus the first input.
movhlps %2, xm1 ; t[0].re, t[1].re, t[0].im, t[1].im
addps %2, xm1
addps %2, xm0 ; DC[0].re, DC[0].im, junk...
movlhps %2, %2 ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
shufps xm3, xm1, xm2, q0110 ; t[0].re, t[0].im, t[2].re, t[2].im
shufps xm1, xm2, q2332 ; t[1].re, t[1].im, t[3].re, t[3].im
; Scale by the two twiddle constants (xm5, xm6) and combine.
mulps xm%3, xm1, xm5
mulps xm4, xm3, xm6
mulps xm1, xm6
xorps xm1, xm7
mulps xm3, xm5
addsubps xm3, xm1 ; t[0].re, t[0].im, t[2].re, t[2].im
subps xm%3, xm4 ; t[4].re, t[4].im, t[5].re, t[5].im
movhlps xm2, xm%3, xm3 ; t[2].re, t[2].im, t[5].re, t[5].im
movlhps xm3, xm%3 ; t[0].re, t[0].im, t[4].re, t[4].im
xorps xm2, xm7
addps xm%3, xm2, xm3
subps xm3, xm2
shufps xm3, xm3, q1032
; Pack the sum (low lane) and the swapped difference (high lane) into one ymm.
vinsertf128 m%3, m%3, xm3, 1 ; All ACs (tmp[1] through to tmp[4])
addps m%3, m%3, m0 ; Finally offset with DCs
%endmacro
; BUTTERFLIES_DC: combines the three FFT5 DC terms into one complex output.
; xm9 and xm10 are multiplied by two 16-byte exptab entries at the given
; offset, reduced horizontally, added to xm8, and the low 8 bytes (one
; complex value) are stored at [outq].
; Clobbers: xm0, xm1.
%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
mulps xm0, xm9, [exptabq + %1 + 16*0]
mulps xm1, xm10, [exptabq + %1 + 16*1]
haddps xm0, xm1
movhlps xm1, xm0 ; t[0].re, t[1].re, t[0].im, t[1].im
addps xm0, xm1
addps xm0, xm8
movsd [outq], xm0
%endmacro
; BUTTERFLIES_AC: combines the three FFT5 AC vectors into four complex outputs.
; m12 and m13 are each multiplied by two exptab vectors at the given offset,
; the products are summed with m11, and the four resulting complex values are
; stored at outq + 1, 2, 3 and 4 strides.
; Expects: strideq in bytes and stride3q = 3*strideq (set up by fft15).
; Clobbers: m0-m3.
%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
mulps m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
addps m0, m0, m2
addps m1, m1, m3
addps m0, m0, m11
; Swap adjacent lanes of the second partial sum before accumulating it.
shufps m1, m1, m1, q2301
addps m0, m0, m1
; Scatter the four 8-byte results: low lane to strides 1-2, high lane to 3-4.
vextractf128 xm1, m0, 1
movlps [outq + strideq*1], xm0
movhps [outq + strideq*2], xm0
movlps [outq + stride3q], xm1
movhps [outq + strideq*4], xm1
%endmacro
; fft15 (AVX): 15-point FFT built from three 5-point transforms followed by
; three rounds of butterflies, each round writing 5 strided complex outputs.
; Register use: xm5/xm6 = constants read from exptab + 480, xm7 = sign mask,
; xm8..xm10 = FFT5 DC terms, m11..m13 = FFT5 AC terms.
INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
; Scale stride by 8, the size of one complex value as stored by the macros.
shl strideq, 3
movaps xm5, [exptabq + 480 + 16*0]
movaps xm6, [exptabq + 480 + 16*1]
movaps xm7, [sign_adjust_5]
; Three 5-point transforms over inputs starting at byte offsets 0, 8 and 16.
FFT5 0, xm8, 11
FFT5 8, xm9, 12
FFT5 16, xm10, 13
; The input pointer is no longer needed after the FFT5s, so its register is
; reused to hold 3*stride.
%define stride3q inq
lea stride3q, [strideq + strideq*2]
lea stride5q, [strideq + strideq*4]
; Each DC/AC pair writes outputs at strides 0..4; then out advances 5 strides.
BUTTERFLIES_DC (8*6 + 4*0)*2*4
BUTTERFLIES_AC (8*0 + 0*0)*2*4
add outq, stride5q
BUTTERFLIES_DC (8*6 + 4*1)*2*4
BUTTERFLIES_AC (8*2 + 0*0)*2*4
add outq, stride5q
BUTTERFLIES_DC (8*6 + 4*2)*2*4
BUTTERFLIES_AC (8*4 + 0*0)*2*4
RET
%endif ; ARCH_X86_64
;*******************************************************************************************************
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
;*******************************************************************************************************
; LUT_LOAD_4D: gathers complex values from in[] through the index table lut[].
; Arguments: arg 1 = destination vector register, arg 2 = scratch xmm register
;            (used only in the AVX2 build), arg 3 = name of the register
;            holding the lut index of the first element.
; Loads in[lut[i]] and in[lut[i+1]] into the low 128 bits; the AVX2 build also
; loads in[lut[i+2]] and in[lut[i+3]] into the upper 128 bits.
; Clobbers: r4.
%macro LUT_LOAD_4D 3
mov r4d, [lutq + %3q*4 + 0]
movsd xmm%1, [inq + r4q*8]
mov r4d, [lutq + %3q*4 + 4]
movhps xmm%1, [inq + r4q*8]
%if cpuflag(avx2)
mov r4d, [lutq + %3q*4 + 8]
movsd %2, [inq + r4q*8]
mov r4d, [lutq + %3q*4 + 12]
movhps %2, [inq + r4q*8]
vinsertf128 %1, %1, %2, 1
%endif
%endmacro
; POSTROTATE_FN: generates mdct15_postreindex for the current instruction set.
; The macro argument is the number of complex values handled per vector
; (2 for the XMM build, 4 for the YMM build).
; Two cursors walk towards each other: offset_n starts at 0 and ascends,
; offset_p starts at len8*2 minus the vector width and descends. Each
; iteration gathers in[] through lut[], multiplies by exp[] as complex
; numbers and stores one vector at each cursor. The loop runs while
; offset_n <= offset_p (signed compare).
%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n
xor offset_nq, offset_nq
lea offset_pq, [len8q*2 - %1]
movaps m7, [sign_adjust_r]
%if cpuflag(avx2)
; Permutation indices used to de-interleave the haddps results below.
movaps m8, [perm_pos]
movaps m9, [perm_neg]
%endif
.loop:
movups m0, [expq + offset_pq*8] ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
movups m1, [expq + offset_nq*8] ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im
LUT_LOAD_4D m3, xm4, offset_p ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
LUT_LOAD_4D m4, xm5, offset_n ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im
mulps m5, m3, m0 ; in[p].reim * exp[p].reim
mulps m6, m4, m1 ; in[n].reim * exp[n].reim
xorps m5, m7 ; in[p].re *= -1, in[p].im *= 1
xorps m6, m7 ; in[n].re *= -1, in[n].im *= 1
shufps m3, m3, m3, q2301 ; in[p].imre
shufps m4, m4, m4, q2301 ; in[n].imre
mulps m3, m0 ; in[p].imre * exp[p].reim
mulps m4, m1 ; in[n].imre * exp[n].reim
haddps m3, m6 ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
haddps m5, m4 ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im
%if cpuflag(avx2)
vpermps m3, m9, m3 ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
vpermps m5, m8, m5 ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
%else
; XMM build: a single in-register shuffle puts the lanes in output order.
shufps m3, m3, m3, q0312
shufps m5, m5, m5, q2130
%endif
movups [outq + offset_nq*8], m3
movups [outq + offset_pq*8], m5
; Move both cursors one vector towards each other.
sub offset_pq, %1
add offset_nq, %1
cmp offset_nq, offset_pq
jle .loop
REP_RET
%endmacro
; SSE3 build: 2 complex values per iteration and cursor.
INIT_XMM sse3
POSTROTATE_FN 2
; AVX2 build: 4 complex values per iteration and cursor.
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif

View file

@ -0,0 +1,548 @@
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Sign-bit masks: XORing a float vector with one of these is equivalent to
; multiplying the marked lanes by -1.0 and the others by 1.0.
; ps_mask  flips lanes 0 and 2, ps_mask2 flips lanes 1 and 3,
; ps_mask3 flips lane 3 only.
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_mask3 dd 0, 0, 0, 1<<31
; Phase-sign vectors loaded into m0 by the sbr_hf_apply_noise_N entry points.
; ps_noise0 / ps_noise2 are used directly by variants 0 and 2.
; (The stray trailing comma that used to follow "0.0" on ps_noise0 was removed;
; the emitted data is unchanged.)
ps_noise0 times 2 dd 1.0, 0.0
ps_noise2 times 2 dd -1.0, 0.0
; Three consecutive 16-byte rows. Variant 1 reads row (kx & 1) and variant 3
; reads row 1 + (kx & 1), so the middle row is shared by both.
ps_noise13 dd 0.0, 1.0, 0.0, -1.0
dd 0.0, -1.0, 0.0, 1.0
dd 0.0, 1.0, 0.0, -1.0
; Tables defined elsewhere in the project.
cextern sbr_noise_table
cextern ps_neg
SECTION .text
; sbr_sum_square: returns the sum of squares of a float array.
; r0 = pointer to the data, r1 = length counted in pairs of floats
; (presumably complex values - confirm against the C prototype).
; The main loop consumes 8 pairs (16 floats) per iteration; the tail loop
; consumes 2 pairs (4 floats) per iteration, so an odd leftover pair is not
; processed.
; The result is left in the low lane of m0; on 32-bit x86 it is also pushed
; onto the x87 stack through the r0 stack slot.
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
mov r2d, r1d
; Two independent accumulators (m0, m1) for the unrolled loop.
xorps m0, m0
xorps m1, m1
; r2 = number of 16-float iterations.
sar r2, 3
jz .prepare
.loop:
movu m2, [r0 + 0]
movu m3, [r0 + 16]
movu m4, [r0 + 32]
movu m5, [r0 + 48]
mulps m2, m2
mulps m3, m3
mulps m4, m4
mulps m5, m5
addps m0, m2
addps m1, m3
addps m0, m4
addps m1, m5
add r0, 64
dec r2
jnz .loop
.prepare:
; r1 = number of remaining 4-float iterations.
and r1, 7
sar r1, 1
jz .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
movu m2, [r0]
add r0, 16
mulps m2, m2
; dec sets ZF for the jnz below; the SSE add in between does not touch flags.
dec r1
addps m0, m2
jnz .endloop
.end:
; Horizontal reduction of both accumulators into lane 0 of m0.
addps m0, m1
movhlps m2, m0
addps m0, m2
movss m1, m0
shufps m0, m0, 1
addss m0, m1
%if ARCH_X86_64 == 0
; 32-bit builds return the float via st0: spill to the r0 stack slot and reload.
movss r0m, m0
fld dword r0m
%endif
RET
; STEP is the byte distance between consecutive X_high rows read below
; (40 entries of 2 floats of 4 bytes each).
%define STEP 40*4*2
; sbr_hf_g_filt: multiplies one complex value from each successive X_high row
; by the matching scalar gain and stores the products contiguously.
; r0 = output (8 bytes per element), r1 = X_high base, r2 = gain array
; (4 bytes per element), r3 = element count, r4 = index of the value taken
; from each row. r5 is scratch.
; NOTE(review): when the count is below 4 the code jumps straight to .loop1
; with r5 holding the unmasked count, so a count of 0 would not terminate
; after zero iterations - presumably callers always pass a count >= 1; confirm.
cglobal sbr_hf_g_filt, 5, 6, 5
lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
mov r5, r3
; r3 = count rounded down to a multiple of 4 (the mask also caps it below 256).
and r3, 0xFC
; Point r2/r0 past the vectorised part and index it with a negative counter.
lea r2, [r2 + r3*4]
lea r0, [r0 + r3*8]
neg r3
jz .loop1
.loop4:
; Four gains and four complex values (one per row) per iteration.
movlps m0, [r2 + 4*r3 + 0]
movlps m1, [r2 + 4*r3 + 8]
movlps m2, [r1 + 0*STEP]
movlps m3, [r1 + 2*STEP]
movhps m2, [r1 + 1*STEP]
movhps m3, [r1 + 3*STEP]
; Duplicate each gain so it multiplies both halves of its complex value.
unpcklps m0, m0
unpcklps m1, m1
mulps m0, m2
mulps m1, m3
movu [r0 + 8*r3 + 0], m0
movu [r0 + 8*r3 + 16], m1
add r1, 4*STEP
add r3, 4
jnz .loop4
and r5, 3 ; number of single element loops
jz .end
.loop1: ; element 0 and 1 can be computed at the same time
movss m0, [r2]
movlps m2, [r1]
unpcklps m0, m0
mulps m2, m0
movlps [r0], m2
add r0, 8
add r2, 4
add r1, STEP
dec r5
jnz .loop1
.end:
RET
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
; const float alpha0[2], const float alpha1[2],
; float bw, int start, int end)
;
; Fills X_high[start..end) from X_low using the two complex coefficients
; alpha0*bw and alpha1*bw*bw. Two complex outputs are produced per iteration.
; The aligned loads/stores on X_low and X_high assume the addressed data is
; 16-byte aligned (presumably guaranteed by the caller - confirm).
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
; On these ABIs bw is read from its stack slot; otherwise it is expected to
; be in m0 already.
movss bw, BWm
%endif
movlps m2, [alpha1q]
movlps m1, [alpha0q]
; Broadcast bw to all lanes.
shufps bw, bw, 0
mulps m2, bw ; (a1[0] a1[1])*bw
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
; start and end 6th and 7th args on stack
mov r2d, Sm
mov r3d, Em
DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
movsxd startq, startd
movsxd endq, endd
%endif
sub startq, endq ; neg num of loops
; Both pointers are moved to the end of the range (X_low two entries earlier)
; so the loop can index them with a negative byte offset that counts up to 0.
lea X_highq, [X_highq + endq*2*4]
lea X_lowq, [X_lowq + endq*2*4 - 2*2*4]
shl startq, 3 ; offset from num loops
mova m0, [X_lowq + startq]
; m3/m4 = imaginary parts broadcast with the sign flipped on lanes 0 and 2,
; m1/m2 = real parts broadcast.
shufps m3, m3, q1111
shufps m4, m4, q1111
xorps m3, [ps_mask]
shufps m1, m1, q0000
shufps m2, m2, q0000
xorps m4, [ps_mask]
.loop2:
movu m7, [X_lowq + startq + 8] ; BbCc
mova m6, m0
mova m5, m7
shufps m0, m0, q2301 ; aAbB
shufps m7, m7, q2301 ; bBcC
mulps m0, m4
mulps m7, m3
mulps m6, m2
mulps m5, m1
addps m7, m0
; Load the next pair now: it is added to this result and reused as the
; oldest pair in the next iteration.
mova m0, [X_lowq + startq + 16] ; CcDd
addps m7, m0
addps m6, m5
addps m7, m6
mova [X_highq + startq], m7
add startq, 16
jnz .loop2
RET
; sbr_sum64x5: folds five consecutive rows of 64 floats into the first row,
;   z[k] = ((z[k] + z[k+128]) + z[k+256]) + (z[k+64] + z[k+192])   for k in 0..63
; Eight floats are handled per iteration; r1 marks the end of the first row.
; The grouping of the additions is kept exactly as written above so the
; floating-point result is unchanged.
cglobal sbr_sum64x5, 1,2,4,z
    lea        r1q, [zq + 64*4]             ; end of row 0
.loop:
    ; first four floats: rows 1+3 in m1, rows 0+2+4 in m0
    mova        m1, [zq + 1*256]
    addps       m1, [zq + 3*256]
    mova        m0, [zq + 0*256]
    addps       m0, [zq + 2*256]
    addps       m0, [zq + 4*256]
    ; next four floats: rows 1+3 in m3, rows 0+2+4 in m2
    mova        m3, [zq + 1*256 + 16]
    addps       m3, [zq + 3*256 + 16]
    mova        m2, [zq + 0*256 + 16]
    addps       m2, [zq + 2*256 + 16]
    addps       m2, [zq + 4*256 + 16]
    ; merge the partial sums and write them back over row 0
    addps       m0, m1
    addps       m2, m3
    mova      [zq + 0], m0
    mova      [zq + 16], m2
    add         zq, 32
    cmp         zq, r1q
    jne .loop
    REP_RET
; sbr_qmf_post_shuffle: builds W (128 floats) from z (64 floats) by pairing
; values taken from the end of z with values taken from the front.
; zq walks forward from z[0], r2q walks backward from z[60]; each iteration
; reads 4 floats at both cursors and writes 8 interleaved floats to W.
; The back values are XORed with ps_neg (external constant, presumably a
; sign mask - confirm) and reversed before interleaving.
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
lea r2q, [zq + (64-4)*4]
mova m3, [ps_neg]
.loop:
mova m1, [zq]
xorps m0, m3, [r2q]
; Reverse the four back values.
shufps m0, m0, m0, q0123
; Interleave: (back, front) pairs.
unpcklps m2, m0, m1
unpckhps m0, m0, m1
mova [Wq + 0], m2
mova [Wq + 16], m0
add Wq, 32
sub r2q, 16
add zq, 16
; Stop once the two cursors have crossed.
cmp zq, r2q
jl .loop
REP_RET
; sbr_neg_odd_64: flips the sign of every odd-indexed float in a 64-float
; array, in place, by XORing with ps_mask2 (sign bit set in lanes 1 and 3).
; Sixteen floats are processed per iteration; r1 marks the end of the array.
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea        r1q, [zq+256]                ; end pointer: z + 64 floats
.loop:
    ; load four vectors
    mova        m3, [zq+48]
    mova        m2, [zq+32]
    mova        m1, [zq+16]
    mova        m0, [zq+ 0]
    ; toggle the sign bit of lanes 1 and 3 in each
    xorps       m3, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m0, [ps_mask2]
    ; write them back in place
    mova   [zq+48], m3
    mova   [zq+32], m2
    mova   [zq+16], m1
    mova   [zq+ 0], m0
    add         zq, 64
    cmp         zq, r1q
    jne .loop
    REP_RET
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
;
; Butterfly over two 64-float inputs producing 128 floats in v:
;   lower half (v + c, c descending):      src0 block - reversed src1 block
;   upper half (vrev = v + 64, ascending): src1 block + reversed src0 block
; Two vectors are processed per iteration. src0 is read from its end towards
; its start (offset c) while src1 and vrev advance forwards.
; The SSE2 build reverses lanes with pshufd, the SSE build with shufps.
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
mova m4, [src0q+cq+mmsize]
mova m5, [src1q+mmsize]
; m2, m3, m6, m7 = lane-reversed copies of m0, m1, m4, m5.
%if cpuflag(sse2)
pshufd m2, m0, q0123
pshufd m3, m1, q0123
pshufd m6, m4, q0123
pshufd m7, m5, q0123
%else
shufps m2, m0, m0, q0123
shufps m3, m1, m1, q0123
shufps m6, m4, m4, q0123
shufps m7, m5, m5, q0123
%endif
; Each vector is combined with the reversed vector from the opposite input
; and the opposite position within the pair.
addps m5, m2
subps m0, m7
addps m1, m6
subps m4, m3
mova [vrevq], m1
mova [vrevq+mmsize], m5
mova [vq+cq], m0
mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
; Loop until the descending offset goes negative.
sub cq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
SBR_QMF_DEINT_BFLY
; sbr_qmf_pre_shuffle: fills z[64..] from the first 64 floats of z.
; r1q reads forward starting at z[33]; r3q is a descending byte offset used
; both to read z[r3/4 + 1 ..] and to address the output at z[64] + 2*r3.
; The forward-read values are XORed with ps_neg (external constant,
; presumably a sign mask - confirm) and lane-reversed, then interleaved with
; the other stream by SBUTTERFLY (macro from x86util.asm; presumably a
; dword unpack low/high pair - confirm).
; After the loop the first two floats of z are copied to z[64..65].
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
mov r3q, OFFSET
lea r1q, [zq + (32+1)*4]
lea r2q, [zq + 64*4]
mova m5, [ps_neg]
.loop:
movu m0, [r1q]
movu m2, [r1q + mmsize]
movu m1, [zq + r3q + 4 + mmsize]
movu m3, [zq + r3q + 4]
pxor m2, m5
pxor m0, m5
; Reverse the lane order of the forward-read vectors.
pshufd m2, m2, q0123
pshufd m0, m0, q0123
SBUTTERFLY dq, 2, 3, 4
SBUTTERFLY dq, 0, 1, 4
; Each 2-vector input step yields four output vectors, hence the 2*r3 offset.
mova [r2q + 2*r3q + 0*mmsize], m2
mova [r2q + 2*r3q + 1*mmsize], m3
mova [r2q + 2*r3q + 2*mmsize], m0
mova [r2q + 2*r3q + 3*mmsize], m1
add r1q, 2*mmsize
sub r3q, 2*mmsize
jge .loop
; Copy z[0..1] (8 bytes) to z[64..65].
movq m2, [zq]
movq [r2q], m2
REP_RET
; Addressing helpers for the sbr_hf_apply_noise_N functions.
; PIC builds cannot use a table symbol as a displacement next to an index
; register, so one extra GPR (NREGS = 1) is reserved to hold the table
; address. Non-PIC builds use the symbol directly and need no extra register.
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
; LOAD_NST: loads m0 with the 16-byte row of the given table selected by the
; byte offset in kxq. In PIC builds NOISE_TABLE is used as a temporary for
; the table address.
%macro LOAD_NST 1
%ifdef PIC
lea NOISE_TABLE, [%1]
mova m0, [kxq + NOISE_TABLE]
%else
mova m0, [kxq + %1]
%endif
%endmacro
; Four entry points share one body (apply_noise_main). Each entry point only
; selects the phase-sign vector in m0 and then jumps (or, for variant 3,
; falls through) into the shared code:
;   variant 0: ps_noise0            variant 2: ps_noise2
;   variant 1: ps_noise13 row (kx & 1)
;   variant 3: ps_noise13 row 1 + (kx & 1)
; The shared loop computes, four elements per iteration,
;   Y[m] += s_m[m] * phi_sign + (s_m[m] == 0 ? q_filt[m] * noise_table[...] : 0)
INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise0]
jmp apply_noise_main
; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
; kx = (kx & 1) * 16: byte offset of the selected 16-byte row.
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13
jmp apply_noise_main
; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise2]
jmp apply_noise_main
; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13+16
; Shared body; expects m0 = phase-sign vector.
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
; m_max lives on the stack here; kx is no longer needed, so its register is
; reloaded with m_max and renamed to count.
mov kxd, m_maxm
DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
movsxdifnidn noiseq, noised
dec noiseq
; count = m_max * 4 (bytes of float data).
shl countd, 2
%ifdef PIC
lea NOISE_TABLE, [sbr_noise_table]
%endif
; Advance all three pointers to the end and iterate with a negative offset.
lea Yq, [Yq + 2*countq]
add s_mq, countq
add q_filtq, countq
; The noise index is kept pre-scaled by 8 bytes per table entry.
shl noiseq, 3
pxor m5, m5
neg countq
.loop:
mova m1, [q_filtq + countq]
movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
; Advance by four table entries and wrap the scaled index with the 0x1ff<<3 mask.
add noiseq, 2*mmsize
and noiseq, 0x1ff<<3
; Duplicate each q_filt value so it scales both halves of a noise entry.
punpckhdq m2, m1, m1
punpckldq m1, m1
mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
mova m3, [s_mq + countq]
; TODO: replace by a vpermd in AVX2
punpckhdq m4, m3, m3
punpckldq m3, m3
; Masks are all-ones where s_m is bitwise zero; they gate the noise term.
pcmpeqd m6, m3, m5 ; m6 == 0
pcmpeqd m7, m4, m5 ; m7 == 0
mulps m3, m0 ; s_m[m] * phi_sign
mulps m4, m0 ; s_m[m] * phi_sign
pand m1, m6
pand m2, m7
movu m6, [Yq + 2*countq]
movu m7, [Yq + 2*countq + mmsize]
addps m3, m1
addps m4, m2
addps m6, m3
addps m7, m4
movu [Yq + 2*countq], m6
movu [Yq + 2*countq + mmsize], m7
add countq, mmsize
jl .loop
RET
; sbr_qmf_deint_neg: de-interleaves 64 source floats into v.
;   v[0..31]  receives the odd-indexed source values in reverse order
;   v[32..63] receives the even-indexed source values XORed with ps_neg
;             (external constant, presumably a sign mask - confirm)
; cq is a negative byte offset counting up to zero; vq walks backwards
; through the lower half while vrevq + cq walks forwards through the upper.
INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  (32*4)
%define OFFSET (32*4)
    mova        m3, [ps_neg]
    lea      vrevq, [vq + OFFSET + COUNT]   ; one past the upper half of v
    add       srcq, 2*COUNT                 ; one past the end of src
    add         vq, OFFSET-mmsize           ; last vector of the lower half
    mov         cq, -COUNT
.loop:
    mova        m1, [srcq + 2*cq + 1*mmsize]
    mova        m0, [srcq + 2*cq + 0*mmsize]
    shufps      m2, m0, m1, q2020           ; even-indexed values, in order
    shufps      m1, m0, q1313               ; odd-indexed values, reversed
    mova      [vq], m1
    xorps       m2, m3
    sub         vq, mmsize
    mova [vrevq + cq], m2
    add         cq, mmsize                  ; sets the flags tested by jl
    jl .loop
    REP_RET
; SBR_AUTOCORRELATE: generates sbr_autocorrelate for the current instruction
; set. x points to complex float samples (8 bytes each), phi receives the
; accumulated products.
; Three running sums are kept as 4-lane vectors: m7 = lag 0 (x[i]*x[i]),
; m6 = lag 1 (x[i]*x[i+1]) and m5 = lag 2 (x[i]*x[i+2]).
; The main loop is unrolled three times and rotates the roles of m0/m1/m2 so
; each sample is loaded only once.
; The x[0] products are parked in the 32 bytes of stack requested from
; cglobal because they are added to only some of the final sums.
; The SSE3 build uses movddup/movsd/movshdup where the SSE build uses
; movlps/movlhps/shufps.
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
; cnt is a negative byte offset relative to x + 37 samples, counting up.
mov cntq, 37*8
add xq, cntq
neg cntq
%if cpuflag(sse3)
%define MOVH movsd
movddup m5, [xq+cntq]
%else
%define MOVH movlps
movlps m5, [xq+cntq]
movlhps m5, m5
%endif
MOVH m7, [xq+cntq+8 ]
MOVH m1, [xq+cntq+16]
shufps m7, m7, q0110
shufps m1, m1, q0110
mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
; Save the x[0] products for the final combination.
movaps [rsp ], m3
movaps [rsp+16], m4
add cntq, 8
MOVH m2, [xq+cntq+16]
movlhps m7, m7
shufps m2, m2, q0110
mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
mulps m4, m7, m2
mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
align 16
.loop:
; Step 1 of 3: current = m1, next = m2, newly loaded = m0.
add cntq, 8
MOVH m0, [xq+cntq+16]
movlhps m1, m1
shufps m0, m0, q0110
mulps m3, m1, m2
mulps m4, m1, m0
mulps m1, m1
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
; Step 2 of 3: current = m2, next = m0, newly loaded = m1.
add cntq, 8
MOVH m1, [xq+cntq+16]
movlhps m2, m2
shufps m1, m1, q0110
mulps m3, m2, m0
mulps m4, m2, m1
mulps m2, m2
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
; Step 3 of 3: current = m0, next = m1, newly loaded = m2.
add cntq, 8
MOVH m2, [xq+cntq+16]
movlhps m0, m0
shufps m2, m2, q0110
mulps m3, m0, m1
mulps m4, m0, m2
mulps m0, m0
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
; The flags tested here come from the last add on cntq; the SSE arithmetic
; in between leaves them untouched.
jl .loop
; Tail: fold in the last two samples and the saved x[0] products.
movlhps m1, m1
mulps m2, m1
mulps m1, m1
addps m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
; Negate lane 3 so the pairwise horizontal add yields a difference there.
xorps m2, [ps_mask3]
xorps m5, [ps_mask3]
xorps m6, [ps_mask3]
HADDPS m2, m5, m3
HADDPS m7, m6, m4
; Sum lanes 0 and 1 of m1 into its low lane.
%if cpuflag(sse3)
movshdup m0, m1
%else
movss m0, m1
shufps m1, m1, q0001
%endif
addss m1, m0
; Scatter the results to byte offsets 0x00, 0x10, 0x18 and 0x28 of phi.
movaps [phiq ], m2
movhps [phiq+0x18], m7
movss [phiq+0x28], m7
movss [phiq+0x10], m1
RET
%endmacro
INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE