mirror of
https://github.com/ossrs/srs.git
synced 2025-03-09 15:49:59 +00:00
parent
4308f238c0
commit
37c84eccc0
28 changed files with 8441 additions and 50 deletions
60
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/aarch64/Makefile
vendored
Normal file
60
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/aarch64/Makefile
vendored
Normal file
|
@ -0,0 +1,60 @@
|
|||
# subsystems
|
||||
OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o
|
||||
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
|
||||
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
|
||||
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
|
||||
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
|
||||
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
|
||||
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
|
||||
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
|
||||
OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o
|
||||
|
||||
# decoders/encoders
|
||||
OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_init_aarch64.o \
|
||||
aarch64/sbrdsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_init.o
|
||||
OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_init.o
|
||||
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
|
||||
OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_10bpp_aarch64.o \
|
||||
aarch64/vp9dsp_init_12bpp_aarch64.o \
|
||||
aarch64/vp9dsp_init_aarch64.o
|
||||
|
||||
# ARMv8 optimizations
|
||||
|
||||
# subsystems
|
||||
ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
|
||||
|
||||
# NEON optimizations
|
||||
|
||||
# subsystems
|
||||
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
|
||||
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
|
||||
aarch64/h264idct_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
|
||||
aarch64/hpeldsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \
|
||||
aarch64/simple_idct_neon.o
|
||||
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
|
||||
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
|
||||
|
||||
# decoders/encoders
|
||||
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
|
||||
NEON-OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
|
||||
aarch64/vp9itxfm_neon.o \
|
||||
aarch64/vp9lpf_16bpp_neon.o \
|
||||
aarch64/vp9lpf_neon.o \
|
||||
aarch64/vp9mc_16bpp_neon.o \
|
||||
aarch64/vp9mc_neon.o
|
199
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/Makefile
vendored
Normal file
199
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/Makefile
vendored
Normal file
|
@ -0,0 +1,199 @@
|
|||
OBJS += x86/constants.o \
|
||||
|
||||
# subsystems
|
||||
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
|
||||
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
|
||||
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
|
||||
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
|
||||
OBJS-$(CONFIG_DCT) += x86/dct_init.o
|
||||
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
|
||||
x86/dirac_dwt_init.o
|
||||
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
|
||||
OBJS-$(CONFIG_FFT) += x86/fft_init.o
|
||||
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
|
||||
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
|
||||
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
|
||||
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
|
||||
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
|
||||
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
|
||||
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
|
||||
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
|
||||
OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
|
||||
OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
|
||||
OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
|
||||
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
|
||||
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
|
||||
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
|
||||
OBJS-$(CONFIG_LPC) += x86/lpc.o
|
||||
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
|
||||
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
|
||||
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
|
||||
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
|
||||
x86/mpegvideodsp.o
|
||||
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
|
||||
x86/mpegvideoencdsp_init.o
|
||||
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
|
||||
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
|
||||
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
|
||||
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
|
||||
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
|
||||
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
|
||||
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
|
||||
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
|
||||
|
||||
# decoders/encoders
|
||||
OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
|
||||
x86/sbrdsp_init.o
|
||||
OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
|
||||
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
|
||||
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
|
||||
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
|
||||
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
|
||||
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
|
||||
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
|
||||
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
|
||||
OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o
|
||||
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o
|
||||
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
|
||||
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
|
||||
OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o
|
||||
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
|
||||
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
|
||||
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
|
||||
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
|
||||
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
|
||||
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
|
||||
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
|
||||
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
|
||||
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
|
||||
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
|
||||
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
|
||||
OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
|
||||
OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o
|
||||
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
|
||||
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
|
||||
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
|
||||
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
|
||||
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
|
||||
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
|
||||
x86/vp9dsp_init_10bpp.o \
|
||||
x86/vp9dsp_init_12bpp.o \
|
||||
x86/vp9dsp_init_16bpp.o
|
||||
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
|
||||
|
||||
|
||||
# GCC inline assembly optimizations
|
||||
# subsystems
|
||||
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
|
||||
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o
|
||||
|
||||
# decoders/encoders
|
||||
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
|
||||
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
|
||||
|
||||
# subsystems
|
||||
X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
|
||||
x86/ac3dsp_downmix.o
|
||||
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
|
||||
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
|
||||
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
|
||||
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
|
||||
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
|
||||
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
|
||||
x86/h264_chromamc_10bit.o
|
||||
X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
|
||||
x86/h264_deblock_10bit.o \
|
||||
x86/h264_idct.o \
|
||||
x86/h264_idct_10bit.o \
|
||||
x86/h264_weight.o \
|
||||
x86/h264_weight_10bit.o
|
||||
X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
|
||||
x86/h264_intrapred_10bit.o
|
||||
X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
|
||||
x86/h264_qpel_10bit.o \
|
||||
x86/fpel.o \
|
||||
x86/qpel.o
|
||||
X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
|
||||
x86/hpeldsp.o
|
||||
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
|
||||
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
|
||||
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
|
||||
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
|
||||
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
|
||||
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o
|
||||
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
|
||||
x86/fpel.o \
|
||||
x86/qpel.o
|
||||
X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
|
||||
x86/vc1dsp_mc.o
|
||||
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \
|
||||
x86/simple_idct.o
|
||||
X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
|
||||
X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
|
||||
x86/vp8dsp_loopfilter.o
|
||||
|
||||
# decoders/encoders
|
||||
X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
|
||||
x86/sbrdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
|
||||
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
|
||||
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
|
||||
x86/dirac_dwt.o
|
||||
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
|
||||
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
|
||||
ifdef CONFIG_GPL
|
||||
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
|
||||
endif
|
||||
X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
|
||||
x86/hevc_deblock.o \
|
||||
x86/hevc_idct.o \
|
||||
x86/hevc_mc.o \
|
||||
x86/hevc_sao.o \
|
||||
x86/hevc_sao_10bit.o
|
||||
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
|
||||
X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
|
||||
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
|
||||
X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o
|
||||
X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
|
||||
X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
|
||||
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
|
||||
X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
|
||||
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
|
||||
x86/vp9intrapred_16bpp.o \
|
||||
x86/vp9itxfm.o \
|
||||
x86/vp9itxfm_16bpp.o \
|
||||
x86/vp9lpf.o \
|
||||
x86/vp9lpf_16bpp.o \
|
||||
x86/vp9mc.o \
|
||||
x86/vp9mc_16bpp.o
|
||||
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
|
86
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacencdsp.asm
vendored
Normal file
86
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacencdsp.asm
vendored
Normal file
|
@ -0,0 +1,86 @@
|
|||
;******************************************************************************
|
||||
;* SIMD optimized AAC encoder DSP functions
|
||||
;*
|
||||
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
float_abs_mask: times 4 dd 0x7fffffff
|
||||
|
||||
SECTION .text
|
||||
|
||||
;*******************************************************************
|
||||
;void ff_abs_pow34(float *out, const float *in, const int size);
|
||||
;*******************************************************************
|
||||
INIT_XMM sse
|
||||
cglobal abs_pow34, 3, 3, 3, out, in, size
|
||||
mova m2, [float_abs_mask]
|
||||
shl sizeq, 2
|
||||
add inq, sizeq
|
||||
add outq, sizeq
|
||||
neg sizeq
|
||||
.loop:
|
||||
andps m0, m2, [inq+sizeq]
|
||||
sqrtps m1, m0
|
||||
mulps m0, m1
|
||||
sqrtps m0, m0
|
||||
mova [outq+sizeq], m0
|
||||
add sizeq, mmsize
|
||||
jl .loop
|
||||
RET
|
||||
|
||||
;*******************************************************************
|
||||
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
|
||||
; int size, int is_signed, int maxval, const float Q34,
|
||||
; const float rounding)
|
||||
;*******************************************************************
|
||||
INIT_XMM sse2
|
||||
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
|
||||
%if UNIX64 == 0
|
||||
movss m0, Q34m
|
||||
movss m1, roundingm
|
||||
cvtsi2ss m3, dword maxvalm
|
||||
%else
|
||||
cvtsi2ss m3, maxvald
|
||||
%endif
|
||||
shufps m0, m0, 0
|
||||
shufps m1, m1, 0
|
||||
shufps m3, m3, 0
|
||||
shl is_signedd, 31
|
||||
movd m4, is_signedd
|
||||
shufps m4, m4, 0
|
||||
shl sized, 2
|
||||
add inq, sizeq
|
||||
add outq, sizeq
|
||||
add scaledq, sizeq
|
||||
neg sizeq
|
||||
.loop:
|
||||
mulps m2, m0, [scaledq+sizeq]
|
||||
addps m2, m1
|
||||
minps m2, m3
|
||||
andps m5, m4, [inq+sizeq]
|
||||
orps m2, m5
|
||||
cvttps2dq m2, m2
|
||||
mova [outq+sizeq], m2
|
||||
add sizeq, mmsize
|
||||
jl .loop
|
||||
RET
|
487
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacpsdsp.asm
vendored
Normal file
487
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacpsdsp.asm
vendored
Normal file
|
@ -0,0 +1,487 @@
|
|||
;******************************************************************************
|
||||
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
|
||||
;*
|
||||
;* Copyright (C) 2015 James Almer
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
|
||||
|
||||
SECTION .text
|
||||
|
||||
;*************************************************************************
|
||||
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
|
||||
;*************************************************************************
|
||||
%macro PS_ADD_SQUARES 1
|
||||
cglobal ps_add_squares, 3, 3, %1, dst, src, n
|
||||
shl nd, 3
|
||||
add srcq, nq
|
||||
neg nq
|
||||
|
||||
align 16
|
||||
.loop:
|
||||
movaps m0, [srcq+nq]
|
||||
movaps m1, [srcq+nq+mmsize]
|
||||
mulps m0, m0
|
||||
mulps m1, m1
|
||||
HADDPS m0, m1, m2
|
||||
addps m0, [dstq]
|
||||
movaps [dstq], m0
|
||||
add dstq, mmsize
|
||||
add nq, mmsize*2
|
||||
jl .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse
|
||||
PS_ADD_SQUARES 2
|
||||
INIT_XMM sse3
|
||||
PS_ADD_SQUARES 3
|
||||
|
||||
;*******************************************************************
|
||||
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
|
||||
; float *src1, int n);
|
||||
;*******************************************************************
|
||||
INIT_XMM sse
|
||||
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
|
||||
shl nd, 3
|
||||
add src1q, nq
|
||||
add dstq, nq
|
||||
neg nq
|
||||
|
||||
align 16
|
||||
.loop:
|
||||
movu m0, [src1q+nq]
|
||||
movu m1, [src1q+nq+mmsize]
|
||||
mova m2, [src2q]
|
||||
mova m3, m2
|
||||
unpcklps m2, m2
|
||||
unpckhps m3, m3
|
||||
mulps m0, m2
|
||||
mulps m1, m3
|
||||
mova [dstq+nq], m0
|
||||
mova [dstq+nq+mmsize], m1
|
||||
add src2q, mmsize
|
||||
add nq, mmsize*2
|
||||
jl .loop
|
||||
REP_RET
|
||||
|
||||
;***********************************************************************
|
||||
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
|
||||
; float h[2][4], float h_step[2][4],
|
||||
; int len);
|
||||
;***********************************************************************
|
||||
INIT_XMM sse3
|
||||
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
|
||||
movaps m0, [hq]
|
||||
movaps m1, [h_stepq]
|
||||
unpcklps m4, m0, m0
|
||||
unpckhps m0, m0
|
||||
unpcklps m5, m1, m1
|
||||
unpckhps m1, m1
|
||||
shl nd, 3
|
||||
add lq, nq
|
||||
add rq, nq
|
||||
neg nq
|
||||
|
||||
align 16
|
||||
.loop:
|
||||
addps m4, m5
|
||||
addps m0, m1
|
||||
movddup m2, [lq+nq]
|
||||
movddup m3, [rq+nq]
|
||||
mulps m2, m4
|
||||
mulps m3, m0
|
||||
addps m2, m3
|
||||
movsd [lq+nq], m2
|
||||
movhps [rq+nq], m2
|
||||
add nq, 8
|
||||
jl .loop
|
||||
REP_RET
|
||||
|
||||
;***************************************************************************
|
||||
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
|
||||
; float h[2][4], float h_step[2][4],
|
||||
; int len);
|
||||
;***************************************************************************
|
||||
INIT_XMM sse3
|
||||
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
|
||||
movaps m0, [hq]
|
||||
movaps m1, [hq+mmsize]
|
||||
%if ARCH_X86_64
|
||||
movaps m8, [h_stepq]
|
||||
movaps m9, [h_stepq+mmsize]
|
||||
%define H_STEP0 m8
|
||||
%define H_STEP1 m9
|
||||
%else
|
||||
%define H_STEP0 [h_stepq]
|
||||
%define H_STEP1 [h_stepq+mmsize]
|
||||
%endif
|
||||
shl nd, 3
|
||||
add lq, nq
|
||||
add rq, nq
|
||||
neg nq
|
||||
|
||||
align 16
|
||||
.loop:
|
||||
addps m0, H_STEP0
|
||||
addps m1, H_STEP1
|
||||
movddup m2, [lq+nq]
|
||||
movddup m3, [rq+nq]
|
||||
shufps m4, m2, m2, q2301
|
||||
shufps m5, m3, m3, q2301
|
||||
unpcklps m6, m0, m0
|
||||
unpckhps m7, m0, m0
|
||||
mulps m2, m6
|
||||
mulps m3, m7
|
||||
unpcklps m6, m1, m1
|
||||
unpckhps m7, m1, m1
|
||||
mulps m4, m6
|
||||
mulps m5, m7
|
||||
addps m2, m3
|
||||
addsubps m2, m4
|
||||
addsubps m2, m5
|
||||
movsd [lq+nq], m2
|
||||
movhps [rq+nq], m2
|
||||
add nq, 8
|
||||
jl .loop
|
||||
REP_RET
|
||||
|
||||
;**********************************************************
|
||||
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
|
||||
; float (*in)[32][2],
|
||||
; int i, int len)
|
||||
;**********************************************************
|
||||
INIT_XMM sse
|
||||
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
|
||||
movsxdifnidn iq, id
|
||||
mov lend, 32 << 3
|
||||
lea inq, [inq+iq*4]
|
||||
mov tmpd, id
|
||||
shl tmpd, 8
|
||||
add outq, tmpq
|
||||
mov tmpd, 64
|
||||
sub tmpd, id
|
||||
mov id, tmpd
|
||||
|
||||
test id, 1
|
||||
jne .loop4
|
||||
test id, 2
|
||||
jne .loop8
|
||||
|
||||
align 16
|
||||
.loop16:
|
||||
mov in0q, inq
|
||||
mov in1q, 38*64*4
|
||||
add in1q, in0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop16:
|
||||
movaps m0, [in0q]
|
||||
movaps m1, [in1q]
|
||||
movaps m2, [in0q+lenq]
|
||||
movaps m3, [in1q+lenq]
|
||||
TRANSPOSE4x4PS 0, 1, 2, 3, 4
|
||||
movaps [outq], m0
|
||||
movaps [outq+lenq], m1
|
||||
movaps [outq+lenq*2], m2
|
||||
movaps [outq+3*32*2*4], m3
|
||||
lea in0q, [in0q+lenq*2]
|
||||
lea in1q, [in1q+lenq*2]
|
||||
add outq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop16
|
||||
add inq, 16
|
||||
add outq, 3*32*2*4
|
||||
sub id, 4
|
||||
jg .loop16
|
||||
RET
|
||||
|
||||
align 16
|
||||
.loop8:
|
||||
mov in0q, inq
|
||||
mov in1q, 38*64*4
|
||||
add in1q, in0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop8:
|
||||
movlps m0, [in0q]
|
||||
movlps m1, [in1q]
|
||||
movhps m0, [in0q+lenq]
|
||||
movhps m1, [in1q+lenq]
|
||||
SBUTTERFLYPS 0, 1, 2
|
||||
SBUTTERFLYPD 0, 1, 2
|
||||
movaps [outq], m0
|
||||
movaps [outq+lenq], m1
|
||||
lea in0q, [in0q+lenq*2]
|
||||
lea in1q, [in1q+lenq*2]
|
||||
add outq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop8
|
||||
add inq, 8
|
||||
add outq, lenq
|
||||
sub id, 2
|
||||
jg .loop16
|
||||
RET
|
||||
|
||||
align 16
|
||||
.loop4:
|
||||
mov in0q, inq
|
||||
mov in1q, 38*64*4
|
||||
add in1q, in0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop4:
|
||||
movss m0, [in0q]
|
||||
movss m1, [in1q]
|
||||
movss m2, [in0q+lenq]
|
||||
movss m3, [in1q+lenq]
|
||||
movlhps m0, m1
|
||||
movlhps m2, m3
|
||||
shufps m0, m2, q2020
|
||||
movaps [outq], m0
|
||||
lea in0q, [in0q+lenq*2]
|
||||
lea in1q, [in1q+lenq*2]
|
||||
add outq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop4
|
||||
add inq, 4
|
||||
sub id, 1
|
||||
test id, 2
|
||||
jne .loop8
|
||||
cmp id, 4
|
||||
jge .loop16
|
||||
RET
|
||||
|
||||
;***********************************************************
|
||||
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
|
||||
; float (*in)[32][2],
|
||||
; int i, int len)
|
||||
;***********************************************************
|
||||
%macro HYBRID_SYNTHESIS_DEINT 0
|
||||
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
|
||||
%if cpuflag(sse4)
|
||||
%define MOVH movsd
|
||||
%else
|
||||
%define MOVH movlps
|
||||
%endif
|
||||
movsxdifnidn iq, id
|
||||
mov lend, 32 << 3
|
||||
lea outq, [outq+iq*4]
|
||||
mov tmpd, id
|
||||
shl tmpd, 8
|
||||
add inq, tmpq
|
||||
mov tmpd, 64
|
||||
sub tmpd, id
|
||||
mov id, tmpd
|
||||
|
||||
test id, 1
|
||||
jne .loop4
|
||||
test id, 2
|
||||
jne .loop8
|
||||
|
||||
align 16
|
||||
.loop16:
|
||||
mov out0q, outq
|
||||
mov out1q, 38*64*4
|
||||
add out1q, out0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop16:
|
||||
movaps m0, [inq]
|
||||
movaps m1, [inq+lenq]
|
||||
movaps m2, [inq+lenq*2]
|
||||
movaps m3, [inq+3*32*2*4]
|
||||
TRANSPOSE4x4PS 0, 1, 2, 3, 4
|
||||
movaps [out0q], m0
|
||||
movaps [out1q], m1
|
||||
movaps [out0q+lenq], m2
|
||||
movaps [out1q+lenq], m3
|
||||
lea out0q, [out0q+lenq*2]
|
||||
lea out1q, [out1q+lenq*2]
|
||||
add inq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop16
|
||||
add outq, 16
|
||||
add inq, 3*32*2*4
|
||||
sub id, 4
|
||||
jg .loop16
|
||||
RET
|
||||
|
||||
align 16
|
||||
.loop8:
|
||||
mov out0q, outq
|
||||
mov out1q, 38*64*4
|
||||
add out1q, out0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop8:
|
||||
movaps m0, [inq]
|
||||
movaps m1, [inq+lenq]
|
||||
SBUTTERFLYPS 0, 1, 2
|
||||
SBUTTERFLYPD 0, 1, 2
|
||||
MOVH [out0q], m0
|
||||
MOVH [out1q], m1
|
||||
movhps [out0q+lenq], m0
|
||||
movhps [out1q+lenq], m1
|
||||
lea out0q, [out0q+lenq*2]
|
||||
lea out1q, [out1q+lenq*2]
|
||||
add inq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop8
|
||||
add outq, 8
|
||||
add inq, lenq
|
||||
sub id, 2
|
||||
jg .loop16
|
||||
RET
|
||||
|
||||
align 16
|
||||
.loop4:
|
||||
mov out0q, outq
|
||||
mov out1q, 38*64*4
|
||||
add out1q, out0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop4:
|
||||
movaps m0, [inq]
|
||||
movss [out0q], m0
|
||||
%if cpuflag(sse4)
|
||||
extractps [out1q], m0, 1
|
||||
extractps [out0q+lenq], m0, 2
|
||||
extractps [out1q+lenq], m0, 3
|
||||
%else
|
||||
movhlps m1, m0
|
||||
movss [out0q+lenq], m1
|
||||
shufps m0, m0, 0xb1
|
||||
movss [out1q], m0
|
||||
movhlps m1, m0
|
||||
movss [out1q+lenq], m1
|
||||
%endif
|
||||
lea out0q, [out0q+lenq*2]
|
||||
lea out1q, [out1q+lenq*2]
|
||||
add inq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop4
|
||||
add outq, 4
|
||||
sub id, 1
|
||||
test id, 2
|
||||
jne .loop8
|
||||
cmp id, 4
|
||||
jge .loop16
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse
|
||||
HYBRID_SYNTHESIS_DEINT
|
||||
INIT_XMM sse4
|
||||
HYBRID_SYNTHESIS_DEINT
|
||||
|
||||
;*******************************************************************
|
||||
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
|
||||
; const float (*filter)[8][2],
|
||||
; ptrdiff_t stride, int n);
|
||||
;*******************************************************************
|
||||
%macro PS_HYBRID_ANALYSIS_LOOP 3
|
||||
movu %1, [inq+mmsize*%3]
|
||||
movu m1, [inq+mmsize*(5-%3)+8]
|
||||
%if cpuflag(sse3)
|
||||
pshufd %2, %1, q2301
|
||||
pshufd m4, m1, q0123
|
||||
pshufd m1, m1, q1032
|
||||
pshufd m2, [filterq+nq+mmsize*%3], q2301
|
||||
addsubps %2, m4
|
||||
addsubps %1, m1
|
||||
%else
|
||||
mova m2, [filterq+nq+mmsize*%3]
|
||||
mova %2, %1
|
||||
mova m4, m1
|
||||
shufps %2, %2, q2301
|
||||
shufps m4, m4, q0123
|
||||
shufps m1, m1, q1032
|
||||
shufps m2, m2, q2301
|
||||
xorps m4, m7
|
||||
xorps m1, m7
|
||||
subps %2, m4
|
||||
subps %1, m1
|
||||
%endif
|
||||
mulps %2, m2
|
||||
mulps %1, m2
|
||||
%if %3
|
||||
addps m3, %2
|
||||
addps m0, %1
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro PS_HYBRID_ANALYSIS 0
|
||||
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
|
||||
%if cpuflag(sse3)
|
||||
%define MOVH movsd
|
||||
%else
|
||||
%define MOVH movlps
|
||||
%endif
|
||||
shl strideq, 3
|
||||
shl nd, 6
|
||||
add filterq, nq
|
||||
neg nq
|
||||
mova m7, [ps_p1m1p1m1]
|
||||
|
||||
align 16
|
||||
.loop:
|
||||
PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
|
||||
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
|
||||
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
|
||||
|
||||
%if cpuflag(sse3)
|
||||
pshufd m3, m3, q2301
|
||||
xorps m0, m7
|
||||
hsubps m3, m0
|
||||
pshufd m1, m3, q0020
|
||||
pshufd m3, m3, q0031
|
||||
addps m1, m3
|
||||
movsd m2, [inq+6*8]
|
||||
%else
|
||||
mova m1, m3
|
||||
mova m2, m0
|
||||
shufps m1, m1, q2301
|
||||
shufps m2, m2, q2301
|
||||
subps m1, m3
|
||||
addps m2, m0
|
||||
unpcklps m3, m1, m2
|
||||
unpckhps m1, m2
|
||||
addps m1, m3
|
||||
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
|
||||
%endif
|
||||
movss m3, [filterq+nq+8*6]
|
||||
SPLATD m3
|
||||
mulps m2, m3
|
||||
addps m1, m2
|
||||
MOVH [outq], m1
|
||||
add outq, strideq
|
||||
add nq, 64
|
||||
jl .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse
|
||||
PS_HYBRID_ANALYSIS
|
||||
INIT_XMM sse3
|
||||
PS_HYBRID_ANALYSIS
|
385
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/celt_pvq_search.asm
vendored
Normal file
385
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/celt_pvq_search.asm
vendored
Normal file
|
@ -0,0 +1,385 @@
|
|||
;******************************************************************************
|
||||
;* SIMD optimized Opus encoder DSP function
|
||||
;*
|
||||
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "config.asm"
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
%ifdef __NASM_VER__
|
||||
%use "smartalign"
|
||||
ALIGNMODE p6
|
||||
%endif
|
||||
|
||||
SECTION_RODATA 64

; Constants for the PVQ search, 64-byte aligned so full-width (up to zmm)
; aligned loads are safe.
const_float_abs_mask:   times 8 dd 0x7fffffff   ; clears the float sign bit
const_align_abs_edge:   times 8 dd 0            ; zero pad used to mask overread tail

const_float_0_5:        times 8 dd 0.5
const_float_1:          times 8 dd 1.0
const_float_sign_mask:  times 8 dd 0x80000000   ; isolates the float sign bit

; Per-lane byte offsets 0,4,8,... used to turn a broadcast loop index into
; distinct per-lane indices.
const_int32_offsets:
%rep 8
        dd $-const_int32_offsets
%endrep
SECTION .text
|
||||
|
||||
;
|
||||
; Setup High Register to be used
|
||||
; for holding memory constants
|
||||
;
|
||||
; %1 - the register to be used, assmues it is >= mm8
|
||||
; %2 - name of the constant.
|
||||
;
|
||||
; Subsequent opcodes are going to use the constant in the form
|
||||
; "addps m0, mm_const_name" and it would be turned into:
|
||||
; "addps m0, [const_name]" on 32 bit arch or
|
||||
; "addps m0, m8" on 64 bit arch
|
||||
; Bind a memory constant to the symbol mm_<const_name>.
; On targets with >8 SIMD regs the constant is preloaded into a high register
; (%2); otherwise mm_<const_name> stays a memory operand.
; %1 - load opcode (movaps/movdqa), %2 - register (>= m8), %3 - constant name.
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
    %define mm_%3 %2
    %{1}     %2, [%3]          ; e.g. movaps m8, [const_name]
%else
    %define mm_%3 [%3]
%endif
%endmacro
|
||||
|
||||
;
|
||||
; Set Position Independent Code
|
||||
; Base address of a constant
|
||||
; %1 - the register to be used, if PIC is set
|
||||
; %2 - name of the constant.
|
||||
;
|
||||
; Subsequent opcode are going to use the base address in the form
|
||||
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
|
||||
; "movaps m0, [r5 + r4]" if PIC is enabled
|
||||
; "movaps m0, [constant_name + r4]" if texrel are used
|
||||
; Bind pic_base_<const_label> for position-independent addressing.
; %1 - opcode (lea), %2 - base register to use when PIC, %3 - constant label.
; With PIC the register holds the RIP-relative base; otherwise the label is
; used directly (textrel).
%macro SET_PIC_BASE 3; reg, const_label
%ifdef PIC
    %{1}     %2, [%3]          ; lea r5, [rip+const]
    %define pic_base_%3 %2
%else
    %define pic_base_%3 %3
%endif
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
; One greedy pulse add/remove pass of the PVQ search.
; %1 is `add` (add a pulse) or `sub` (remove a pulse).
; Register contract: m6 = Syy_norm, m7 = Sxy_norm (updated in place);
; tmpX/tmpY point at |X[i]| and the current y[i]; Nd = byte length.
; Finds argmax_i of the distortion metric p, then updates Sxy, Syy and y[i].
;-----------------------------------------------------------------------------
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
    addps    m6, mm_const_float_0_5    ; Syy_norm += 1.0/2
    pxor     m1, m1                    ; max_idx
    xorps    m3, m3                    ; p_max
    xor      r4d, r4d
align 16
%%distortion_search:
    movd     xm2, dword r4d            ; movd zero extends
%ifidn %1,add
    movaps   m4, [tmpY + r4]           ; y[i]
    movaps   m5, [tmpX + r4]           ; X[i]

  %if USE_APPROXIMATION == 1
    xorps    m0, m0
    cmpps    m0, m0, m5, 4             ; m0 = (X[i] != 0.0)
  %endif

    addps    m4, m6                    ; m4 = Syy_new = y[i] + Syy_norm
    addps    m5, m7                    ; m5 = Sxy_new = X[i] + Sxy_norm

  %if USE_APPROXIMATION == 1
    andps    m5, m0                    ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
  %endif

%else
    movaps   m5, [tmpY + r4]           ; m5 = y[i]

    xorps    m0, m0                    ; m0 = 0;
    cmpps    m0, m0, m5, 1             ; m0 = (0<y)

    subps    m4, m6, m5                ; m4 = Syy_new = Syy_norm - y[i]
    subps    m5, m7, [tmpX + r4]       ; m5 = Sxy_new = Sxy_norm - X[i]
    andps    m5, m0                    ; (0<y)?m5:0
%endif

%if USE_APPROXIMATION == 1
    rsqrtps  m4, m4
    mulps    m5, m4                    ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
    mulps    m5, m5
    divps    m5, m4                    ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
    VPBROADCASTD m2, xm2               ; m2=i (all lanes get same values, we add the offset-per-lane, later)

    cmpps    m0, m3, m5, 1             ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
    maxps    m3, m5                    ; m3=max(p_max,p)
    ; maxps here is faster than blendvps, despite blend having lower latency.

    pand     m2, m0                    ; This version seems faster than sse41 pblendvb
    pmaxsw   m1, m2                    ; SSE2 signed word, so it would work for N < 32768/4

    add      r4d, mmsize
    cmp      r4d, Nd
    jb       %%distortion_search

    por      m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
    movdqa   m4, m1                    ; needed for the aligned y[max_idx]+=1; processing

%if mmsize >= 32
    ; Merge parallel maximums round 8 (4 vs 4)
    vextractf128 xm5, ym3, 1           ; xmm5 = ymm3[1x128] = ymm3[255..128b]
    cmpps    xm0, xm3, xm5, 1          ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

    vextracti128 xm2, ym1, 1           ; xmm2 = ymm1[1x128] = ymm1[255..128b]
    BLENDVPS xm3, xm5, xm0             ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
    PBLENDVB xm1, xm2, xm0             ; p = m0 ? p[1x128] : p[0x128]
%endif

    ; Merge parallel maximums round 4 (2 vs 2)
    movhlps  xm5, xm3                  ; m5=p[xx32]
    cmpps    xm0, xm3, xm5, 1          ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

    pshufd   xm2, xm1, q3232
    BLENDVPS xm3, xm5, xm0             ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
    PBLENDVB xm1, xm2, xm0             ; p = m0 ? p[3,2] : p[1,0]

    ; Merge parallel maximums final round (1 vs 1)
    shufps   xm0, xm3, xm3, q1111      ; m0 = m3[1] = p[1]
    cmpss    xm0, xm3, 5               ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )

    pshufd   xm2, xm1, q1111
    PBLENDVB xm1, xm2, xm0

    movd     dword r4d, xm1            ; zero extends to the rest of r4q

    VBROADCASTSS m3, [tmpX + r4]
    %{1}ps   m7, m3                    ; Sxy += X[max_idx]

    VBROADCASTSS m5, [tmpY + r4]
    %{1}ps   m6, m5                    ; Syy += Y[max_idx]

    ; We have to update a single element in Y[i]
    ; However writing 4 bytes and then doing 16 byte load in the inner loop
    ; could cause a stall due to breaking write forwarding.
    VPBROADCASTD m1, xm1
    pcmpeqd  m1, m1, m4                ; exactly 1 element matches max_idx and this finds it

    and      r4d, ~(mmsize-1)          ; align address down, so the value pointed by max_idx is inside a mmsize load
    movaps   m5, [tmpY + r4]           ; m5 = Y[y3...ym...y0]
    andps    m1, mm_const_float_1      ; m1 = [ 0...1.0...0]
    %{1}ps   m5, m1                    ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
    movaps   [tmpY + r4], m5           ; Y[max_idx] +-= 1.0;
%endmacro
|
||||
|
||||
;
|
||||
; We need one more register for
|
||||
; PIC relative addressing. Use this
|
||||
; to count it in cglobal
|
||||
;
|
||||
; We need one more register for PIC-relative addressing; count it in cglobal.
%ifdef PIC
%define num_pic_regs 1
%else
%define num_pic_regs 0
%endif
|
||||
|
||||
;
|
||||
; Pyramid Vector Quantization Search implementation
|
||||
;
|
||||
; float * inX - Unaligned (SIMD) access, it will be overread,
|
||||
; but extra data is masked away.
|
||||
; int32 * outY - Should be aligned and padded buffer.
|
||||
; It is used as temp buffer.
|
||||
; uint32 K - Number of pulses to have after quantizations.
|
||||
; uint32 N - Number of vector elements. Must be 0 < N < 256
|
||||
;
|
||||
;-----------------------------------------------------------------------------
; float ff_pvq_search%1(float *inX, int32_t *outY, uint32_t K, uint32_t N)
; Pyramid Vector Quantization search (Opus encoder).
; inX  - unaligned; overread is masked via const_align_abs_edge.
; outY - aligned, padded; doubles as temp buffer (tmpY).
; K    - target number of pulses; N - vector length, 0 < N < 256.
; Returns Syy (sum of y[i]^2) as float; on x86_32 via st0, else in m0/xmm0.
; %1 is the name suffix (_approx / _exact), matching USE_APPROXIMATION.
;-----------------------------------------------------------------------------
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp                       ; stack scratch: |X[i]| values
%define tmpY outYq                     ; y[] built in the output buffer

    movaps   m0, [const_float_abs_mask]
    shl      Nd, 2                     ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
    mov      r4d, Nd

    neg      r4d
    and      r4d, mmsize-1             ; bytes of tail past the last full vector

    SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
    movups   m2, [pic_base_const_align_abs_edge + r4 - mmsize] ; tail mask

    add      Nd, r4d                   ; N = align(N, mmsize)

    lea      r4d, [Nd - mmsize]        ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
    movups   m1, [inXq + r4]
    andps    m1, m2                    ; masked |X| of the (possibly partial) last vector
    movaps   [tmpX + r4], m1           ; Sx = abs( X[N-1] )

align 16
%%loop_abs_sum:
    sub      r4d, mmsize
    jc       %%end_loop_abs_sum

    movups   m2, [inXq + r4]
    andps    m2, m0

    movaps   [tmpX + r4], m2           ; tmpX[i]=abs(X[i])
    addps    m1, m2                    ; Sx += abs(X[i])
    jmp      %%loop_abs_sum

align 16
%%end_loop_abs_sum:

    HSUMPS   m1, m2                    ; m1 = Sx

    xorps    m0, m0
    comiss   xm0, xm1                  ;
    jz       %%zero_input              ; if (Sx==0) goto zero_input

    ; Initial guess: y[i] = round(K * X[i] / Sx)
    cvtsi2ss xm0, dword Kd             ; m0 = K
%if USE_APPROXIMATION == 1
    rcpss    xm1, xm1                  ; m1 = approx(1/Sx)
    mulss    xm0, xm1                  ; m0 = K*(1/Sx)
%else
    divss    xm0, xm1                  ; b = K/Sx
                                       ; b = K/max_x
%endif

    VBROADCASTSS m0, xm0

    lea      r4d, [Nd - mmsize]
    pxor     m5, m5                    ; Sy ( Sum of abs( y[i]) )
    xorps    m6, m6                    ; Syy ( Sum of y[i]*y[i] )
    xorps    m7, m7                    ; Sxy ( Sum of X[i]*y[i] )
align 16
%%loop_guess:
    movaps   m1, [tmpX + r4]           ; m1 = X[i]
    mulps    m2, m0, m1                ; m2 = res*X[i]
    cvtps2dq m2, m2                    ; yt = (int)lrintf( res*X[i] )
    paddd    m5, m2                    ; Sy += yt
    cvtdq2ps m2, m2                    ; yt = (float)yt
    mulps    m1, m2                    ; m1 = X[i]*yt
    movaps   [tmpY + r4], m2           ; y[i] = m2
    addps    m7, m1                    ; Sxy += m1;
    mulps    m2, m2                    ; m2 = yt*yt
    addps    m6, m2                    ; Syy += m2

    sub      r4d, mmsize
    jnc      %%loop_guess

    HSUMPS   m6, m1                    ; Syy_norm
    HADDD    m5, m4                    ; pulses

    movd     dword r4d, xm5            ; zero extends to the rest of r4q

    sub      Kd, r4d                   ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
    jz       %%finish                  ; K - pulses == 0

    SET_HI_REG_MM_CONSTANT movaps, m8,  const_float_0_5
    SET_HI_REG_MM_CONSTANT movaps, m9,  const_float_1
    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
    ; Use Syy/2 in distortion parameter calculations.
    ; Saves pre and post-caclulation to correct Y[] values.
    ; Same precision, since float mantisa is normalized.
    ; The SQRT approximation does differ.
    HSUMPS   m7, m0                    ; Sxy_norm
    mulps    m6, mm_const_float_0_5

    jc       %%remove_pulses_loop      ; K - pulses < 0

align 16                               ; K - pulses > 0
%%add_pulses_loop:

    PULSES_SEARCH add                  ; m6 Syy_norm ; m7 Sxy_norm

    sub      Kd, 1
    jnz      %%add_pulses_loop

    addps    m6, m6                    ; Syy*=2

    jmp      %%finish

align 16
%%remove_pulses_loop:

    PULSES_SEARCH sub                  ; m6 Syy_norm ; m7 Sxy_norm

    add      Kd, 1
    jnz      %%remove_pulses_loop

    addps    m6, m6                    ; Syy*=2

align 16
%%finish:
    ; Write back y[i] with the sign of X[i], converted to int32.
    lea      r4d, [Nd - mmsize]
    movaps   m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
    movaps   m0, [tmpY + r4]           ; m0 = Y[i]
    movups   m1, [inXq + r4]           ; m1 = X[i]
    andps    m1, m2                    ; m1 = sign(X[i])
    orps     m0, m1                    ; m0 = Y[i]*sign
    cvtps2dq m3, m0                    ; m3 = (int)m0
    movaps   [outYq + r4], m3

    sub      r4d, mmsize
    jnc      %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0                   ; sbrdsp
    movss    r0m, xm6                  ; return (float)Syy_norm
    fld      dword r0m
%else
    movaps   m0, m6                    ; return (float)Syy_norm
%endif

    RET

align 16
%%zero_input:
    ; All-zero input: zero the output and return Syy = 1.0.
    lea      r4d, [Nd - mmsize]
    xorps    m0, m0
%%zero_loop:
    movaps   [outYq + r4], m0
    sub      r4d, mmsize
    jnc      %%zero_loop

    movaps   m6, [const_float_1]
    jmp      %%return
%endmacro
|
||||
|
||||
; if 1, use a float op that give half precision but execute for around 3 cycles.
|
||||
; On Skylake & Ryzen the division is much faster (around 11c/3),
|
||||
; that makes the full precision code about 2% slower.
|
||||
; Opus also does use rsqrt approximation in their intrinsics code.
|
||||
; sse2/sse4 builds use the rsqrt approximation; the avx build is exact.
%define USE_APPROXIMATION 1

INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION 0

INIT_XMM avx
PVQ_FAST_SEARCH _exact
|
1085
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/fft.asm
vendored
Normal file
1085
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/fft.asm
vendored
Normal file
File diff suppressed because it is too large
Load diff
221
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/mdct15.asm
vendored
Normal file
221
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/mdct15.asm
vendored
Normal file
|
@ -0,0 +1,221 @@
|
|||
;******************************************************************************
|
||||
;* SIMD optimized non-power-of-two MDCT functions
|
||||
;*
|
||||
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
|
||||
perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
|
||||
sign_adjust_r: times 4 dd 0x80000000, 0x00000000
|
||||
|
||||
sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
|
||||
|
||||
SECTION .text
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
;*****************************************************************************************
|
||||
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
|
||||
;*****************************************************************************************
|
||||
; 5-point complex FFT building block for fft15.
; Expects xm5/xm6 = twiddle constants and xm7 = sign mask, preloaded by caller.
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
    VBROADCASTSD m0, [inq + %1]             ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
    movsd    xm1, [inq + 1*16 +  8 + %1]    ; in[ 3].re, in[ 3].im,  0,  0
    movsd    xm4, [inq + 6*16 +  0 + %1]    ; in[12].re, in[12].im,  0,  0
    movhps   xm1, [inq + 3*16 +  0 + %1]    ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
    movhps   xm4, [inq + 4*16 +  8 + %1]    ; in[12].re, in[12].im, in[ 9].re, in[ 9].im

    subps    xm2, xm1, xm4                  ; t[2].im, t[2].re, t[3].im, t[3].re
    addps    xm1, xm4                       ; t[0].re, t[0].im, t[1].re, t[1].im

    movhlps  %2, xm1                        ; t[0].re, t[1].re, t[0].im, t[1].im
    addps    %2, xm1
    addps    %2, xm0                        ; DC[0].re, DC[0].im, junk...
    movlhps  %2, %2                         ; DC[0].re, DC[0].im, DC[0].re, DC[0].im

    shufps   xm3, xm1, xm2, q0110           ; t[0].re, t[0].im, t[2].re, t[2].im
    shufps   xm1, xm2, q2332                ; t[1].re, t[1].im, t[3].re, t[3].im

    mulps    xm%3, xm1, xm5
    mulps    xm4, xm3, xm6
    mulps    xm1, xm6

    xorps    xm1, xm7
    mulps    xm3, xm5
    addsubps xm3, xm1                       ; t[0].re, t[0].im, t[2].re, t[2].im
    subps    xm%3, xm4                      ; t[4].re, t[4].im, t[5].re, t[5].im

    movhlps  xm2, xm%3, xm3                 ; t[2].re, t[2].im, t[5].re, t[5].im
    movlhps  xm3, xm%3                      ; t[0].re, t[0].im, t[4].re, t[4].im

    xorps    xm2, xm7
    addps    xm%3, xm2, xm3
    subps    xm3, xm2

    shufps   xm3, xm3, q1032
    vinsertf128 m%3, m%3, xm3, 1            ; All ACs (tmp[1] through to tmp[4])
    addps    m%3, m%3, m0                   ; Finally offset with DCs
%endmacro
|
||||
|
||||
; Combine the three FFT5 DC terms (xm8..xm10) with twiddles and store one
; complex DC output at [outq].
%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
    mulps    xm0, xm9,  [exptabq + %1 + 16*0]
    mulps    xm1, xm10, [exptabq + %1 + 16*1]

    haddps   xm0, xm1
    movhlps  xm1, xm0                  ; t[0].re, t[1].re, t[0].im, t[1].im

    addps    xm0, xm1
    addps    xm0, xm8                  ; add DC of the first FFT5

    movsd    [outq], xm0
%endmacro
|
||||
|
||||
; Combine the FFT5 AC terms (m11..m13) with twiddles and scatter four complex
; outputs at strides 1,2,3,4 from outq.
%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
    mulps    m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
    mulps    m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
    mulps    m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
    mulps    m3, m13, [exptabq + 64*1 + 1*mmsize + %1]

    addps    m0, m0, m2
    addps    m1, m1, m3
    addps    m0, m0, m11               ; add AC of the first FFT5

    shufps   m1, m1, m1, q2301         ; swap re/im before the cross add
    addps    m0, m0, m1

    vextractf128 xm1, m0, 1

    movlps   [outq + strideq*1], xm0
    movhps   [outq + strideq*2], xm0
    movlps   [outq + stride3q ], xm1
    movhps   [outq + strideq*4], xm1
%endmacro
|
||||
|
||||
; void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab,
;                   ptrdiff_t stride)
; 15-point FFT = 3 x FFT5 + twiddle butterflies; x86_64 only (uses m8-m13).
INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
    shl      strideq, 3                ; stride *= sizeof(FFTComplex)

    movaps   xm5, [exptabq + 480 + 16*0] ; FFT5 twiddles
    movaps   xm6, [exptabq + 480 + 16*1]
    movaps   xm7, [sign_adjust_5]

    FFT5      0, xm8,  11
    FFT5      8, xm9,  12
    FFT5     16, xm10, 13

; inq is dead after the FFT5s; reuse it for stride*3
%define stride3q inq
    lea      stride3q, [strideq + strideq*2]
    lea      stride5q, [strideq + strideq*4]

    BUTTERFLIES_DC (8*6 + 4*0)*2*4
    BUTTERFLIES_AC (8*0 + 0*0)*2*4

    add      outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*1)*2*4
    BUTTERFLIES_AC (8*2 + 0*0)*2*4

    add      outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*2)*2*4
    BUTTERFLIES_AC (8*4 + 0*0)*2*4

    RET
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
;*******************************************************************************************************
|
||||
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
|
||||
;*******************************************************************************************************
|
||||
; Gather 2 (sse3) or 4 (avx2) complex values from inq using indices from lutq.
; %1 - destination (m reg), %2 - scratch xmm for the high lane, %3 - index reg
; name (used as %3q). Clobbers r4.
%macro LUT_LOAD_4D 3
    mov      r4d, [lutq + %3q*4 + 0]
    movsd    xmm%1, [inq + r4q*8]
    mov      r4d, [lutq + %3q*4 + 4]
    movhps   xmm%1, [inq + r4q*8]
%if cpuflag(avx2)
    mov      r4d, [lutq + %3q*4 + 8]
    movsd    %2, [inq + r4q*8]
    mov      r4d, [lutq + %3q*4 + 12]
    movhps   %2, [inq + r4q*8]
    vinsertf128 %1, %1, %2, 1
%endif
%endmacro
|
||||
|
||||
;-------------------------------------------------------------------------------------------------------
; void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp,
;                            int *lut, ptrdiff_t len8)
; Post-rotation/reindex for the non-power-of-two MDCT. Walks the buffer from
; both ends at once; %1 = complex elements per iteration (2 for sse3, 4 avx2).
;-------------------------------------------------------------------------------------------------------
%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n

    xor      offset_nq, offset_nq      ; forward index from start
    lea      offset_pq, [len8q*2 - %1] ; backward index from end

    movaps   m7, [sign_adjust_r]       ; negates .re lanes only

%if cpuflag(avx2)
    movaps   m8, [perm_pos]
    movaps   m9, [perm_neg]
%endif

.loop:
    movups   m0, [expq + offset_pq*8]  ; exp[p0..p3].{re,im}
    movups   m1, [expq + offset_nq*8]  ; exp[n3..n0].{re,im}

    LUT_LOAD_4D m3, xm4, offset_p      ; in[p0..p3].{re,im} via lut
    LUT_LOAD_4D m4, xm5, offset_n      ; in[n3..n0].{re,im} via lut

    mulps    m5, m3, m0                ; in[p].reim * exp[p].reim
    mulps    m6, m4, m1                ; in[n].reim * exp[n].reim

    xorps    m5, m7                    ; in[p].re *= -1, in[p].im *= 1
    xorps    m6, m7                    ; in[n].re *= -1, in[n].im *= 1

    shufps   m3, m3, m3, q2301         ; in[p].imre
    shufps   m4, m4, m4, q2301         ; in[n].imre

    mulps    m3, m0                    ; in[p].imre * exp[p].reim
    mulps    m4, m1                    ; in[n].imre * exp[n].reim

    haddps   m3, m6                    ; interleaved out[n].im/.re pairs (lane-scrambled)
    haddps   m5, m4                    ; interleaved out[p].re/.im pairs (lane-scrambled)

%if cpuflag(avx2)
    vpermps  m3, m9, m3                ; out[n3].im, out[n3].re, ..., out[n0].im, out[n0].re
    vpermps  m5, m8, m5                ; out[p0].re, out[p0].im, ..., out[p3].re, out[p3].im
%else
    shufps   m3, m3, m3, q0312
    shufps   m5, m5, m5, q2130
%endif

    movups   [outq + offset_nq*8], m3
    movups   [outq + offset_pq*8], m5

    sub      offset_pq, %1
    add      offset_nq, %1
    cmp      offset_nq, offset_pq
    jle      .loop

    REP_RET
%endmacro
|
||||
|
||||
INIT_XMM sse3
POSTROTATE_FN 2

; avx2 version needs m8/m9 permute constants -> 64-bit only.
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif
|
548
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/sbrdsp.asm
vendored
Normal file
548
trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/sbrdsp.asm
vendored
Normal file
|
@ -0,0 +1,548 @@
|
|||
;******************************************************************************
|
||||
;* AAC Spectral Band Replication decoding functions
|
||||
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask     times 2 dd 1<<31, 0
ps_mask2    times 2 dd 0, 1<<31
ps_mask3    dd 0, 0, 0, 1<<31
; phi_sign vectors for the four sbr_hf_apply_noise variants
ps_noise0   times 2 dd  1.0, 0.0,
ps_noise2   times 2 dd -1.0, 0.0
ps_noise13  dd 0.0,  1.0, 0.0, -1.0
            dd 0.0, -1.0, 0.0,  1.0
            dd 0.0,  1.0, 0.0, -1.0
cextern sbr_noise_table
cextern ps_neg

SECTION .text
|
||||
|
||||
; float ff_sbr_sum_square(float (*x)[2], int n)
; Sum of squares of 2*n floats. r0 = x, r1 = n.
; Unrolled by 8 complex pairs, then a 4-float tail loop (n is a multiple of 2).
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov      r2d, r1d
    xorps    m0, m0                    ; accumulator A
    xorps    m1, m1                    ; accumulator B (breaks the add chain)
    sar      r2, 3                     ; r2 = n / 8 iterations of the big loop
    jz       .prepare
.loop:
    movu     m2, [r0 +  0]
    movu     m3, [r0 + 16]
    movu     m4, [r0 + 32]
    movu     m5, [r0 + 48]
    mulps    m2, m2
    mulps    m3, m3
    mulps    m4, m4
    mulps    m5, m5
    addps    m0, m2
    addps    m1, m3
    addps    m0, m4
    addps    m1, m5
    add      r0, 64
    dec      r2
    jnz      .loop
.prepare:
    and      r1, 7
    sar      r1, 1                     ; remaining groups of 4 floats
    jz       .end
    ; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu     m2, [r0]
    add      r0, 16
    mulps    m2, m2
    dec      r1
    addps    m0, m2
    jnz      .endloop
.end:
    ; horizontal sum of m0+m1 into scalar
    addps    m0, m1
    movhlps  m2, m0
    addps    m0, m2
    movss    m1, m0
    shufps   m0, m0, 1
    addss    m0, m1
%if ARCH_X86_64 == 0
    movss    r0m, m0                   ; x86_32 returns float in st0
    fld      dword r0m
%endif
    RET
|
||||
|
||||
; void ff_sbr_hf_g_filt(float (*Y)[2], const float (*X_high)[40][2],
;                       const float *g_filt, int m_max, intptr_t ixh)
; Y[m] = X_high[m][ixh] * g_filt[m]. r0=Y, r1=X_high, r2=g_filt, r3=m_max, r4=ixh.
%define STEP  40*4*2                   ; byte stride of one X_high row
cglobal sbr_hf_g_filt, 5, 6, 5
    lea      r1, [r1 + 8*r4]           ; offset by ixh elements into X_high
    mov      r5, r3
    and      r3, 0xFC                  ; main loop handles m_max & ~3 elements
    lea      r2, [r2 + r3*4]
    lea      r0, [r0 + r3*8]
    neg      r3
    jz       .loop1
.loop4:                                ; 4 complex outputs per iteration
    movlps   m0, [r2 + 4*r3 + 0]
    movlps   m1, [r2 + 4*r3 + 8]
    movlps   m2, [r1 + 0*STEP]
    movlps   m3, [r1 + 2*STEP]
    movhps   m2, [r1 + 1*STEP]
    movhps   m3, [r1 + 3*STEP]
    unpcklps m0, m0                    ; duplicate g for re and im
    unpcklps m1, m1
    mulps    m0, m2
    mulps    m1, m3
    movu     [r0 + 8*r3 +  0], m0
    movu     [r0 + 8*r3 + 16], m1
    add      r1, 4*STEP
    add      r3, 4
    jnz      .loop4
    and      r5, 3                     ; number of single element loops
    jz       .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss    m0, [r2]
    movlps   m2, [r1]
    unpcklps m0, m0
    mulps    m2, m0
    movlps   [r0], m2
    add      r0, 8
    add      r2, 4
    add      r1, STEP
    dec      r5
    jnz      .loop1
.end:
    RET
|
||||
|
||||
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
|
||||
; const float alpha0[2], const float alpha1[2],
|
||||
; float bw, int start, int end)
|
||||
;
|
||||
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss    bw, BWm                   ; bw arrives on the stack on these ABIs
%endif
    movlps   m2, [alpha1q]
    movlps   m1, [alpha0q]
    shufps   bw, bw, 0
    mulps    m2, bw                    ; (a1[0] a1[1])*bw
    mulps    m1, bw                    ; (a0[0] a0[1])*bw = (a2 a3)
    mulps    m2, bw                    ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova     m3, m1
    mova     m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov      r2d, Sm
    mov      r3d, Em
    DEFINE_ARGS X_high, X_low, start, end
%else
    ; BW does not actually occupy a register, so shift by 1
    DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
    movsxd   startq, startd
    movsxd   endq, endd
%endif
    sub      startq, endq              ; neg num of loops
    lea      X_highq, [X_highq + endq*2*4]
    lea      X_lowq,  [X_lowq  + endq*2*4 - 2*2*4]
    shl      startq, 3                 ; offset from num loops

    ; split alpha products into re/im broadcast + sign-flipped variants
    mova     m0, [X_lowq + startq]
    shufps   m3, m3, q1111
    shufps   m4, m4, q1111
    xorps    m3, [ps_mask]
    shufps   m1, m1, q0000
    shufps   m2, m2, q0000
    xorps    m4, [ps_mask]
.loop2:
    movu     m7, [X_lowq + startq + 8] ; BbCc
    mova     m6, m0
    mova     m5, m7
    shufps   m0, m0, q2301             ; aAbB
    shufps   m7, m7, q2301             ; bBcC
    mulps    m0, m4
    mulps    m7, m3
    mulps    m6, m2
    mulps    m5, m1
    addps    m7, m0
    mova     m0, [X_lowq + startq + 16] ; CcDd
    addps    m7, m0
    addps    m6, m5
    addps    m7, m6
    mova     [X_highq + startq], m7
    add      startq, 16
    jnz      .loop2
    RET
|
||||
|
||||
; void ff_sbr_sum64x5(float *z)
; z[k] += z[k+256] + z[k+512] + z[k+768] + z[k+1024] for k = 0..63
; (offsets in bytes: 64 floats apart). Two independent accumulator chains.
cglobal sbr_sum64x5, 1,2,4, z
    lea      r1q, [zq + 256]           ; end pointer (64 floats)
.loop:
    mova     m0, [zq+   0]
    mova     m2, [zq+  16]
    mova     m1, [zq+ 256]
    mova     m3, [zq+ 272]
    addps    m0, [zq+ 512]
    addps    m2, [zq+ 528]
    addps    m1, [zq+ 768]
    addps    m3, [zq+ 784]
    addps    m0, [zq+1024]
    addps    m2, [zq+1040]
    addps    m0, m1
    addps    m2, m3
    mova     [zq],    m0
    mova     [zq+16], m2
    add      zq, 32
    cmp      zq, r1q
    jne      .loop
    REP_RET
|
||||
|
||||
; void ff_sbr_qmf_post_shuffle(float W[32][2], const float *z)
; Interleave reversed-and-negated tail of z with its head into complex pairs.
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4, W, z
    lea      r2q, [zq + (64-4)*4]      ; r2 walks backwards from z[60]
    mova     m3, [ps_neg]
.loop:
    mova     m1, [zq]
    xorps    m0, m3, [r2q]             ; negate the reversed half
    shufps   m0, m0, m0, q0123         ; reverse the 4 lanes
    unpcklps m2, m0, m1
    unpckhps m0, m0, m1
    mova     [Wq +  0], m2
    mova     [Wq + 16], m0
    add      Wq, 32
    sub      r2q, 16
    add      zq, 16
    cmp      zq, r2q                   ; stop when the two cursors meet
    jl       .loop
    REP_RET
|
||||
|
||||
; void ff_sbr_neg_odd_64(float *z)
; Flip the sign of every odd-indexed float in z[0..63] (64 floats = 256 bytes).
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4, z
    lea      r1q, [zq+256]
.loop:
    mova     m0, [zq+ 0]
    mova     m1, [zq+16]
    mova     m2, [zq+32]
    mova     m3, [zq+48]
    xorps    m0, [ps_mask2]            ; ps_mask2 flips the sign bit of odd lanes
    xorps    m1, [ps_mask2]
    xorps    m2, [ps_mask2]
    xorps    m3, [ps_mask2]
    mova     [zq+ 0], m0
    mova     [zq+16], m1
    mova     [zq+32], m2
    mova     [zq+48], m3
    add      zq, 64
    cmp      zq, r1q
    jne      .loop
    REP_RET
|
||||
|
||||
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
|
||||
; void ff_sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
; Deinterleave butterfly: v[k] = src0[k] - rev(src1)[k],
; v[127-k] area (vrev) = src1[k] + rev(src0)[k]. Instantiated for sse and sse2
; (sse2 uses pshufd for the lane reversal, sse uses shufps).
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v, src0, src1, vrev, c
    mov      cq, 64*4-2*mmsize         ; byte index, walks down through src0/v
    lea      vrevq, [vq + 64*4]
.loop:
    mova     m0, [src0q+cq]
    mova     m1, [src1q]
    mova     m4, [src0q+cq+mmsize]
    mova     m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd   m2, m0, q0123             ; lane-reversed copies
    pshufd   m3, m1, q0123
    pshufd   m6, m4, q0123
    pshufd   m7, m5, q0123
%else
    shufps   m2, m0, m0, q0123
    shufps   m3, m1, m1, q0123
    shufps   m6, m4, m4, q0123
    shufps   m7, m5, m5, q0123
%endif
    addps    m5, m2
    subps    m0, m7
    addps    m1, m6
    subps    m4, m3
    mova     [vrevq],        m1
    mova     [vrevq+mmsize], m5
    mova     [vq+cq],        m0
    mova     [vq+cq+mmsize], m4
    add      src1q, 2*mmsize
    add      vrevq, 2*mmsize
    sub      cq, 2*mmsize
    jge      .loop
    REP_RET
%endmacro
|
||||
|
||||
INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY
|
||||
|
||||
; void ff_sbr_qmf_pre_shuffle(float *z)
; Build z[64..127] from z: interleave negated/reversed z[33..] with z[1..],
; then copy the first two floats verbatim.
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6, z
%define OFFSET  (32*4-2*mmsize)
    mov      r3q, OFFSET               ; descending byte index
    lea      r1q, [zq + (32+1)*4]      ; forward cursor at z[33]
    lea      r2q, [zq + 64*4]          ; output base at z[64]
    mova     m5, [ps_neg]
.loop:
    movu     m0, [r1q]
    movu     m2, [r1q + mmsize]
    movu     m1, [zq + r3q + 4 + mmsize]
    movu     m3, [zq + r3q + 4]

    pxor     m2, m5                    ; negate the forward half
    pxor     m0, m5
    pshufd   m2, m2, q0123             ; reverse lanes
    pshufd   m0, m0, q0123
    SBUTTERFLY dq, 2, 3, 4             ; interleave into complex-style pairs
    SBUTTERFLY dq, 0, 1, 4
    mova     [r2q + 2*r3q + 0*mmsize], m2
    mova     [r2q + 2*r3q + 1*mmsize], m3
    mova     [r2q + 2*r3q + 2*mmsize], m0
    mova     [r2q + 2*r3q + 3*mmsize], m1
    add      r1q, 2*mmsize
    sub      r3q, 2*mmsize
    jge      .loop
    movq     m2, [zq]                  ; final 2 floats copied unchanged
    movq     [r2q], m2
    REP_RET
|
||||
|
||||
; Pick how sbr_noise_table is addressed: via a scratch register under PIC
; (one extra register counted via NREGS), directly by symbol otherwise.
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
|
||||
|
||||
; Load m0 = %1[kxq] (a phi_sign table indexed by kx), PIC-safe.
%macro LOAD_NST 1
%ifdef PIC
    lea      NOISE_TABLE, [%1]
    mova     m0, [kxq + NOISE_TABLE]
%else
    mova     m0, [kxq + %1]
%endif
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
|
||||
; const float *q_filt, int noise,
|
||||
; int kx, int m_max)
|
||||
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
|
||||
mova m0, [ps_noise0]
|
||||
jmp apply_noise_main
|
||||
|
||||
; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
|
||||
; const float *q_filt, int noise,
|
||||
; int kx, int m_max)
|
||||
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
|
||||
and kxq, 1
|
||||
shl kxq, 4
|
||||
LOAD_NST ps_noise13
|
||||
jmp apply_noise_main
|
||||
|
||||
; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
|
||||
; const float *q_filt, int noise,
|
||||
; int kx, int m_max)
|
||||
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
|
||||
mova m0, [ps_noise2]
|
||||
jmp apply_noise_main
|
||||
|
||||
;-----------------------------------------------------------------------------
; void sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                           const float *q_filt, int noise,
;                           int kx, int m_max)
; Phase pattern 3: kx-parity-dependent like pattern 1, using the second
; ps_noise13 pair (+16).  No jmp here — this entry point deliberately falls
; straight through into apply_noise_main, which follows immediately.
;-----------------------------------------------------------------------------
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and kxq, 1                          ; kx parity...
    shl kxq, 4                          ; ...scaled to a 16-byte table offset
    LOAD_NST ps_noise13+16              ; m0 = phi_sign vector for this parity
|
||||
|
||||
;-----------------------------------------------------------------------------
; Shared worker for the four sbr_hf_apply_noise_* entry points above.
; On entry: m0 = per-pattern phi_sign vector; arguments as declared by the
; cglobal lines above.  Per element it adds either s_m[m] * phi_sign or
; (when s_m[m] == 0) q_filt[m] * sbr_noise_table[noise] into the complex
; output Y[m], processing two (re,im) pairs per vector, two vectors per
; iteration.
;-----------------------------------------------------------------------------
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov kxd, m_maxm                     ; m_max is not in a register on these
    DEFINE_ARGS Y, s_m, q_filt, noise, count ; ABIs; fetch it into kx's slot
%else
    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
    movsxdifnidn noiseq, noised         ; sign-extend index where needed
    dec noiseq                          ; index is pre-advanced inside .loop
    shl countd, 2                       ; count = m_max * sizeof(float)
%ifdef PIC
    lea NOISE_TABLE, [sbr_noise_table]  ; materialize table base for PIC
%endif
    lea Yq, [Yq + 2*countq]             ; point the three buffers past their
    add s_mq, countq                    ; ends so the negated count can walk
    add q_filtq, countq                 ; upward to zero
    shl noiseq, 3                       ; entry index -> byte offset (8 B/entry)
    pxor m5, m5                         ; m5 = 0.0f, for the s_m[m] == 0 test
    neg countq
.loop:
    mova m1, [q_filtq + countq]
    movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add noiseq, 2*mmsize
    and noiseq, 0x1ff<<3                ; wrap within the 512-entry table
    punpckhdq m2, m1, m1                ; duplicate each float into re/im lanes
    punpckldq m1, m1
    mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova m3, [s_mq + countq]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq m4, m3, m3
    punpckldq m3, m3
    pcmpeqd m6, m3, m5 ; m6 = all-ones where s_m[m] == 0
    pcmpeqd m7, m4, m5 ; m7 = all-ones where s_m[m] == 0
    mulps m3, m0 ; s_m[m] * phi_sign
    mulps m4, m0 ; s_m[m] * phi_sign
    pand m1, m6                         ; keep the noise term only where s_m[m] == 0
    pand m2, m7
    movu m6, [Yq + 2*countq]
    movu m7, [Yq + 2*countq + mmsize]
    addps m3, m1                        ; merge the two mutually-exclusive terms
    addps m4, m2
    addps m6, m3                        ; Y[m] += selected term
    addps m7, m4
    movu [Yq + 2*countq], m6
    movu [Yq + 2*countq + mmsize], m7
    add countq, mmsize
    jl .loop                            ; until count reaches 0
    RET
|
||||
|
||||
INIT_XMM sse
;-----------------------------------------------------------------------------
; void sbr_qmf_deint_neg(float *v, const float *src)
; De-interleaves 64 pairs of floats from src into the two 32-float halves of
; v: one set of lanes is written back-to-front into v[0..31], the other is
; sign-flipped and written into v[32..63].
; NOTE(review): exact lane-to-element mapping is inferred from the shuffles
; below — confirm against the C reference implementation.
;-----------------------------------------------------------------------------
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4                     ; bytes per output half (32 floats)
%define OFFSET 32*4
    mov cq, -COUNT                      ; negative byte counter, runs up to 0
    lea vrevq, [vq + OFFSET + COUNT]    ; base for the second (negated) half
    add vq, OFFSET-mmsize               ; last vector of the first half
    add srcq, 2*COUNT                   ; end of the interleaved input
    mova m3, [ps_neg]                   ; sign-bit mask for the negated half
.loop:
    mova m0, [srcq + 2*cq + 0*mmsize]
    mova m1, [srcq + 2*cq + 1*mmsize]
    shufps m2, m0, m1, q2020            ; gather even-indexed lanes
    shufps m1, m0, q1313                ; gather odd-indexed lanes, reordered
    xorps m2, m3                        ; negate the even-indexed values
    mova [vq], m1
    mova [vrevq + cq], m2
    sub vq, mmsize                      ; first half is written back-to-front
    add cq, mmsize
    jl .loop
    REP_RET
|
||||
|
||||
;-----------------------------------------------------------------------------
; void sbr_autocorrelate(const float x[40][2], float phi[3][2][2])
; Computes the autocorrelation lags 0, 1 and 2 of a complex-float sequence.
; The loop is unrolled 3x, rotating the three "current sample" registers so
; each loaded sample is reused for all three lags.  cglobal reserves 32 bytes
; of stack for the first sample's lag-0/lag-1 partial products ([rsp] and
; [rsp+16]), which are excluded from some sums and re-added at the end.
; NOTE(review): bounds inferred from cntq = 37*8 and the x[38]/x[39] comments
; below — confirm against the C reference.
;-----------------------------------------------------------------------------
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov cntq, 37*8                      ; byte distance walked by the loop
    add xq, cntq
    neg cntq                            ; negative index counts up toward 0

%if cpuflag(sse3)
%define MOVH movsd
    movddup m5, [xq+cntq]               ; m5 = x[0] in both 64-bit halves
%else
%define MOVH movlps
    movlps m5, [xq+cntq]
    movlhps m5, m5                      ; SSE fallback for movddup
%endif
    MOVH m7, [xq+cntq+8 ]               ; m7 = x[1]
    MOVH m1, [xq+cntq+16]               ; m1 = x[2]
    shufps m7, m7, q0110                ; low half = (re,im), high = (im,re)
    shufps m1, m1, q0110
    mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
    mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps [rsp ], m3                   ; spill x[0] partials; re-added below
    movaps [rsp+16], m4
    add cntq, 8

    MOVH m2, [xq+cntq+16]               ; m2 = x[3]
    movlhps m7, m7
    shufps m2, m2, q0110
    mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps m4, m7, m2
    mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
    addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]

align 16
.loop:
    ; 3x unrolled body; m0/m1/m2 rotate through the roles of x[i], x[i+1],
    ; x[i+2].  Accumulators: m7 = lag 0, m6 = lag 1, m5 = lag 2.
    add cntq, 8
    MOVH m0, [xq+cntq+16]
    movlhps m1, m1
    shufps m0, m0, q0110
    mulps m3, m1, m2
    mulps m4, m1, m0
    mulps m1, m1
    addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    add cntq, 8
    MOVH m1, [xq+cntq+16]
    movlhps m2, m2
    shufps m1, m1, q0110
    mulps m3, m2, m0
    mulps m4, m2, m1
    mulps m2, m2
    addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    add cntq, 8
    MOVH m2, [xq+cntq+16]
    movlhps m0, m0
    shufps m2, m2, q0110
    mulps m3, m0, m1
    mulps m4, m0, m2
    mulps m0, m0
    addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    jl .loop                            ; SSE ops leave flags from the last add

    ; Epilogue: fold in the final sample and the spilled x[0] partials.
    movlhps m1, m1
    mulps m2, m1
    mulps m1, m1
    addps m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
    addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
    addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
    addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];

    xorps m2, [ps_mask3]                ; conjugate-style sign fixups before
    xorps m5, [ps_mask3]                ; the horizontal reductions
    xorps m6, [ps_mask3]
    HADDPS m2, m5, m3                   ; horizontal add (m3 = scratch)
    HADDPS m7, m6, m4
%if cpuflag(sse3)
    movshdup m0, m1                     ; m0 = odd lanes of m1
%else
    movss m0, m1                        ; SSE fallback for movshdup
    shufps m1, m1, q0001
%endif
    addss m1, m0                        ; final scalar lag-0 sum
    movaps [phiq ], m2                  ; scatter results into phi
    movhps [phiq+0x18], m7
    movss [phiq+0x28], m7
    movss [phiq+0x10], m1
    RET
%endmacro
|
||||
|
||||
; Instantiate the autocorrelation kernel twice: a baseline SSE build and an
; SSE3 build whose cpuflag(sse3) branches use movddup/movshdup instead of the
; movlps+movlhps / movss+shufps fallbacks.
INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE
|