mirror of https://github.com/ossrs/srs.git
synced 2025-02-15 04:42:04 +00:00

parent 4308f238c0
commit 37c84eccc0

28 changed files with 8441 additions and 50 deletions
3  trunk/3rdparty/ffmpeg-4.2-fit/.gitignore  (vendored)

@@ -14,4 +14,5 @@ ffbuild/.config
libavutil/lib.version
libavcodec/libavcodec.version
libavutil/libavutil.version
libswresample/libswresample.version
libavutil/ffversion.h
60  trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/aarch64/Makefile  (vendored, new file)

@@ -0,0 +1,60 @@

# subsystems
OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o

# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_init_aarch64.o \
        aarch64/sbrdsp_init_aarch64.o
OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_init.o
OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_init_aarch64.o
OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_10bpp_aarch64.o \
        aarch64/vp9dsp_init_12bpp_aarch64.o \
        aarch64/vp9dsp_init_aarch64.o

# ARMv8 optimizations

# subsystems
ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o

# NEON optimizations

# subsystems
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
        aarch64/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
        aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \
        aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o

# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
        aarch64/vp9itxfm_neon.o \
        aarch64/vp9lpf_16bpp_neon.o \
        aarch64/vp9lpf_neon.o \
        aarch64/vp9mc_16bpp_neon.o \
        aarch64/vp9mc_neon.o
199  trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/Makefile  (vendored, new file)

@@ -0,0 +1,199 @@

OBJS += x86/constants.o \

# subsystems
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
        x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
        x86/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
        x86/mpegvideoencdsp_init.o
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o

# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
        x86/sbrdsp_init.o
OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
        x86/vp9dsp_init_10bpp.o \
        x86/vp9dsp_init_12bpp.o \
        x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o


# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o

# decoders/encoders
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o

# subsystems
X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
        x86/ac3dsp_downmix.o
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
        x86/h264_chromamc_10bit.o
X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
        x86/h264_deblock_10bit.o \
        x86/h264_idct.o \
        x86/h264_idct_10bit.o \
        x86/h264_weight.o \
        x86/h264_weight_10bit.o
X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
        x86/h264_intrapred_10bit.o
X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
        x86/h264_qpel_10bit.o \
        x86/fpel.o \
        x86/qpel.o
X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
        x86/hpeldsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
        x86/fpel.o \
        x86/qpel.o
X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
        x86/vc1dsp_mc.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \
        x86/simple_idct.o
X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
        x86/vp8dsp_loopfilter.o

# decoders/encoders
X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
        x86/sbrdsp.o
X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
        x86/dirac_dwt.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
ifdef CONFIG_GPL
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
endif
X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
        x86/hevc_deblock.o \
        x86/hevc_idct.o \
        x86/hevc_mc.o \
        x86/hevc_sao.o \
        x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o
X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
        x86/vp9intrapred_16bpp.o \
        x86/vp9itxfm.o \
        x86/vp9itxfm_16bpp.o \
        x86/vp9lpf.o \
        x86/vp9lpf_16bpp.o \
        x86/vp9mc.o \
        x86/vp9mc_16bpp.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
86  trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacencdsp.asm  (vendored, new file)

@@ -0,0 +1,86 @@

;******************************************************************************
;* SIMD optimized AAC encoder DSP functions
;*
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

float_abs_mask: times 4 dd 0x7fffffff

SECTION .text

;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
    mova   m2, [float_abs_mask]
    shl    sizeq, 2
    add    inq, sizeq
    add    outq, sizeq
    neg    sizeq
.loop:
    andps  m0, m2, [inq+sizeq]
    sqrtps m1, m0
    mulps  m0, m1
    sqrtps m0, m0
    mova   [outq+sizeq], m0
    add    sizeq, mmsize
    jl     .loop
    RET
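For orientation, the loop above evaluates out[i] = |in[i]|^(3/4) using the identity a^(3/4) = sqrt(a * sqrt(a)), which needs only sqrtps and mulps. A scalar C sketch of the same computation (the _ref name is illustrative, not part of this commit):

#include <math.h>

/* Scalar sketch of abs_pow34: out[i] = |in[i]|^(3/4). */
static void abs_pow34_ref(float *out, const float *in, const int size)
{
    for (int i = 0; i < size; i++) {
        float a = fabsf(in[i]);
        out[i] = sqrtf(a * sqrtf(a)); /* a^(3/4) = sqrt(a * sqrt(a)) */
    }
}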
;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
;                           int size, int is_signed, int maxval, const float Q34,
;                           const float rounding)
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
%if UNIX64 == 0
    movss     m0, Q34m
    movss     m1, roundingm
    cvtsi2ss  m3, dword maxvalm
%else
    cvtsi2ss  m3, maxvald
%endif
    shufps    m0, m0, 0
    shufps    m1, m1, 0
    shufps    m3, m3, 0
    shl       is_signedd, 31
    movd      m4, is_signedd
    shufps    m4, m4, 0
    shl       sized, 2
    add       inq, sizeq
    add       outq, sizeq
    add       scaledq, sizeq
    neg       sizeq
.loop:
    mulps     m2, m0, [scaledq+sizeq]
    addps     m2, m1
    minps     m2, m3
    andps     m5, m4, [inq+sizeq]
    orps      m2, m5
    cvttps2dq m2, m2
    mova      [outq+sizeq], m2
    add       sizeq, mmsize
    jl        .loop
    RET
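A scalar C sketch of what this loop does per element: scale, round, clamp to maxval, then copy the sign of in[] when is_signed is set (the asm does the sign copy by OR-ing the sign bit; the truncating cvttps2dq matches the final cast). Illustration only, not this commit's code:

/* Scalar sketch of aac_quantize_bands. */
static void aac_quantize_bands_ref(int *out, const float *in, const float *scaled,
                                   int size, int is_signed, int maxval,
                                   const float Q34, const float rounding)
{
    for (int i = 0; i < size; i++) {
        float qc = scaled[i] * Q34 + rounding;
        if (qc > maxval)
            qc = maxval;
        if (is_signed && in[i] < 0.0f)
            qc = -qc;
        out[i] = (int)qc; /* truncation toward zero, as cvttps2dq does */
    }
}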
487  trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/aacpsdsp.asm  (vendored, new file)

@@ -0,0 +1,487 @@

;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

SECTION .text

;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
    shl    nd, 3
    add    srcq, nq
    neg    nq

align 16
.loop:
    movaps m0, [srcq+nq]
    movaps m1, [srcq+nq+mmsize]
    mulps  m0, m0
    mulps  m1, m1
    HADDPS m0, m1, m2
    addps  m0, [dstq]
    movaps [dstq], m0
    add    dstq, mmsize
    add    nq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3
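Scalar C sketch of the routine above, which accumulates the power of complex (re, im) pairs (illustrative helper, not from this commit):

/* ps_add_squares: dst[i] += |src[i]|^2 for complex src. */
static void ps_add_squares_ref(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}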
;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
;                               float *src1, int n);
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
    shl      nd, 3
    add      src1q, nq
    add      dstq, nq
    neg      nq

align 16
.loop:
    movu     m0, [src1q+nq]
    movu     m1, [src1q+nq+mmsize]
    mova     m2, [src2q]
    mova     m3, m2
    unpcklps m2, m2
    unpckhps m3, m3
    mulps    m0, m2
    mulps    m1, m3
    mova     [dstq+nq], m0
    mova     [dstq+nq+mmsize], m1
    add      src2q, mmsize
    add      nq, mmsize*2
    jl .loop
    REP_RET
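The unpcklps/unpckhps pair duplicates each real gain across a complex lane, so the scalar equivalent is simply (sketch, names illustrative):

/* ps_mul_pair_single: scale complex pairs by a real gain. */
static void ps_mul_pair_single_ref(float (*dst)[2], float (*src0)[2],
                                   float *src1, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src0[i][0] * src1[i];
        dst[i][1] = src0[i][1] * src1[i];
    }
}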
;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
;                                   float h[2][4], float h_step[2][4],
;                                   int len);
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
    movaps   m0, [hq]
    movaps   m1, [h_stepq]
    unpcklps m4, m0, m0
    unpckhps m0, m0
    unpcklps m5, m1, m1
    unpckhps m1, m1
    shl      nd, 3
    add      lq, nq
    add      rq, nq
    neg      nq

align 16
.loop:
    addps    m4, m5
    addps    m0, m1
    movddup  m2, [lq+nq]
    movddup  m3, [rq+nq]
    mulps    m2, m4
    mulps    m3, m0
    addps    m2, m3
    movsd    [lq+nq], m2
    movhps   [rq+nq], m2
    add      nq, 8
    jl .loop
    REP_RET
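Per sample, the four mixing gains advance by their step and the new left/right pair is a 2x2 mix of the old pair, which the asm does with the gains pre-duplicated across lanes. A scalar sketch of the same update (editorial reconstruction, not taken from this commit):

/* ps_stereo_interpolate: ramped 2x2 stereo mixing. */
static void ps_stereo_interpolate_ref(float (*l)[2], float (*r)[2],
                                      float h[2][4], float h_step[2][4],
                                      int len)
{
    float h0 = h[0][0], h1 = h[0][1], h2 = h[0][2], h3 = h[0][3];

    for (int n = 0; n < len; n++) {
        float l_re, l_im, r_re, r_im;
        h0 += h_step[0][0];
        h1 += h_step[0][1];
        h2 += h_step[0][2];
        h3 += h_step[0][3];
        l_re = l[n][0]; l_im = l[n][1];
        r_re = r[n][0]; r_im = r[n][1];
        l[n][0] = h0 * l_re + h2 * r_re;
        l[n][1] = h0 * l_im + h2 * r_im;
        r[n][0] = h1 * l_re + h3 * r_re;
        r[n][1] = h1 * l_im + h3 * r_im;
    }
}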
;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
;                                       float h[2][4], float h_step[2][4],
;                                       int len);
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
    movaps   m0, [hq]
    movaps   m1, [hq+mmsize]
%if ARCH_X86_64
    movaps   m8, [h_stepq]
    movaps   m9, [h_stepq+mmsize]
    %define H_STEP0 m8
    %define H_STEP1 m9
%else
    %define H_STEP0 [h_stepq]
    %define H_STEP1 [h_stepq+mmsize]
%endif
    shl      nd, 3
    add      lq, nq
    add      rq, nq
    neg      nq

align 16
.loop:
    addps    m0, H_STEP0
    addps    m1, H_STEP1
    movddup  m2, [lq+nq]
    movddup  m3, [rq+nq]
    shufps   m4, m2, m2, q2301
    shufps   m5, m3, m3, q2301
    unpcklps m6, m0, m0
    unpckhps m7, m0, m0
    mulps    m2, m6
    mulps    m3, m7
    unpcklps m6, m1, m1
    unpckhps m7, m1, m1
    mulps    m4, m6
    mulps    m5, m7
    addps    m2, m3
    addsubps m2, m4
    addsubps m2, m5
    movsd    [lq+nq], m2
    movhps   [rq+nq], m2
    add      nq, 8
    jl .loop
    REP_RET
;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
;                                   float (*in)[32][2],
;                                   int i, int len)
;**********************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
    movsxdifnidn iq, id
    mov      lend, 32 << 3
    lea      inq, [inq+iq*4]
    mov      tmpd, id
    shl      tmpd, 8
    add      outq, tmpq
    mov      tmpd, 64
    sub      tmpd, id
    mov      id, tmpd

    test     id, 1
    jne      .loop4
    test     id, 2
    jne      .loop8

align 16
.loop16:
    mov      in0q, inq
    mov      in1q, 38*64*4
    add      in1q, in0q
    mov      tmpd, lend

.inner_loop16:
    movaps   m0, [in0q]
    movaps   m1, [in1q]
    movaps   m2, [in0q+lenq]
    movaps   m3, [in1q+lenq]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps   [outq], m0
    movaps   [outq+lenq], m1
    movaps   [outq+lenq*2], m2
    movaps   [outq+3*32*2*4], m3
    lea      in0q, [in0q+lenq*2]
    lea      in1q, [in1q+lenq*2]
    add      outq, mmsize
    sub      tmpd, mmsize
    jg       .inner_loop16
    add      inq, 16
    add      outq, 3*32*2*4
    sub      id, 4
    jg       .loop16
    RET

align 16
.loop8:
    mov      in0q, inq
    mov      in1q, 38*64*4
    add      in1q, in0q
    mov      tmpd, lend

.inner_loop8:
    movlps   m0, [in0q]
    movlps   m1, [in1q]
    movhps   m0, [in0q+lenq]
    movhps   m1, [in1q+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    movaps   [outq], m0
    movaps   [outq+lenq], m1
    lea      in0q, [in0q+lenq*2]
    lea      in1q, [in1q+lenq*2]
    add      outq, mmsize
    sub      tmpd, mmsize
    jg       .inner_loop8
    add      inq, 8
    add      outq, lenq
    sub      id, 2
    jg       .loop16
    RET

align 16
.loop4:
    mov      in0q, inq
    mov      in1q, 38*64*4
    add      in1q, in0q
    mov      tmpd, lend

.inner_loop4:
    movss    m0, [in0q]
    movss    m1, [in1q]
    movss    m2, [in0q+lenq]
    movss    m3, [in1q+lenq]
    movlhps  m0, m1
    movlhps  m2, m3
    shufps   m0, m2, q2020
    movaps   [outq], m0
    lea      in0q, [in0q+lenq*2]
    lea      in1q, [in1q+lenq*2]
    add      outq, mmsize
    sub      tmpd, mmsize
    jg       .inner_loop4
    add      inq, 4
    sub      id, 1
    test     id, 2
    jne      .loop8
    cmp      id, 4
    jge      .loop16
    RET

;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
;                                    float (*in)[32][2],
;                                    int i, int len)
;***********************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    movsxdifnidn iq, id
    mov      lend, 32 << 3
    lea      outq, [outq+iq*4]
    mov      tmpd, id
    shl      tmpd, 8
    add      inq, tmpq
    mov      tmpd, 64
    sub      tmpd, id
    mov      id, tmpd

    test     id, 1
    jne      .loop4
    test     id, 2
    jne      .loop8

align 16
.loop16:
    mov      out0q, outq
    mov      out1q, 38*64*4
    add      out1q, out0q
    mov      tmpd, lend

.inner_loop16:
    movaps   m0, [inq]
    movaps   m1, [inq+lenq]
    movaps   m2, [inq+lenq*2]
    movaps   m3, [inq+3*32*2*4]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps   [out0q], m0
    movaps   [out1q], m1
    movaps   [out0q+lenq], m2
    movaps   [out1q+lenq], m3
    lea      out0q, [out0q+lenq*2]
    lea      out1q, [out1q+lenq*2]
    add      inq, mmsize
    sub      tmpd, mmsize
    jg       .inner_loop16
    add      outq, 16
    add      inq, 3*32*2*4
    sub      id, 4
    jg       .loop16
    RET

align 16
.loop8:
    mov      out0q, outq
    mov      out1q, 38*64*4
    add      out1q, out0q
    mov      tmpd, lend

.inner_loop8:
    movaps   m0, [inq]
    movaps   m1, [inq+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    MOVH     [out0q], m0
    MOVH     [out1q], m1
    movhps   [out0q+lenq], m0
    movhps   [out1q+lenq], m1
    lea      out0q, [out0q+lenq*2]
    lea      out1q, [out1q+lenq*2]
    add      inq, mmsize
    sub      tmpd, mmsize
    jg       .inner_loop8
    add      outq, 8
    add      inq, lenq
    sub      id, 2
    jg       .loop16
    RET

align 16
.loop4:
    mov      out0q, outq
    mov      out1q, 38*64*4
    add      out1q, out0q
    mov      tmpd, lend

.inner_loop4:
    movaps   m0, [inq]
    movss    [out0q], m0
%if cpuflag(sse4)
    extractps [out1q], m0, 1
    extractps [out0q+lenq], m0, 2
    extractps [out1q+lenq], m0, 3
%else
    movhlps  m1, m0
    movss    [out0q+lenq], m1
    shufps   m0, m0, 0xb1
    movss    [out1q], m0
    movhlps  m1, m0
    movss    [out1q+lenq], m1
%endif
    lea      out0q, [out0q+lenq*2]
    lea      out1q, [out1q+lenq*2]
    add      inq, mmsize
    sub      tmpd, mmsize
    jg       .inner_loop4
    add      outq, 4
    sub      id, 1
    test     id, 2
    jne      .loop8
    cmp      id, 4
    jge      .loop16
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
;                                 const float (*filter)[8][2],
;                                 ptrdiff_t stride, int n);
;*******************************************************************
%macro PS_HYBRID_ANALYSIS_LOOP 3
    movu     %1, [inq+mmsize*%3]
    movu     m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
    pshufd   %2, %1, q2301
    pshufd   m4, m1, q0123
    pshufd   m1, m1, q1032
    pshufd   m2, [filterq+nq+mmsize*%3], q2301
    addsubps %2, m4
    addsubps %1, m1
%else
    mova     m2, [filterq+nq+mmsize*%3]
    mova     %2, %1
    mova     m4, m1
    shufps   %2, %2, q2301
    shufps   m4, m4, q0123
    shufps   m1, m1, q1032
    shufps   m2, m2, q2301
    xorps    m4, m7
    xorps    m1, m7
    subps    %2, m4
    subps    %1, m1
%endif
    mulps    %2, m2
    mulps    %1, m2
%if %3
    addps    m3, %2
    addps    m0, %1
%endif
%endmacro

%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    shl strideq, 3
    shl nd, 6
    add filterq, nq
    neg nq
    mova m7, [ps_p1m1p1m1]

align 16
.loop:
    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2

%if cpuflag(sse3)
    pshufd   m3, m3, q2301
    xorps    m0, m7
    hsubps   m3, m0
    pshufd   m1, m3, q0020
    pshufd   m3, m3, q0031
    addps    m1, m3
    movsd    m2, [inq+6*8]
%else
    mova     m1, m3
    mova     m2, m0
    shufps   m1, m1, q2301
    shufps   m2, m2, q2301
    subps    m1, m3
    addps    m2, m0
    unpcklps m3, m1, m2
    unpckhps m1, m2
    addps    m1, m3
    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
    movss    m3, [filterq+nq+8*6]
    SPLATD   m3
    mulps    m2, m3
    addps    m1, m2
    MOVH     [outq], m1
    add      outq, strideq
    add      nq, 64
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS
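The structure of this filter explains the paired loads above: it is a 13-tap complex FIR whose coefficients are shared between taps j and 12-j, with tap 6 carrying a purely real coefficient (hence the lone movss from filter[...][6]). A scalar sketch of the computation (editorial reconstruction, not taken from this commit):

/* ps_hybrid_analysis: symmetric 13-tap complex FIR, one output per i. */
static void ps_hybrid_analysis_ref(float (*out)[2], float (*in)[2],
                                   const float (*filter)[8][2],
                                   long stride, int n)
{
    for (int i = 0; i < n; i++) {
        float sum_re = filter[i][6][0] * in[6][0];
        float sum_im = filter[i][6][0] * in[6][1];

        for (int j = 0; j < 6; j++) {
            float in0_re = in[j][0],      in0_im = in[j][1];
            float in1_re = in[12 - j][0], in1_im = in[12 - j][1];
            sum_re += filter[i][j][0] * (in0_re + in1_re) -
                      filter[i][j][1] * (in0_im - in1_im);
            sum_im += filter[i][j][0] * (in0_im + in1_im) +
                      filter[i][j][1] * (in0_re - in1_re);
        }
        out[i * stride][0] = sum_re; /* stride counts complex elements */
        out[i * stride][1] = sum_im;
    }
}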
385  trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/celt_pvq_search.asm  (vendored, new file)

@@ -0,0 +1,385 @@

;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "config.asm"
%include "libavutil/x86/x86util.asm"

%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif

SECTION_RODATA 64

const_float_abs_mask:  times 8 dd 0x7fffffff
const_align_abs_edge:  times 8 dd 0

const_float_0_5:       times 8 dd 0.5
const_float_1:         times 8 dd 1.0
const_float_sign_mask: times 8 dd 0x80000000

const_int32_offsets:
%rep 8
    dd $-const_int32_offsets
%endrep
SECTION .text

;
; Set up a high register to be used
; for holding memory constants
;
; %1 - the mov opcode to use
; %2 - the register to be used, assumes it is >= mm8
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32 bit arch or
; "addps m0, m8" on 64 bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
    %define mm_%3 %2
    %{1} %2, [%3] ; movaps m8, [const_name]
%else
    %define mm_%3 [%3]
%endif
%endmacro

;
; Set the position-independent-code (PIC)
; base address of a constant
;
; %1 - the instruction to use (lea)
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if textrels are used
%macro SET_PIC_BASE 3 ; reg, const_label
%ifdef PIC
    %{1} %2, [%3] ; lea r5, [rip+const]
    %define pic_base_%3 %2
%else
    %define pic_base_%3 %3
%endif
%endmacro
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
    addps    m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
    pxor     m1, m1                 ; max_idx
    xorps    m3, m3                 ; p_max
    xor      r4d, r4d
align 16
%%distortion_search:
    movd     xm2, dword r4d         ; movd zero extends
%ifidn %1,add
    movaps   m4, [tmpY + r4]        ; y[i]
    movaps   m5, [tmpX + r4]        ; X[i]

%if USE_APPROXIMATION == 1
    xorps    m0, m0
    cmpps    m0, m0, m5, 4          ; m0 = (X[i] != 0.0)
%endif

    addps    m4, m6                 ; m4 = Syy_new = y[i] + Syy_norm
    addps    m5, m7                 ; m5 = Sxy_new = X[i] + Sxy_norm

%if USE_APPROXIMATION == 1
    andps    m5, m0                 ; if (X[i] == 0) Sxy_new = 0; prevents approximation error from setting pulses in array padding.
%endif

%else
    movaps   m5, [tmpY + r4]        ; m5 = y[i]

    xorps    m0, m0                 ; m0 = 0;
    cmpps    m0, m0, m5, 1          ; m0 = (0<y)

    subps    m4, m6, m5             ; m4 = Syy_new = Syy_norm - y[i]
    subps    m5, m7, [tmpX + r4]    ; m5 = Sxy_new = Sxy_norm - X[i]
    andps    m5, m0                 ; (0<y)?m5:0
%endif

%if USE_APPROXIMATION == 1
    rsqrtps  m4, m4
    mulps    m5, m4                 ; m5 = p = Sxy_new*approx(1/sqrt(Syy))
%else
    mulps    m5, m5
    divps    m5, m4                 ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
    VPBROADCASTD m2, xm2            ; m2=i (all lanes get same values, we add the offset-per-lane later)

    cmpps    m0, m3, m5, 1          ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
    maxps    m3, m5                 ; m3 = max(p_max, p)
    ; maxps here is faster than blendvps, despite blend having lower latency.

    pand     m2, m0                 ; This version seems faster than sse41 pblendvb
    pmaxsw   m1, m2                 ; SSE2 signed word, so it would work for N < 32768/4

    add      r4d, mmsize
    cmp      r4d, Nd
    jb %%distortion_search

    por      m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
    movdqa   m4, m1                 ; needed for the aligned y[max_idx] += 1 processing

%if mmsize >= 32
    ; Merge parallel maximums round 8 (4 vs 4)

    vextractf128 xm5, ym3, 1        ; xmm5 = ymm3[1x128] = ymm3[255..128b]
    cmpps    xm0, xm3, xm5, 1       ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

    vextracti128 xm2, ym1, 1        ; xmm2 = ymm1[1x128] = ymm1[255..128b]
    BLENDVPS xm3, xm5, xm0          ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
    PBLENDVB xm1, xm2, xm0          ; p = m0 ? p[1x128] : p[0x128]
%endif

    ; Merge parallel maximums round 4 (2 vs 2)
    ; m3=p[3210]
    movhlps  xm5, xm3               ; m5=p[xx32]
    cmpps    xm0, xm3, xm5, 1       ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

    pshufd   xm2, xm1, q3232
    BLENDVPS xm3, xm5, xm0          ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
    PBLENDVB xm1, xm2, xm0          ; p = m0 ? p[3,2] : p[1,0]

    ; Merge parallel maximums final round (1 vs 1)
    shufps   xm0, xm3, xm3, q1111   ; m0 = m3[1] = p[1]
    cmpss    xm0, xm3, 5            ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )

    pshufd   xm2, xm1, q1111
    PBLENDVB xm1, xm2, xm0

    movd     dword r4d, xm1         ; zero extends to the rest of r4q

    VBROADCASTSS m3, [tmpX + r4]
    %{1}ps   m7, m3                 ; Sxy += X[max_idx]

    VBROADCASTSS m5, [tmpY + r4]
    %{1}ps   m6, m5                 ; Syy += Y[max_idx]

    ; We have to update a single element in Y[i].
    ; However writing 4 bytes and then doing a 16 byte load in the inner loop
    ; could cause a stall due to breaking write forwarding.
    VPBROADCASTD m1, xm1
    pcmpeqd  m1, m1, m4             ; exactly 1 element matches max_idx and this finds it

    and      r4d, ~(mmsize-1)       ; align address down, so the value pointed to by max_idx is inside a mmsize load
    movaps   m5, [tmpY + r4]        ; m5 = Y[y3...ym...y0]
    andps    m1, mm_const_float_1   ; m1 = [ 0...1.0...0]
    %{1}ps   m5, m1                 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
    movaps   [tmpY + r4], m5        ; Y[max_idx] +-= 1.0;
%endmacro
;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal
;
%ifdef PIC
    %define num_pic_regs 1
%else
    %define num_pic_regs 0
%endif

;
; Pyramid Vector Quantization Search implementation
;
; float * inX   - Unaligned (SIMD) access, it will be overread,
;                 but extra data is masked away.
; int32 * outY  - Should be an aligned and padded buffer.
;                 It is used as a temp buffer.
; uint32 K      - Number of pulses to have after quantization.
; uint32 N      - Number of vector elements. Must be 0 < N < 256
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq

    movaps   m0, [const_float_abs_mask]
    shl      Nd, 2                  ; N *= sizeof(float); the 32 bit operation also zeroes the high 32 bits in 64 bit mode.
    mov      r4d, Nd

    neg      r4d
    and      r4d, mmsize-1

    SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
    movups   m2, [pic_base_const_align_abs_edge + r4 - mmsize]

    add      Nd, r4d                ; N = align(N, mmsize)

    lea      r4d, [Nd - mmsize]     ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
    movups   m1, [inXq + r4]
    andps    m1, m2
    movaps   [tmpX + r4], m1        ; Sx = abs( X[N-1] )

align 16
%%loop_abs_sum:
    sub      r4d, mmsize
    jc %%end_loop_abs_sum

    movups   m2, [inXq + r4]
    andps    m2, m0

    movaps   [tmpX + r4], m2        ; tmpX[i] = abs(X[i])
    addps    m1, m2                 ; Sx += abs(X[i])
    jmp %%loop_abs_sum

align 16
%%end_loop_abs_sum:

    HSUMPS   m1, m2                 ; m1 = Sx

    xorps    m0, m0
    comiss   xm0, xm1
    jz %%zero_input                 ; if (Sx==0) goto zero_input

    cvtsi2ss xm0, dword Kd          ; m0 = K
%if USE_APPROXIMATION == 1
    rcpss    xm1, xm1               ; m1 = approx(1/Sx)
    mulss    xm0, xm1               ; m0 = K*(1/Sx)
%else
    divss    xm0, xm1               ; b = K/Sx
                                    ; b = K/max_x
%endif

    VBROADCASTSS m0, xm0

    lea      r4d, [Nd - mmsize]
    pxor     m5, m5                 ; Sy  ( Sum of abs( y[i]) )
    xorps    m6, m6                 ; Syy ( Sum of y[i]*y[i] )
    xorps    m7, m7                 ; Sxy ( Sum of X[i]*y[i] )
align 16
%%loop_guess:
    movaps   m1, [tmpX + r4]        ; m1 = X[i]
    mulps    m2, m0, m1             ; m2 = res*X[i]
    cvtps2dq m2, m2                 ; yt = (int)lrintf( res*X[i] )
    paddd    m5, m2                 ; Sy += yt
    cvtdq2ps m2, m2                 ; yt = (float)yt
    mulps    m1, m2                 ; m1 = X[i]*yt
    movaps   [tmpY + r4], m2        ; y[i] = m2
    addps    m7, m1                 ; Sxy += m1;
    mulps    m2, m2                 ; m2 = yt*yt
    addps    m6, m2                 ; Syy += m2

    sub      r4d, mmsize
    jnc %%loop_guess

    HSUMPS   m6, m1                 ; Syy_norm
    HADDD    m5, m4                 ; pulses

    movd     dword r4d, xm5         ; zero extends to the rest of r4q

    sub      Kd, r4d                ; K -= pulses; the 32 bit operation also zeroes the high 32 bits in 64 bit mode.
    jz %%finish                     ; K - pulses == 0

    SET_HI_REG_MM_CONSTANT movaps, m8,  const_float_0_5
    SET_HI_REG_MM_CONSTANT movaps, m9,  const_float_1
    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
    ; Use Syy/2 in distortion parameter calculations.
    ; Saves pre- and post-calculation to correct the Y[] values.
    ; Same precision, since the float mantissa is normalized.
    ; The SQRT approximation does differ.
    HSUMPS   m7, m0                 ; Sxy_norm
    mulps    m6, mm_const_float_0_5

    jc %%remove_pulses_loop         ; K - pulses < 0

align 16                            ; K - pulses > 0
%%add_pulses_loop:

    PULSES_SEARCH add               ; m6 Syy_norm ; m7 Sxy_norm

    sub      Kd, 1
    jnz %%add_pulses_loop

    addps    m6, m6                 ; Syy *= 2

    jmp %%finish

align 16
%%remove_pulses_loop:

    PULSES_SEARCH sub               ; m6 Syy_norm ; m7 Sxy_norm

    add      Kd, 1
    jnz %%remove_pulses_loop

    addps    m6, m6                 ; Syy *= 2

align 16
%%finish:
    lea      r4d, [Nd - mmsize]
    movaps   m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
    movaps   m0, [tmpY + r4]        ; m0 = Y[i]
    movups   m1, [inXq + r4]        ; m1 = X[i]
    andps    m1, m2                 ; m1 = sign(X[i])
    orps     m0, m1                 ; m0 = Y[i]*sign
    cvtps2dq m3, m0                 ; m3 = (int)m0
    movaps   [outYq + r4], m3

    sub      r4d, mmsize
    jnc %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0 ; sbrdsp
    movss    r0m, xm6               ; return (float)Syy_norm
    fld      dword r0m
%else
    movaps   m0, m6                 ; return (float)Syy_norm
%endif

    RET

align 16
%%zero_input:
    lea      r4d, [Nd - mmsize]
    xorps    m0, m0
%%zero_loop:
    movaps   [outYq + r4], m0
    sub      r4d, mmsize
    jnc %%zero_loop

    movaps   m6, [const_float_1]
    jmp %%return
%endmacro

; If 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full-precision code about 2% slower.
; Opus also uses the rsqrt approximation in its intrinsics code.
%define USE_APPROXIMATION 1

INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION 0

INIT_XMM avx
PVQ_FAST_SEARCH _exact
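To make the control flow above easier to follow, here is a scalar C sketch of the whole search: start from a rounded guess y ~ K*|X|/Sx, then add or remove one pulse at a time, greedily maximizing Sxy^2/Syy, until exactly K pulses remain. It tracks Syy/2 like the asm does (adding a pulse at i changes Sxy by |X[i]| and Syy/2 by y[i]+0.5). Editorial sketch under the same assumptions (0 < N <= 256), not this commit's code:

#include <math.h>

static float pvq_search_ref(const float *X, int *outY, int K, int N)
{
    float absX[256], y[256];
    float Sx = 0.0f, Sxy = 0.0f, Syy2 = 0.0f; /* Syy2 == Syy/2 */
    int pulses = 0, i;

    for (i = 0; i < N; i++)
        Sx += absX[i] = fabsf(X[i]);

    if (Sx == 0.0f) {              /* all-zero input: zero output, Syy = 1.0 */
        for (i = 0; i < N; i++)
            outY[i] = 0;
        return 1.0f;
    }

    for (i = 0; i < N; i++) {      /* initial guess; cvtps2dq rounds to nearest */
        y[i] = rintf((K / Sx) * absX[i]);
        pulses += (int)y[i];
        Sxy  += absX[i] * y[i];
        Syy2 += 0.5f * y[i] * y[i];
    }

    while (pulses != K) {
        int dir = pulses < K ? 1 : -1;
        int best = 0;
        float p_max = 0.0f;

        for (i = 0; i < N; i++) {
            if (dir < 0 && !(y[i] > 0.0f))
                continue;          /* can only remove an existing pulse */
            float Sxy_new  = Sxy + dir * absX[i];
            float Syy2_new = Syy2 + dir * y[i] + 0.5f;
            float p = Sxy_new * Sxy_new / Syy2_new;
            if (p > p_max) {
                p_max = p;
                best = i;
            }
        }
        Sxy  += dir * absX[best];
        Syy2 += dir * y[best] + 0.5f;
        y[best] += dir;
        pulses += dir;
    }

    for (i = 0; i < N; i++)        /* restore the signs of X */
        outY[i] = (int)(X[i] < 0.0f ? -y[i] : y[i]);
    return 2.0f * Syy2;            /* the asm returns Syy */
}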
1085  trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/fft.asm  (vendored, new file)

File diff suppressed because it is too large.
221  trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/mdct15.asm  (vendored, new file)

@@ -0,0 +1,221 @@

;******************************************************************************
;* SIMD optimized non-power-of-two MDCT functions
;*
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
sign_adjust_r: times 4 dd 0x80000000, 0x00000000

sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000

SECTION .text

%if ARCH_X86_64

;*****************************************************************************************
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
;*****************************************************************************************
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
    VBROADCASTSD m0, [inq + %1]         ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
    movsd    xm1, [inq + 1*16 + 8 + %1] ; in[ 3].re, in[ 3].im, 0, 0
    movsd    xm4, [inq + 6*16 + 0 + %1] ; in[12].re, in[12].im, 0, 0
    movhps   xm1, [inq + 3*16 + 0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
    movhps   xm4, [inq + 4*16 + 8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im

    subps    xm2, xm1, xm4              ; t[2].im, t[2].re, t[3].im, t[3].re
    addps    xm1, xm4                   ; t[0].re, t[0].im, t[1].re, t[1].im

    movhlps  %2, xm1                    ; t[0].re, t[1].re, t[0].im, t[1].im
    addps    %2, xm1
    addps    %2, xm0                    ; DC[0].re, DC[0].im, junk...
    movlhps  %2, %2                     ; DC[0].re, DC[0].im, DC[0].re, DC[0].im

    shufps   xm3, xm1, xm2, q0110       ; t[0].re, t[0].im, t[2].re, t[2].im
    shufps   xm1, xm2, q2332            ; t[1].re, t[1].im, t[3].re, t[3].im

    mulps    xm%3, xm1, xm5
    mulps    xm4, xm3, xm6
    mulps    xm1, xm6

    xorps    xm1, xm7
    mulps    xm3, xm5
    addsubps xm3, xm1                   ; t[0].re, t[0].im, t[2].re, t[2].im
    subps    xm%3, xm4                  ; t[4].re, t[4].im, t[5].re, t[5].im

    movhlps  xm2, xm%3, xm3             ; t[2].re, t[2].im, t[5].re, t[5].im
    movlhps  xm3, xm%3                  ; t[0].re, t[0].im, t[4].re, t[4].im

    xorps    xm2, xm7
    addps    xm%3, xm2, xm3
    subps    xm3, xm2

    shufps   xm3, xm3, q1032
    vinsertf128 m%3, m%3, xm3, 1        ; All ACs (tmp[1] through to tmp[4])
    addps    m%3, m%3, m0               ; Finally offset with DCs
%endmacro

%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
    mulps    xm0, xm9,  [exptabq + %1 + 16*0]
    mulps    xm1, xm10, [exptabq + %1 + 16*1]

    haddps   xm0, xm1
    movhlps  xm1, xm0                   ; t[0].re, t[1].re, t[0].im, t[1].im

    addps    xm0, xm1
    addps    xm0, xm8

    movsd    [outq], xm0
%endmacro

%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
    mulps    m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
    mulps    m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
    mulps    m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
    mulps    m3, m13, [exptabq + 64*1 + 1*mmsize + %1]

    addps    m0, m0, m2
    addps    m1, m1, m3
    addps    m0, m0, m11

    shufps   m1, m1, m1, q2301
    addps    m0, m0, m1

    vextractf128 xm1, m0, 1

    movlps   [outq + strideq*1], xm0
    movhps   [outq + strideq*2], xm0
    movlps   [outq + stride3q], xm1
    movhps   [outq + strideq*4], xm1
%endmacro

INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
    shl strideq, 3

    movaps   xm5, [exptabq + 480 + 16*0]
    movaps   xm6, [exptabq + 480 + 16*1]
    movaps   xm7, [sign_adjust_5]

    FFT5  0, xm8, 11
    FFT5  8, xm9, 12
    FFT5 16, xm10, 13

%define stride3q inq
    lea stride3q, [strideq + strideq*2]
    lea stride5q, [strideq + strideq*4]

    BUTTERFLIES_DC (8*6 + 4*0)*2*4
    BUTTERFLIES_AC (8*0 + 0*0)*2*4

    add outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*1)*2*4
    BUTTERFLIES_AC (8*2 + 0*0)*2*4

    add outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*2)*2*4
    BUTTERFLIES_AC (8*4 + 0*0)*2*4

    RET

%endif ; ARCH_X86_64
;*******************************************************************************************************
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
;*******************************************************************************************************
%macro LUT_LOAD_4D 3
    mov      r4d, [lutq + %3q*4 + 0]
    movsd    xmm%1, [inq + r4q*8]
    mov      r4d, [lutq + %3q*4 + 4]
    movhps   xmm%1, [inq + r4q*8]
%if cpuflag(avx2)
    mov      r4d, [lutq + %3q*4 + 8]
    movsd    %2, [inq + r4q*8]
    mov      r4d, [lutq + %3q*4 + 12]
    movhps   %2, [inq + r4q*8]
    vinsertf128 %1, %1, %2, 1
%endif
%endmacro

%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n

    xor      offset_nq, offset_nq
    lea      offset_pq, [len8q*2 - %1]

    movaps   m7, [sign_adjust_r]

%if cpuflag(avx2)
    movaps   m8, [perm_pos]
    movaps   m9, [perm_neg]
%endif

.loop:
    movups   m0, [expq + offset_pq*8] ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
    movups   m1, [expq + offset_nq*8] ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im

    LUT_LOAD_4D m3, xm4, offset_p     ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
    LUT_LOAD_4D m4, xm5, offset_n     ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im

    mulps    m5, m3, m0               ; in[p].reim * exp[p].reim
    mulps    m6, m4, m1               ; in[n].reim * exp[n].reim

    xorps    m5, m7                   ; in[p].re *= -1, in[p].im *= 1
    xorps    m6, m7                   ; in[n].re *= -1, in[n].im *= 1

    shufps   m3, m3, m3, q2301        ; in[p].imre
    shufps   m4, m4, m4, q2301        ; in[n].imre

    mulps    m3, m0                   ; in[p].imre * exp[p].reim
    mulps    m4, m1                   ; in[n].imre * exp[n].reim

    haddps   m3, m6                   ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
    haddps   m5, m4                   ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im

%if cpuflag(avx2)
    vpermps  m3, m9, m3               ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
    vpermps  m5, m8, m5               ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
%else
    shufps   m3, m3, m3, q0312
    shufps   m5, m5, m5, q2130
%endif

    movups   [outq + offset_nq*8], m3
    movups   [outq + offset_pq*8], m5

    sub      offset_pq, %1
    add      offset_nq, %1
    cmp      offset_nq, offset_pq
    jle .loop

    REP_RET
%endmacro

INIT_XMM sse3
POSTROTATE_FN 2

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif
548  trunk/3rdparty/ffmpeg-4.2-fit/libavcodec/x86/sbrdsp.asm  (vendored, new file)

@@ -0,0 +1,548 @@

;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask    times 2 dd 1<<31, 0
ps_mask2   times 2 dd 0, 1<<31
ps_mask3   dd 0, 0, 0, 1<<31
ps_noise0  times 2 dd 1.0, 0.0,
ps_noise2  times 2 dd -1.0, 0.0
ps_noise13 dd 0.0, 1.0, 0.0, -1.0
           dd 0.0, -1.0, 0.0, 1.0
           dd 0.0, 1.0, 0.0, -1.0
cextern sbr_noise_table
cextern ps_neg

SECTION .text

INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov      r2d, r1d
    xorps    m0, m0
    xorps    m1, m1
    sar      r2, 3
    jz       .prepare
.loop:
    movu     m2, [r0 +  0]
    movu     m3, [r0 + 16]
    movu     m4, [r0 + 32]
    movu     m5, [r0 + 48]
    mulps    m2, m2
    mulps    m3, m3
    mulps    m4, m4
    mulps    m5, m5
    addps    m0, m2
    addps    m1, m3
    addps    m0, m4
    addps    m1, m5
    add      r0, 64
    dec      r2
    jnz      .loop
.prepare:
    and      r1, 7
    sar      r1, 1
    jz       .end
    ; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu     m2, [r0]
    add      r0, 16
    mulps    m2, m2
    dec      r1
    addps    m0, m2
    jnz      .endloop
.end:
    addps    m0, m1
    movhlps  m2, m0
    addps    m0, m2
    movss    m1, m0
    shufps   m0, m0, 1
    addss    m0, m1
%if ARCH_X86_64 == 0
    movss    r0m, m0
    fld      dword r0m
%endif
    RET
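The horizontal adds at .end collapse the two vector accumulators into one scalar, so the routine returns the energy of n complex samples. A one-loop C sketch (illustrative, not from this commit):

/* sbr_sum_square: sum of |x[i]|^2 over n complex (re, im) samples. */
static float sbr_sum_square_ref(float (*x)[2], int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
    return sum;
}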
%define STEP 40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
    lea      r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov      r5, r3
    and      r3, 0xFC
    lea      r2, [r2 + r3*4]
    lea      r0, [r0 + r3*8]
    neg      r3
    jz       .loop1
.loop4:
    movlps   m0, [r2 + 4*r3 + 0]
    movlps   m1, [r2 + 4*r3 + 8]
    movlps   m2, [r1 + 0*STEP]
    movlps   m3, [r1 + 2*STEP]
    movhps   m2, [r1 + 1*STEP]
    movhps   m3, [r1 + 3*STEP]
    unpcklps m0, m0
    unpcklps m1, m1
    mulps    m0, m2
    mulps    m1, m3
    movu     [r0 + 8*r3 +  0], m0
    movu     [r0 + 8*r3 + 16], m1
    add      r1, 4*STEP
    add      r3, 4
    jnz      .loop4
    and      r5, 3 ; number of single element loops
    jz       .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss    m0, [r2]
    movlps   m2, [r1]
    unpcklps m0, m0
    mulps    m2, m0
    movlps   [r0], m2
    add      r0, 8
    add      r2, 4
    add      r1, STEP
    dec      r5
    jnz      .loop1
.end:
    RET
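This routine applies one real gain per row to complex X_high samples taken at a fixed column, stepping 40 complex elements (STEP = 40*4*2 bytes) between rows. A scalar sketch; the parameter names and the [40][2] inner dimension are inferred from the pointer arithmetic above and are assumptions:

/* sbr_hf_g_filt: Y[m] = X_high[m][ixh] * g_filt[m]. */
static void sbr_hf_g_filt_ref(float (*Y)[2], const float (*X_high)[40][2],
                              const float *g_filt, int m_max, int ixh)
{
    for (int m = 0; m < m_max; m++) {
        Y[m][0] = X_high[m][ixh][0] * g_filt[m];
        Y[m][1] = X_high[m][ixh][1] * g_filt[m];
    }
}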
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
|
||||
; const float alpha0[2], const float alpha1[2],
|
||||
; float bw, int start, int end)
|
||||
;
|
||||
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
|
||||
; load alpha factors
|
||||
%define bw m0
|
||||
%if ARCH_X86_64 == 0 || WIN64
|
||||
movss bw, BWm
|
||||
%endif
|
||||
movlps m2, [alpha1q]
|
||||
movlps m1, [alpha0q]
|
||||
shufps bw, bw, 0
|
||||
mulps m2, bw ; (a1[0] a1[1])*bw
|
||||
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
|
||||
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
|
||||
mova m3, m1
|
||||
mova m4, m2
|
||||
|
||||
; Set pointers
|
||||
%if ARCH_X86_64 == 0 || WIN64
|
||||
; start and end 6th and 7th args on stack
|
||||
mov r2d, Sm
|
||||
mov r3d, Em
|
||||
DEFINE_ARGS X_high, X_low, start, end
|
||||
%else
|
||||
; BW does not actually occupy a register, so shift by 1
|
||||
DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
|
||||
movsxd startq, startd
|
||||
movsxd endq, endd
|
||||
%endif
|
||||
sub startq, endq ; neg num of loops
|
||||
lea X_highq, [X_highq + endq*2*4]
|
||||
lea X_lowq, [X_lowq + endq*2*4 - 2*2*4]
|
||||
shl startq, 3 ; offset from num loops
|
||||
|
||||
mova m0, [X_lowq + startq]
|
||||
shufps m3, m3, q1111
|
||||
shufps m4, m4, q1111
|
||||
xorps m3, [ps_mask]
|
||||
shufps m1, m1, q0000
|
||||
shufps m2, m2, q0000
|
||||
xorps m4, [ps_mask]
|
||||
.loop2:
|
||||
movu m7, [X_lowq + startq + 8] ; BbCc
|
||||
mova m6, m0
|
||||
mova m5, m7
|
||||
shufps m0, m0, q2301 ; aAbB
|
||||
shufps m7, m7, q2301 ; bBcC
|
||||
mulps m0, m4
|
||||
mulps m7, m3
|
||||
mulps m6, m2
|
||||
mulps m5, m1
|
||||
addps m7, m0
|
||||
mova m0, [X_lowq + startq + 16] ; CcDd
|
||||
addps m7, m0
|
||||
addps m6, m5
|
||||
addps m7, m6
|
||||
mova [X_highq + startq], m7
|
||||
add startq, 16
|
||||
jnz .loop2
|
||||
RET
|
||||
|
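The prototype comment pins down the semantics: each output is a second-order complex linear prediction of X_low, with alpha0 scaled by bw and alpha1 by bw*bw, exactly what the three mulps at the top precompute. A scalar sketch:

/* Scalar sketch: X_high[i] = X_low[i] + (alpha0*bw)*X_low[i-1]
 * + (alpha1*bw^2)*X_low[i-2], as complex multiplies. */
static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
                         const float alpha0[2], const float alpha1[2],
                         float bw, int start, int end)
{
    float a0 = alpha1[0] * bw * bw, a1 = alpha1[1] * bw * bw;
    float a2 = alpha0[0] * bw,      a3 = alpha0[1] * bw;
    for (int i = start; i < end; i++) {
        X_high[i][0] = X_low[i-2][0] * a0 - X_low[i-2][1] * a1 +
                       X_low[i-1][0] * a2 - X_low[i-1][1] * a3 + X_low[i][0];
        X_high[i][1] = X_low[i-2][1] * a0 + X_low[i-2][0] * a1 +
                       X_low[i-1][1] * a2 + X_low[i-1][0] * a3 + X_low[i][1];
    }
}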
cglobal sbr_sum64x5, 1,2,4,z
    lea    r1q, [zq+ 256]
.loop:
    mova   m0, [zq+   0]
    mova   m2, [zq+  16]
    mova   m1, [zq+ 256]
    mova   m3, [zq+ 272]
    addps  m0, [zq+ 512]
    addps  m2, [zq+ 528]
    addps  m1, [zq+ 768]
    addps  m3, [zq+ 784]
    addps  m0, [zq+1024]
    addps  m2, [zq+1040]
    addps  m0, m1
    addps  m2, m3
    mova   [zq], m0
    mova   [zq+16], m2
    add    zq, 32
    cmp    zq, r1q
    jne    .loop
    REP_RET
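sbr_sum64x5 folds five consecutive 64-float stripes of z in place (byte offsets 0, 256, 512, 768, 1024), eight floats per iteration. Scalar sketch:

/* Scalar sketch: each of the first 64 floats accumulates four stripes. */
static void sbr_sum64x5_c(float *z)
{
    for (int k = 0; k < 64; k++)
        z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
}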
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea       r2q, [zq + (64-4)*4]
    mova       m3, [ps_neg]
.loop:
    mova       m1, [zq]
    xorps      m0, m3, [r2q]
    shufps     m0, m0, m0, q0123
    unpcklps   m2, m0, m1
    unpckhps   m0, m0, m1
    mova       [Wq +  0], m2
    mova       [Wq + 16], m0
    add        Wq, 32
    sub        r2q, 16
    add        zq, 16
    cmp        zq, r2q
    jl         .loop
    REP_RET
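The shuffle pairs the sign-flipped, reversed top half of z with its forward bottom half into interleaved W pairs (ps_neg being a sign-bit mask). Scalar sketch:

/* Scalar sketch of the post-QMF shuffle. */
static void sbr_qmf_post_shuffle_c(float W[32][2], const float *z)
{
    const float *z1 = z + 63;
    for (int i = 0; i < 32; i++) {
        W[i][0] = -*z1--;
        W[i][1] =  *z++;
    }
}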
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea        r1q, [zq+256]
.loop:
    mova       m0, [zq+ 0]
    mova       m1, [zq+16]
    mova       m2, [zq+32]
    mova       m3, [zq+48]
    xorps      m0, [ps_mask2]
    xorps      m1, [ps_mask2]
    xorps      m2, [ps_mask2]
    xorps      m3, [ps_mask2]
    mova       [zq+ 0], m0
    mova       [zq+16], m1
    mova       [zq+32], m2
    mova       [zq+48], m3
    add        zq, 64
    cmp        zq, r1q
    jne        .loop
    REP_RET
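ps_mask2 is presumably a sign-bit mask over the odd lanes, so each xorps negates every second float of the 64-element block. Scalar sketch:

/* Scalar sketch: negate the odd-indexed floats of x[0..63]. */
static void sbr_neg_odd_64_c(float *x)
{
    for (int i = 1; i < 64; i += 2)
        x[i] = -x[i];   /* the asm flips the IEEE sign bit instead */
}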
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov        cq, 64*4-2*mmsize
    lea        vrevq, [vq + 64*4]
.loop:
    mova       m0, [src0q+cq]
    mova       m1, [src1q]
    mova       m4, [src0q+cq+mmsize]
    mova       m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd     m2, m0, q0123
    pshufd     m3, m1, q0123
    pshufd     m6, m4, q0123
    pshufd     m7, m5, q0123
%else
    shufps     m2, m0, m0, q0123
    shufps     m3, m1, m1, q0123
    shufps     m6, m4, m4, q0123
    shufps     m7, m5, m5, q0123
%endif
    addps      m5, m2
    subps      m0, m7
    addps      m1, m6
    subps      m4, m3
    mova       [vrevq], m1
    mova       [vrevq+mmsize], m5
    mova       [vq+cq], m0
    mova       [vq+cq+mmsize], m4
    add        src1q, 2*mmsize
    add        vrevq, 2*mmsize
    sub        cq, 2*mmsize
    jge        .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY
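The butterfly writes differences into the lower half of v and sums, reversed, into the upper half; the SSE instantiation emulates the SSE2 pshufd reversal with shufps. Scalar sketch:

/* Scalar sketch of the deinterleaving butterfly. */
static void sbr_qmf_deint_bfly_c(float *v, const float *src0, const float *src1)
{
    for (int i = 0; i < 64; i++) {
        v[i]       = src0[i] - src1[63 - i];
        v[127 - i] = src0[i] + src1[63 - i];
    }
}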
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET  (32*4-2*mmsize)
    mov       r3q, OFFSET
    lea       r1q, [zq + (32+1)*4]
    lea       r2q, [zq + 64*4]
    mova       m5, [ps_neg]
.loop:
    movu       m0, [r1q]
    movu       m2, [r1q + mmsize]
    movu       m1, [zq + r3q + 4 + mmsize]
    movu       m3, [zq + r3q + 4]

    pxor       m2, m5
    pxor       m0, m5
    pshufd     m2, m2, q0123
    pshufd     m0, m0, q0123
    SBUTTERFLY dq, 2, 3, 4
    SBUTTERFLY dq, 0, 1, 4
    mova  [r2q + 2*r3q + 0*mmsize], m2
    mova  [r2q + 2*r3q + 1*mmsize], m3
    mova  [r2q + 2*r3q + 2*mmsize], m0
    mova  [r2q + 2*r3q + 3*mmsize], m1
    add       r1q, 2*mmsize
    sub       r3q, 2*mmsize
    jge       .loop
    movq       m2, [zq]
    movq      [r2q], m2
    REP_RET
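Reading the pointer setup: the trailing movq copies z[0..1] to z[64..65], and the loop fills the rest of z[64..127] by interleaving a negated, reversed slice of the first half with a forward slice starting at z[33]. A scalar sketch under that reading (an assumption, not verified against the C fallback):

/* Scalar sketch (assumed) of the pre-QMF shuffle into z[64..127]. */
static void sbr_qmf_pre_shuffle_c(float *z)
{
    z[64] = z[0];
    z[65] = z[1];
    for (int k = 1; k < 32; k++) {
        z[64 + 2*k]     = -z[64 - k];
        z[64 + 2*k + 1] =  z[k + 1];
    }
}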
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

%macro LOAD_NST 1
%ifdef PIC
    lea  NOISE_TABLE, [%1]
    mova          m0, [kxq + NOISE_TABLE]
%else
    mova          m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise0]
    jmp apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13
    jmp apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise2]
    jmp apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13+16

apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov       kxd, m_maxm
    DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
    movsxdifnidn noiseq, noised
    dec    noiseq
    shl    countd, 2
%ifdef PIC
    lea NOISE_TABLE, [sbr_noise_table]
%endif
    lea        Yq, [Yq + 2*countq]
    add      s_mq, countq
    add   q_filtq, countq
    shl    noiseq, 3
    pxor       m5, m5
    neg    countq
.loop:
    mova       m1, [q_filtq + countq]
    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add    noiseq, 2*mmsize
    and    noiseq, 0x1ff<<3
    punpckhdq  m2, m1, m1
    punpckldq  m1, m1
    mulps      m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova       m3, [s_mq + countq]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq  m4, m3, m3
    punpckldq  m3, m3
    pcmpeqd    m6, m3, m5 ; m6 == 0
    pcmpeqd    m7, m4, m5 ; m7 == 0
    mulps      m3, m0 ; s_m[m] * phi_sign
    mulps      m4, m0 ; s_m[m] * phi_sign
    pand       m1, m6
    pand       m2, m7
    movu       m6, [Yq + 2*countq]
    movu       m7, [Yq + 2*countq + mmsize]
    addps      m3, m1
    addps      m4, m2
    addps      m6, m3
    addps      m7, m4
    movu       [Yq + 2*countq], m6
    movu       [Yq + 2*countq + mmsize], m7
    add    countq, mmsize
    jl         .loop
    RET
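All four entry points share apply_noise_main and differ only in the phase constant loaded into m0 (ps_noise0, ps_noise2, or a kx-parity-selected half of ps_noise13). The pcmpeqd/pand pair is a branchless form of the scalar kernel below: the table term is masked out wherever s_m[m] is nonzero, and s_m[m]*phi is zero in the opposite case. A sketch (table declaration assumed; 512 complex entries per the 0x1ff wrap above):

extern const float ff_sbr_noise_table[512][2];

/* Scalar sketch of the shared noise-application kernel. */
static void sbr_hf_apply_noise(float (*Y)[2], const float *s_m,
                               const float *q_filt, int noise,
                               float phi_sign0, float phi_sign1, int m_max)
{
    for (int m = 0; m < m_max; m++) {
        noise = (noise + 1) & 0x1ff;
        if (s_m[m]) {
            Y[m][0] += s_m[m] * phi_sign0;
            Y[m][1] += s_m[m] * phi_sign1;
        } else {
            Y[m][0] += q_filt[m] * ff_sbr_noise_table[noise][0];
            Y[m][1] += q_filt[m] * ff_sbr_noise_table[noise][1];
        }
    }
}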
INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mov        cq, -COUNT
    lea     vrevq, [vq + OFFSET + COUNT]
    add        vq, OFFSET-mmsize
    add      srcq, 2*COUNT
    mova       m3, [ps_neg]
.loop:
    mova       m0, [srcq + 2*cq + 0*mmsize]
    mova       m1, [srcq + 2*cq + 1*mmsize]
    shufps     m2, m0, m1, q2020
    shufps     m1, m0, q1313
    xorps      m2, m3
    mova       [vq], m1
    mova       [vrevq + cq], m2
    sub        vq, mmsize
    add        cq, mmsize
    jl         .loop
    REP_RET
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov   cntq, 37*8
    add     xq, cntq
    neg   cntq

%if cpuflag(sse3)
%define   MOVH  movsd
    movddup m5, [xq+cntq]
%else
%define   MOVH  movlps
    movlps  m5, [xq+cntq]
    movlhps m5, m5
%endif
    MOVH    m7, [xq+cntq+8 ]
    MOVH    m1, [xq+cntq+16]
    shufps  m7, m7, q0110
    shufps  m1, m1, q0110
    mulps   m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps   m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
    mulps   m5, m1     ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps  [rsp   ], m3
    movaps  [rsp+16], m4
    add   cntq, 8

    MOVH    m2, [xq+cntq+16]
    movlhps m7, m7
    shufps  m2, m2, q0110
    mulps   m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps   m4, m7, m2
    mulps   m7, m7     ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
    addps   m5, m4     ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]

align 16
.loop:
    add   cntq, 8
    MOVH    m0, [xq+cntq+16]
    movlhps m1, m1
    shufps  m0, m0, q0110
    mulps   m3, m1, m2
    mulps   m4, m1, m0
    mulps   m1, m1
    addps   m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    add   cntq, 8
    MOVH    m1, [xq+cntq+16]
    movlhps m2, m2
    shufps  m1, m1, q0110
    mulps   m3, m2, m0
    mulps   m4, m2, m1
    mulps   m2, m2
    addps   m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    add   cntq, 8
    MOVH    m2, [xq+cntq+16]
    movlhps m0, m0
    shufps  m2, m2, q0110
    mulps   m3, m0, m1
    mulps   m4, m0, m2
    mulps   m0, m0
    addps   m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    jl .loop

    movlhps m1, m1
    mulps   m2, m1
    mulps   m1, m1
    addps   m2, m6       ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
    addps   m1, m7       ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
    addps   m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
    addps   m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];

    xorps   m2, [ps_mask3]
    xorps   m5, [ps_mask3]
    xorps   m6, [ps_mask3]
    HADDPS  m2, m5, m3
    HADDPS  m7, m6, m4
%if cpuflag(sse3)
    movshdup m0, m1
%else
    movss   m0, m1
    shufps  m1, m1, q0001
%endif
    addss   m1, m0
    movaps  [phiq     ], m2
    movhps  [phiq+0x18], m7
    movss   [phiq+0x28], m7
    movss   [phiq+0x10], m1
    RET
%endmacro

INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/aarch64/Makefile (vendored, new file, 4 lines)
@ -0,0 +1,4 @@
OBJS += aarch64/cpu.o            \
        aarch64/float_dsp_init.o \

NEON-OBJS += aarch64/float_dsp_neon.o
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/arm/Makefile (vendored, new file, 8 lines)
@ -0,0 +1,8 @@
OBJS += arm/cpu.o                \
        arm/float_dsp_init_arm.o \

VFP-OBJS += arm/float_dsp_init_vfp.o \
            arm/float_dsp_vfp.o      \

NEON-OBJS += arm/float_dsp_init_neon.o \
             arm/float_dsp_neon.o      \
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/ffversion.h (deleted; now generated at build time, see the .gitignore change)
@ -1,5 +0,0 @@
/* Automatically generated by version.sh, do not manually edit! */
#ifndef AVUTIL_FFVERSION_H
#define AVUTIL_FFVERSION_H
#define FFMPEG_VERSION ""
#endif /* AVUTIL_FFVERSION_H */
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/Makefile (vendored, new file, 18 lines)
@ -0,0 +1,18 @@
OBJS += x86/cpu.o            \
        x86/fixed_dsp_init.o \
        x86/float_dsp_init.o \
        x86/imgutils_init.o  \
        x86/lls_init.o       \

OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \

EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o

X86ASM-OBJS += x86/cpuid.o         \
               $(EMMS_OBJS__yes_)  \
               x86/fixed_dsp.o     \
               x86/float_dsp.o     \
               x86/imgutils.o      \
               x86/lls.o           \

X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o \
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/cpuid.asm (vendored, new file, 91 lines)
@ -0,0 +1,91 @@
;*****************************************************************************
;* Copyright (C) 2005-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_cpuid, 5,7
    push rbx
    push  r4
    push  r3
    push  r2
    push  r1
    mov  eax, r0d
    xor  ecx, ecx
    cpuid
    pop   r4
    mov [r4], eax
    pop   r4
    mov [r4], ebx
    pop   r4
    mov [r4], ecx
    pop   r4
    mov [r4], edx
    pop  rbx
    RET
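A hedged usage sketch from the C side, matching the prototype in the banner above (bit positions per the Intel manuals; SSE2 is EDX bit 26 of leaf 1):

#include <stdio.h>

void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx);

int main(void)
{
    int eax, ebx, ecx, edx;
    ff_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); /* leaf 1: feature flags */
    printf("SSE2: %d\n", (edx >> 26) & 1);   /* EDX bit 26 = SSE2 */
    return 0;
}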
;-----------------------------------------------------------------------------
; void ff_cpu_xgetbv(int op, int *eax, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
    push  r2
    push  r1
    mov  ecx, r0d
    xgetbv
    pop   r4
    mov [r4], eax
    pop   r4
    mov [r4], edx
    RET

%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; int ff_cpu_cpuid_test(void)
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal cpu_cpuid_test
    pushfd
    push    ebx
    push    ebp
    push    esi
    push    edi
    pushfd
    pop     eax
    mov     ebx, eax
    xor     eax, 0x200000
    push    eax
    popfd
    pushfd
    pop     eax
    xor     eax, ebx
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
    popfd
    ret
%endif
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/fixed_dsp.asm (vendored, new file, 48 lines)
@ -0,0 +1,48 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void ff_butterflies_fixed(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal butterflies_fixed, 3,3,3, src0, src1, len
    shl       lend, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq

align 16
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    mova        m2, m0
    paddd       m0, m1
    psubd       m2, m1
    mova [src0q + lenq], m0
    mova [src1q + lenq], m2
    add       lenq, mmsize
    jl .loop
    RET
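Despite the float* in the banner (apparently inherited from the float version), paddd/psubd operate on 32-bit fixed-point values. Scalar sketch:

/* Scalar sketch of the in-place fixed-point butterfly. */
static void butterflies_fixed_c(int *src0, int *src1, int len)
{
    for (int i = 0; i < len; i++) {
        int t   = src0[i] - src1[i];
        src0[i] = src0[i] + src1[i];
        src1[i] = t;
    }
}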
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/float_dsp.asm (vendored, new file, 484 lines)
@ -0,0 +1,484 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0, [src0q + lenq + (a+0)*mmsize]
    mova      m1, [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif
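VECTOR_FMUL processes 64 bytes per iteration regardless of register width (the %rep adjusts the unroll), so len is assumed to be a multiple of 16 floats. Scalar meaning:

/* Scalar sketch: element-wise product. */
static void vector_fmul_c(float *dst, const float *src0,
                          const float *src1, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i];
}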
;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    lea lend, [lenq*8 - mmsize*4]
ALIGN 16
.loop:
    movaps    m0, [src0q + lenq + 0*mmsize]
    movaps    m1, [src0q + lenq + 1*mmsize]
    movaps    m2, [src0q + lenq + 2*mmsize]
    movaps    m3, [src0q + lenq + 3*mmsize]
    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
    mulpd     m3, m3, [src1q + lenq + 3*mmsize]
    movaps    [dstq + lenq + 0*mmsize], m0
    movaps    [dstq + lenq + 1*mmsize], m1
    movaps    [dstq + lenq + 2*mmsize], m2
    movaps    [dstq + lenq + 3*mmsize], m3

    sub       lenq, mmsize*4
    jge       .loop
    RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1, [dstq+lenq]
    mova     m2, [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif
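The FMA3 instantiation fuses each mulps/addps pair above into a single fmaddps. Scalar meaning of all variants:

/* Scalar sketch: multiply-accumulate by a scalar. */
static void vector_fmac_scalar_c(float *dst, const float *src,
                                 float mul, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] += src[i] * mul;
}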
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
    VBROADCASTSD   m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
    movaps   m1, [dstq+lenq]
    movaps   m2, [dstq+lenq+1*mmsize]
    movaps   m3, [dstq+lenq+2*mmsize]
    movaps   m4, [dstq+lenq+3*mmsize]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+1*mmsize]
    addpd    m3, m3, [dstq+lenq+2*mmsize]
    addpd    m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
    movaps [dstq+lenq], m1
    movaps [dstq+lenq+1*mmsize], m2
    movaps [dstq+lenq+2*mmsize], m3
    movaps [dstq+lenq+3*mmsize], m4
    sub    lenq, mmsize*4
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-2*mmsize]
.loop:
    mulpd    m1, m0, [srcq+lenq       ]
    mulpd    m2, m0, [srcq+lenq+mmsize]
    movaps [dstq+lenq       ], m1
    movaps [dstq+lenq+mmsize], m2
    sub    lenq, 2*mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif
;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova      [dstq + lenq], m1
    mova      [dstq + len1q], m2
    sub    len1q, mmsize
    add     lenq, mmsize
    jl     .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW
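The two index registers (lenq counting up from -len, len1q counting down) walk the window and inputs toward each other, producing the overlap-add pattern below; the mmsize == 8 branch is the 3dnowext path, hence the femms. A scalar sketch matching that pointer arithmetic:

/* Scalar sketch of the windowed overlap-add. */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    dst  += len;
    win  += len;
    src0 += len;
    for (int i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i], s1 = src1[j];
        float wi = win[i],  wj = win[j];
        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}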
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova      m2, [src2q + lenq]
    mova      m3, [src2q + lenq + mmsize]
    fmaddps   m0, m0, [src1q + lenq], m2
    fmaddps   m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    addps     m0, m0, [src2q + lenq]
    addps     m1, m1, [src2q + lenq + mmsize]
%endif
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
    movaps    m2, [pd_reverse]
%endif
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
    vpermps   m0, m2, [src1q]
    vpermps   m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova      m0, [src1q]
    mova      m1, [src1q + mmsize]
    shufps    m0, m0, q0123
    shufps    m1, m1, q0123
%endif
    mulps     m0, m0, [src0q + lenq + mmsize]
    mulps     m1, m1, [src0q + lenq]
    movaps    [dstq + lenq + mmsize], m0
    movaps    [dstq + lenq], m1
    add       src1q, 2*mmsize
    sub       lenq,  2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl     offsetd, 2
    add     v1q, offsetq
    add     v2q, offsetq
    neg     offsetq
    xorps   xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js      .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%if ARCH_X86_64 == 0
    movss   r0m,  xmm0
    fld     dword r0m
%endif
    RET
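The tail after .loop is a horizontal reduction of the four partial sums; on x86-32 the result is bounced through memory onto the x87 stack (fld) because floats return in st0 there. Scalar meaning:

/* Scalar sketch: dot product. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0f;
    for (int i = 0; i < len; i++)
        p += v1[i] * v2[i];
    return p;
}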
;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl     lend, 2
    add     src0q, lenq
    add     src1q, lenq
    neg     lenq
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    subps   m2, m0, m1
    addps   m0, m0, m1
    mova    [src1q + lenq], m2
    mova    [src0q + lenq], m0
    add     lenq, mmsize
    jl      .loop
    REP_RET
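Scalar sketch of the in-place float butterfly above (sum stays in src0, difference goes to src1):

static void butterflies_float_c(float *src0, float *src1, int len)
{
    for (int i = 0; i < len; i++) {
        float t = src0[i] - src1[i];
        src0[i] = src0[i] + src1[i];
        src1[i] = t;
    }
}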
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/imgutils.asm (vendored, new file, 53 lines)
@ -0,0 +1,53 @@
;*****************************************************************************
;* Copyright 2016 Anton Khirnov
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

INIT_XMM sse4
cglobal image_copy_plane_uc_from, 6, 7, 4, dst, dst_linesize, src, src_linesize, bw, height, rowpos
    add     dstq, bwq
    add     srcq, bwq
    neg     bwq

.row_start:
    mov     rowposq, bwq

.loop:
    movntdqa m0, [srcq + rowposq + 0 * mmsize]
    movntdqa m1, [srcq + rowposq + 1 * mmsize]
    movntdqa m2, [srcq + rowposq + 2 * mmsize]
    movntdqa m3, [srcq + rowposq + 3 * mmsize]

    mova    [dstq + rowposq + 0 * mmsize], m0
    mova    [dstq + rowposq + 1 * mmsize], m1
    mova    [dstq + rowposq + 2 * mmsize], m2
    mova    [dstq + rowposq + 3 * mmsize], m3

    add     rowposq, 4 * mmsize
    jnz     .loop

    add     srcq, src_linesizeq
    add     dstq, dst_linesizeq
    dec     heightd
    jnz     .row_start

    RET
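movntdqa performs streaming (non-temporal) loads, which pay off when the source plane lives in write-combining memory such as a GPU mapping; functionally the routine is a strided row copy of bw bytes per row. Scalar sketch:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch of the uncacheable-source plane copy. */
static void image_copy_plane_uc_from_c(uint8_t *dst, ptrdiff_t dst_linesize,
                                       const uint8_t *src, ptrdiff_t src_linesize,
                                       ptrdiff_t bytewidth, int height)
{
    for (; height > 0; height--) {
        memcpy(dst, src, bytewidth); /* the asm streams 4 xmm loads per step */
        dst += dst_linesize;
        src += src_linesize;
    }
}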
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/lls.asm (vendored, new file, 290 lines)
@ -0,0 +1,290 @@
;******************************************************************************
;* linear least squares model
;*
;* Copyright (c) 2013 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

%define MAX_VARS 32
%define MAX_VARS_ALIGN (MAX_VARS+4)
%define COVAR_STRIDE MAX_VARS_ALIGN*8
%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]

struc LLSModel
    .covariance:  resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
    .coeff:       resq MAX_VARS*MAX_VARS
    .variance:    resq MAX_VARS
    .indep_count: resd 1
endstruc

%macro ADDPD_MEM 2
%if cpuflag(avx)
    vaddpd %2, %2, %1
%else
    addpd  %2, %1
%endif
    mova   %1, %2
%endmacro

INIT_XMM sse2
%define movdqa movaps
cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
%define covarq ctxq
    mov     id, [ctxq + LLSModel.indep_count]
    lea   varq, [varq + iq*8]
    neg     iq
    mov covar2q, covarq
.loopi:
    ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
    mova    m1, [varq + iq*8]
    mova    m3, [varq + iq*8 + 16]
    pshufd  m4, m1, q1010
    pshufd  m5, m1, q3232
    pshufd  m6, m3, q1010
    pshufd  m7, m3, q3232
    mulpd   m0, m1, m4
    mulpd   m1, m1, m5
    lea covarq, [covar2q + 16]
    ADDPD_MEM COVAR(-2,0), m0
    ADDPD_MEM COVAR(-2,1), m1
    lea     jq, [iq + 2]
    cmp     jd, -2
    jg .skip4x4
.loop4x4:
    ; Compute all 16 pairwise products of a 4x4 block
    mulpd   m0, m4, m3
    mulpd   m1, m5, m3
    mulpd   m2, m6, m3
    mulpd   m3, m3, m7
    ADDPD_MEM COVAR(0,0), m0
    ADDPD_MEM COVAR(0,1), m1
    ADDPD_MEM COVAR(0,2), m2
    ADDPD_MEM COVAR(0,3), m3
    mova    m3, [varq + jq*8 + 16]
    mulpd   m0, m4, m3
    mulpd   m1, m5, m3
    mulpd   m2, m6, m3
    mulpd   m3, m3, m7
    ADDPD_MEM COVAR(2,0), m0
    ADDPD_MEM COVAR(2,1), m1
    ADDPD_MEM COVAR(2,2), m2
    ADDPD_MEM COVAR(2,3), m3
    mova    m3, [varq + jq*8 + 32]
    add covarq, 32
    add     jq, 4
    cmp     jd, -2
    jle .loop4x4
.skip4x4:
    test    jd, jd
    jg .skip2x4
    mulpd   m4, m3
    mulpd   m5, m3
    mulpd   m6, m3
    mulpd   m7, m3
    ADDPD_MEM COVAR(0,0), m4
    ADDPD_MEM COVAR(0,1), m5
    ADDPD_MEM COVAR(0,2), m6
    ADDPD_MEM COVAR(0,3), m7
.skip2x4:
    add     iq, 4
    add covar2q, 4*COVAR_STRIDE+32
    cmp     id, -2
    jle .loopi
    test    id, id
    jg .ret
    mov     jq, iq
%define covarq covar2q
.loop2x1:
    movsd   m0, [varq + iq*8]
    movlhps m0, m0
    mulpd   m0, [varq + jq*8]
    ADDPD_MEM COVAR(0,0), m0
    inc     iq
    add covarq, COVAR_STRIDE
    test    id, id
    jle .loop2x1
.ret:
    REP_RET
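Both the SSE2 version above and the AVX/FMA3 macro below accumulate the outer product of one sample vector into the upper triangle of the covariance matrix, blocked 2x2 (SSE2) or 4x4 (AVX). Scalar meaning, with a minimal C mirror of the LLSModel struc above (MAX_VARS 32, MAX_VARS_ALIGN 36):

typedef struct LLSModel {
    double covariance[36][36];
    double coeff[32][32];
    double variance[32];
    int    indep_count;
} LLSModel;

/* Scalar sketch: rank-1 update of the upper covariance triangle. */
static void update_lls_c(LLSModel *m, const double *var)
{
    for (int i = 0; i <= m->indep_count; i++)
        for (int j = i; j <= m->indep_count; j++)
            m->covariance[i][j] += var[i] * var[j];
}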
%macro UPDATE_LLS 0
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
%define covarq ctxq
    mov  countd, [ctxq + LLSModel.indep_count]
    lea count2d, [countq-2]
    xor      id, id
.loopi:
    ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
    mova    ymm1, [varq + iq*8]
    vbroadcastsd ymm4, [varq + iq*8]
    vbroadcastsd ymm5, [varq + iq*8 + 8]
    vbroadcastsd ymm6, [varq + iq*8 + 16]
    vbroadcastsd ymm7, [varq + iq*8 + 24]
    vextractf128 xmm3, ymm1, 1
%if cpuflag(fma3)
    mova    ymm0, COVAR(iq  ,0)
    mova    xmm2, COVAR(iq+2,2)
    fmaddpd ymm0, ymm1, ymm4, ymm0
    fmaddpd xmm2, xmm3, xmm6, xmm2
    fmaddpd ymm1, ymm5, ymm1, COVAR(iq  ,1)
    fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
    mova    COVAR(iq  ,0), ymm0
    mova    COVAR(iq  ,1), ymm1
    mova    COVAR(iq+2,2), xmm2
    mova    COVAR(iq+2,3), xmm3
%else
    vmulpd  ymm0, ymm1, ymm4
    vmulpd  ymm1, ymm1, ymm5
    vmulpd  xmm2, xmm3, xmm6
    vmulpd  xmm3, xmm3, xmm7
    ADDPD_MEM COVAR(iq  ,0), ymm0
    ADDPD_MEM COVAR(iq  ,1), ymm1
    ADDPD_MEM COVAR(iq+2,2), xmm2
    ADDPD_MEM COVAR(iq+2,3), xmm3
%endif ; cpuflag(fma3)
    lea      jd, [iq + 4]
    cmp      jd, count2d
    jg .skip4x4
.loop4x4:
    ; Compute all 16 pairwise products of a 4x4 block
    mova    ymm3, [varq + jq*8]
%if cpuflag(fma3)
    mova    ymm0, COVAR(jq, 0)
    mova    ymm1, COVAR(jq, 1)
    mova    ymm2, COVAR(jq, 2)
    fmaddpd ymm0, ymm3, ymm4, ymm0
    fmaddpd ymm1, ymm3, ymm5, ymm1
    fmaddpd ymm2, ymm3, ymm6, ymm2
    fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
    mova    COVAR(jq, 0), ymm0
    mova    COVAR(jq, 1), ymm1
    mova    COVAR(jq, 2), ymm2
    mova    COVAR(jq, 3), ymm3
%else
    vmulpd  ymm0, ymm3, ymm4
    vmulpd  ymm1, ymm3, ymm5
    vmulpd  ymm2, ymm3, ymm6
    vmulpd  ymm3, ymm3, ymm7
    ADDPD_MEM COVAR(jq,0), ymm0
    ADDPD_MEM COVAR(jq,1), ymm1
    ADDPD_MEM COVAR(jq,2), ymm2
    ADDPD_MEM COVAR(jq,3), ymm3
%endif ; cpuflag(fma3)
    add      jd, 4
    cmp      jd, count2d
    jle .loop4x4
.skip4x4:
    cmp      jd, countd
    jg .skip2x4
    mova    xmm3, [varq + jq*8]
%if cpuflag(fma3)
    mova    xmm0, COVAR(jq, 0)
    mova    xmm1, COVAR(jq, 1)
    mova    xmm2, COVAR(jq, 2)
    fmaddpd xmm0, xmm3, xmm4, xmm0
    fmaddpd xmm1, xmm3, xmm5, xmm1
    fmaddpd xmm2, xmm3, xmm6, xmm2
    fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
    mova    COVAR(jq, 0), xmm0
    mova    COVAR(jq, 1), xmm1
    mova    COVAR(jq, 2), xmm2
    mova    COVAR(jq, 3), xmm3
%else
    vmulpd  xmm0, xmm3, xmm4
    vmulpd  xmm1, xmm3, xmm5
    vmulpd  xmm2, xmm3, xmm6
    vmulpd  xmm3, xmm3, xmm7
    ADDPD_MEM COVAR(jq,0), xmm0
    ADDPD_MEM COVAR(jq,1), xmm1
    ADDPD_MEM COVAR(jq,2), xmm2
    ADDPD_MEM COVAR(jq,3), xmm3
%endif ; cpuflag(fma3)
.skip2x4:
    add      id, 4
    add  covarq, 4*COVAR_STRIDE
    cmp      id, count2d
    jle .loopi
    cmp      id, countd
    jg .ret
    mov      jd, id
.loop2x1:
    vmovddup xmm0, [varq + iq*8]
%if cpuflag(fma3)
    mova     xmm1, [varq + jq*8]
    fmaddpd  xmm0, xmm1, xmm0, COVAR(jq,0)
    mova     COVAR(jq,0), xmm0
%else
    vmulpd   xmm0, [varq + jq*8]
    ADDPD_MEM COVAR(jq,0), xmm0
%endif ; cpuflag(fma3)
    inc      id
    add  covarq, COVAR_STRIDE
    cmp      id, countd
    jle .loop2x1
.ret:
    REP_RET
%endmacro ; UPDATE_LLS

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
UPDATE_LLS
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
UPDATE_LLS
%endif
INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
    ; This function is often called on the same buffer as update_lls, but with
    ; an offset. They can't both be aligned.
    ; Load halves rather than movu to avoid store-forwarding stalls, since the
    ; input was initialized immediately prior to this function using scalar math.
%define coefsq ctxq
    mov     id, orderd
    imul    orderd, MAX_VARS
    lea     coefsq, [ctxq + LLSModel.coeff + orderq*8]
    movsd   m0, [varq]
    movhpd  m0, [varq + 8]
    mulpd   m0, [coefsq]
    lea     coefsq, [coefsq + iq*8]
    lea     varq, [varq + iq*8]
    neg     iq
    add     iq, 2
.loop:
    movsd   m1, [varq + iq*8]
    movhpd  m1, [varq + iq*8 + 8]
    mulpd   m1, [coefsq + iq*8]
    addpd   m0, m1
    add     iq, 2
    jl .loop
    jg .skip1
    movsd   m1, [varq + iq*8]
    mulsd   m1, [coefsq + iq*8]
    addpd   m0, m1
.skip1:
    movhlps m1, m0
    addsd   m0, m1
%if ARCH_X86_32
    movsd   r0m, m0
    fld     qword r0m
%endif
    RET
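evaluate_lls is the matching dot product of a sample against one coefficient row; the movsd/movhpd half-loads implement the store-forwarding note in the comments. Scalar sketch (reusing the LLSModel mirror from the update_lls sketch):

/* Scalar sketch: evaluate the model of the given order. */
static double evaluate_lls_c(LLSModel *m, const double *param, int order)
{
    double out = 0.0;
    for (int i = 0; i <= order; i++)
        out += param[i] * m->coeff[order][i];
    return out;
}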
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/x86inc.asm (vendored, new file, 1701 lines; diff suppressed because it is too large)
trunk/3rdparty/ffmpeg-4.2-fit/libavutil/x86/x86util.asm (vendored, new file, 1028 lines; diff suppressed because it is too large)
trunk/3rdparty/ffmpeg-4.2-fit/libswresample/aarch64/Makefile (vendored, new file, 7 lines)
@ -0,0 +1,7 @@
OBJS += aarch64/audio_convert_init.o \
        aarch64/resample_init.o

OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o

NEON-OBJS += aarch64/audio_convert_neon.o \
             aarch64/resample.o
trunk/3rdparty/ffmpeg-4.2-fit/libswresample/arm/Makefile (vendored, new file, 8 lines)
@ -0,0 +1,8 @@
OBJS += arm/audio_convert_init.o \
        arm/resample_init.o


OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o

NEON-OBJS += arm/audio_convert_neon.o \
             arm/resample.o
trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/Makefile (vendored, new file, 9 lines)
@ -0,0 +1,9 @@
X86ASM-OBJS += x86/audio_convert.o\
               x86/rematrix.o\
               x86/resample.o\

OBJS += x86/audio_convert_init.o\
        x86/rematrix_init.o\
        x86/resample_init.o\

OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/audio_convert.asm (vendored, new file, 739 lines)
@ -0,0 +1,739 @@
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
flt2pm31: times 8 dd 4.6566129e-10
flt2p31 : times 8 dd 2147483648.0
flt2p15 : times 8 dd 32768.0

word_unpack_shuf : db  0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15

SECTION .text


;to, from, a/u, log2_outsize, log_intsize, const
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
    mov src2q   , [srcq+gprsize]
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
    jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
    jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
    jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    lea srcq , [srcq  + (1<<%5)*lenq]
    lea src2q, [src2q + (1<<%5)*lenq]
    lea dstq , [dstq  + (2<<%4)*lenq]
    neg lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mova      m1, m0
    mov%3     m2, [         src2q+(1<<%5)*lenq]
%if %5 == 1
    punpcklwd m0, m2
    punpckhwd m1, m2
%else
    punpckldq m0, m2
    punpckhdq m1, m2
%endif
    %6 m0,m1,m2,m3,m4,m5
%else
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mov%3     m1, [mmsize + srcq +(1<<%5)*lenq]
    mov%3     m2, [         src2q+(1<<%5)*lenq]
    mov%3     m3, [mmsize + src2q+(1<<%5)*lenq]
    %6 m0,m1,m2,m3,m4,m5
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    SWAP 1,2
%endif
    mov%3 [           dstq+(2<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(2<<%4)*lenq], m1
%if %4 > %5
    mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
    add lenq, 4*mmsize/(2<<%4)
%else
    add lenq, 2*mmsize/(2<<%4)
%endif
    jl .next
    REP_RET
%endmacro
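PACK_2CH stamps out one interleaver per (source format, destination format, alignment) triple; %4/%5 are log2 element sizes and %6/%7 are pluggable conversion macros. For the same-format float case the scalar meaning is plain two-channel interleaving (signature assumed):

/* Scalar sketch: interleave two planar channels into one packed stream. */
static void pack_2ch_float_c(float *dst, const float *const src[2], int len)
{
    for (int i = 0; i < len; i++) {
        dst[2*i]     = src[0][i];
        dst[2*i + 1] = src[1][i];
    }
}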
||||
%macro UNPACK_2CH 5-7
|
||||
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
|
||||
mov dst2q , [dstq+gprsize]
|
||||
mov srcq , [srcq]
|
||||
mov dstq , [dstq]
|
||||
%ifidn %3, a
|
||||
test dstq, mmsize-1
|
||||
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test srcq, mmsize-1
|
||||
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test dst2q, mmsize-1
|
||||
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
|
||||
%else
|
||||
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
|
||||
%endif
|
||||
lea srcq , [srcq + (2<<%5)*lenq]
|
||||
lea dstq , [dstq + (1<<%4)*lenq]
|
||||
lea dst2q, [dst2q + (1<<%4)*lenq]
|
||||
neg lenq
|
||||
%7 m0,m1,m2,m3,m4,m5
|
||||
mova m6, [word_unpack_shuf]
|
||||
.next:
|
||||
mov%3 m0, [ srcq +(2<<%5)*lenq]
|
||||
mov%3 m2, [ mmsize + srcq +(2<<%5)*lenq]
|
||||
%if %5 == 1
|
||||
%ifidn SUFFIX, _ssse3
|
||||
pshufb m0, m6
|
||||
mova m1, m0
|
||||
pshufb m2, m6
|
||||
punpcklqdq m0,m2
|
||||
punpckhqdq m1,m2
|
||||
%else
|
||||
mova m1, m0
|
||||
punpcklwd m0,m2
|
||||
punpckhwd m1,m2
|
||||
|
||||
mova m2, m0
|
||||
punpcklwd m0,m1
|
||||
punpckhwd m2,m1
|
||||
|
||||
mova m1, m0
|
||||
punpcklwd m0,m2
|
||||
punpckhwd m1,m2
|
||||
%endif
|
||||
%else
|
||||
mova m1, m0
|
||||
shufps m0, m2, 10001000b
|
||||
shufps m1, m2, 11011101b
|
||||
%endif
|
||||
%if %4 < %5
|
||||
mov%3 m2, [2*mmsize + srcq +(2<<%5)*lenq]
|
||||
mova m3, m2
|
||||
mov%3 m4, [3*mmsize + srcq +(2<<%5)*lenq]
|
||||
shufps m2, m4, 10001000b
|
||||
shufps m3, m4, 11011101b
|
||||
SWAP 1,2
|
||||
%endif
|
||||
%6 m0,m1,m2,m3,m4,m5
|
||||
mov%3 [ dstq+(1<<%4)*lenq], m0
|
||||
%if %4 > %5
|
||||
mov%3 [ dst2q+(1<<%4)*lenq], m2
|
||||
mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1
|
||||
mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3
|
||||
add lenq, 2*mmsize/(1<<%4)
|
||||
%else
|
||||
mov%3 [ dst2q+(1<<%4)*lenq], m1
|
||||
add lenq, mmsize/(1<<%4)
|
||||
%endif
|
||||
jl .next
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
%macro CONV 5-7
|
||||
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
|
||||
mov srcq , [srcq]
|
||||
mov dstq , [dstq]
|
||||
%ifidn %3, a
|
||||
test dstq, mmsize-1
|
||||
jne %2_to_%1_u_int %+ SUFFIX
|
||||
test srcq, mmsize-1
|
||||
jne %2_to_%1_u_int %+ SUFFIX
|
||||
%else
|
||||
%2_to_%1_u_int %+ SUFFIX:
|
||||
%endif
|
||||
lea srcq , [srcq + (1<<%5)*lenq]
|
||||
lea dstq , [dstq + (1<<%4)*lenq]
|
||||
neg lenq
|
||||
%7 m0,m1,m2,m3,m4,m5
|
||||
.next:
|
||||
mov%3 m0, [ srcq +(1<<%5)*lenq]
|
||||
mov%3 m1, [ mmsize + srcq +(1<<%5)*lenq]
|
||||
%if %4 < %5
|
||||
mov%3 m2, [2*mmsize + srcq +(1<<%5)*lenq]
|
||||
mov%3 m3, [3*mmsize + srcq +(1<<%5)*lenq]
|
||||
%endif
|
||||
%6 m0,m1,m2,m3,m4,m5
|
||||
mov%3 [ dstq+(1<<%4)*lenq], m0
|
||||
mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1
|
||||
%if %4 > %5
|
||||
mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2
|
||||
mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3
|
||||
add lenq, 4*mmsize/(1<<%4)
|
||||
%else
|
||||
add lenq, 2*mmsize/(1<<%4)
|
||||
%endif
|
||||
jl .next
|
||||
%if mmsize == 8
|
||||
emms
|
||||
RET
|
||||
%else
|
||||
REP_RET
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro PACK_6CH 8
|
||||
cglobal pack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, src1, src2, src3, src4, src5, len
|
||||
%if ARCH_X86_64
|
||||
mov lend, r2d
|
||||
%else
|
||||
%define lend dword r2m
|
||||
%endif
|
||||
mov src1q, [srcq+1*gprsize]
|
||||
mov src2q, [srcq+2*gprsize]
|
||||
mov src3q, [srcq+3*gprsize]
|
||||
mov src4q, [srcq+4*gprsize]
|
||||
mov src5q, [srcq+5*gprsize]
|
||||
mov srcq, [srcq]
|
||||
mov dstq, [dstq]
|
||||
%ifidn %3, a
|
||||
test dstq, mmsize-1
|
||||
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test srcq, mmsize-1
|
||||
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test src1q, mmsize-1
|
||||
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test src2q, mmsize-1
|
||||
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test src3q, mmsize-1
|
||||
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test src4q, mmsize-1
|
||||
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test src5q, mmsize-1
|
||||
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
%else
|
||||
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
|
||||
%endif
|
||||
sub src1q, srcq
|
||||
sub src2q, srcq
|
||||
sub src3q, srcq
|
||||
sub src4q, srcq
|
||||
sub src5q, srcq
|
||||
%8 x,x,x,x,m7,x
|
||||
.loop:
|
||||
mov%3 m0, [srcq ]
|
||||
mov%3 m1, [srcq+src1q]
|
||||
mov%3 m2, [srcq+src2q]
|
||||
mov%3 m3, [srcq+src3q]
|
||||
mov%3 m4, [srcq+src4q]
|
||||
mov%3 m5, [srcq+src5q]
|
||||
%if cpuflag(sse)
|
||||
SBUTTERFLYPS 0, 1, 6
|
||||
SBUTTERFLYPS 2, 3, 6
|
||||
SBUTTERFLYPS 4, 5, 6
|
||||
|
||||
%if cpuflag(avx)
|
||||
blendps m6, m4, m0, 1100b
|
||||
%else
|
||||
movaps m6, m4
|
||||
shufps m4, m0, q3210
|
||||
SWAP 4,6
|
||||
%endif
|
||||
movlhps m0, m2
|
||||
movhlps m4, m2
|
||||
%if cpuflag(avx)
|
||||
blendps m2, m5, m1, 1100b
|
||||
%else
|
||||
movaps m2, m5
|
||||
shufps m5, m1, q3210
|
||||
SWAP 2,5
|
||||
%endif
|
||||
movlhps m1, m3
|
||||
movhlps m5, m3
|
||||
|
||||
%7 m0,m6,x,x,m7,m3
|
||||
%7 m4,m1,x,x,m7,m3
|
||||
%7 m2,m5,x,x,m7,m3
|
||||
|
||||
mov %+ %3 %+ ps [dstq ], m0
|
||||
mov %+ %3 %+ ps [dstq+16], m6
|
||||
mov %+ %3 %+ ps [dstq+32], m4
|
||||
mov %+ %3 %+ ps [dstq+48], m1
|
||||
mov %+ %3 %+ ps [dstq+64], m2
|
||||
mov %+ %3 %+ ps [dstq+80], m5
|
||||
%else ; mmx
|
||||
SBUTTERFLY dq, 0, 1, 6
|
||||
SBUTTERFLY dq, 2, 3, 6
|
||||
SBUTTERFLY dq, 4, 5, 6
|
||||
|
||||
movq [dstq ], m0
|
||||
movq [dstq+ 8], m2
|
||||
movq [dstq+16], m4
|
||||
movq [dstq+24], m1
|
||||
movq [dstq+32], m3
|
||||
movq [dstq+40], m5
|
||||
%endif
|
||||
add srcq, mmsize
|
||||
add dstq, mmsize*6
|
||||
sub lend, mmsize/4
|
||||
jg .loop
|
||||
%if mmsize == 8
|
||||
emms
|
||||
RET
|
||||
%else
|
||||
REP_RET
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro UNPACK_6CH 8
|
||||
cglobal unpack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, dst1, dst2, dst3, dst4, dst5, len
|
||||
%if ARCH_X86_64
|
||||
mov lend, r2d
|
||||
%else
|
||||
%define lend dword r2m
|
||||
%endif
|
||||
mov dst1q, [dstq+1*gprsize]
|
||||
mov dst2q, [dstq+2*gprsize]
|
||||
mov dst3q, [dstq+3*gprsize]
|
||||
mov dst4q, [dstq+4*gprsize]
|
||||
mov dst5q, [dstq+5*gprsize]
|
||||
mov dstq, [dstq]
|
||||
mov srcq, [srcq]
|
||||
%ifidn %3, a
|
||||
test dstq, mmsize-1
|
||||
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test srcq, mmsize-1
|
||||
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test dst1q, mmsize-1
|
||||
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test dst2q, mmsize-1
|
||||
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test dst3q, mmsize-1
|
||||
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test dst4q, mmsize-1
|
||||
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
test dst5q, mmsize-1
|
||||
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
|
||||
%else
|
||||
unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
|
||||
%endif
|
||||
sub dst1q, dstq
|
||||
sub dst2q, dstq
|
||||
sub dst3q, dstq
|
||||
sub dst4q, dstq
|
||||
sub dst5q, dstq
|
||||
%8 x,x,x,x,m7,x
|
||||
.loop:
|
||||
mov%3 m0, [srcq ]
|
||||
mov%3 m1, [srcq+16]
|
||||
mov%3 m2, [srcq+32]
|
||||
mov%3 m3, [srcq+48]
|
||||
mov%3 m4, [srcq+64]
|
||||
mov%3 m5, [srcq+80]
|
||||
|
||||
SBUTTERFLYPS 0, 3, 6
|
||||
SBUTTERFLYPS 1, 4, 6
|
||||
SBUTTERFLYPS 2, 5, 6
|
||||
SBUTTERFLYPS 0, 4, 6
|
||||
SBUTTERFLYPS 3, 2, 6
|
||||
SBUTTERFLYPS 1, 5, 6
|
||||
SWAP 1, 4
|
||||
SWAP 2, 3

    %7 m0,m1,x,x,m7,m6
    %7 m2,m3,x,x,m7,m6
    %7 m4,m5,x,x,m7,m6

    mov %+ %3 %+ ps [dstq      ], m0
    mov %+ %3 %+ ps [dstq+dst1q], m1
    mov %+ %3 %+ ps [dstq+dst2q], m2
    mov %+ %3 %+ ps [dstq+dst3q], m3
    mov %+ %3 %+ ps [dstq+dst4q], m4
    mov %+ %3 %+ ps [dstq+dst5q], m5

    add srcq, mmsize*6
    add dstq, mmsize
    sub lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)

%macro PACK_8CH 8
cglobal pack_8ch_%2_to_%1_%3, 2, PACK_8CH_GPRS, %6, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
    mov dstq, [dstq]
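    ; x86-32 has too few GPRs for eight source pointers plus dst/src/len, so
    ; src1 and src7 (and src4 when the stack is not aligned) live in the
    ; 48-byte stack scratch area and time-share r0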
%if ARCH_X86_32
    DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
    %define lend dword r2m
    %define src1q r0q
    %define src1m dword [rsp+32]
%if HAVE_ALIGNED_STACK == 0
    DEFINE_ARGS dst, src, src2, src3, src5, src6
    %define src4q r0q
    %define src4m dword [rsp+36]
%endif
    %define src7q r0q
    %define src7m dword [rsp+40]
    mov dstm, dstq
%endif
    mov src7q, [srcq+7*gprsize]
    mov src6q, [srcq+6*gprsize]
%if ARCH_X86_32
    mov src7m, src7q
%endif
    mov src5q, [srcq+5*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src3q, [srcq+3*gprsize]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov src4m, src4q
%endif
    mov src2q, [srcq+2*gprsize]
    mov src1q, [srcq+1*gprsize]
    mov srcq,  [srcq]
%ifidn %3, a
%if ARCH_X86_32
    test dstmp, mmsize-1
%else
    test dstq, mmsize-1
%endif
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src1q, mmsize-1
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src3q, mmsize-1
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    test src4m, mmsize-1
%else
    test src4q, mmsize-1
%endif
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src5q, mmsize-1
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src6q, mmsize-1
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32
    test src7m, mmsize-1
%else
    test src7q, mmsize-1
%endif
    jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_8ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
%if ARCH_X86_64 || HAVE_ALIGNED_STACK
    sub src4q, srcq
%else
    sub src4m, srcq
%endif
    sub src5q, srcq
    sub src6q, srcq
%if ARCH_X86_64
    sub src7q, srcq
%else
    mov src1m, src1q
    sub src7m, srcq
%endif

%if ARCH_X86_64
    %8 x,x,x,x,m9,x
%elifidn %1, int32
    %define m9 [flt2p31]
%else
    %define m9 [flt2pm31]
%endif

.loop:
    mov%3 m0, [srcq      ]
    mov%3 m1, [srcq+src1q]
    mov%3 m2, [srcq+src2q]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov src4q, src4m
%endif
    mov%3 m3, [srcq+src3q]
    mov%3 m4, [srcq+src4q]
    mov%3 m5, [srcq+src5q]
%if ARCH_X86_32
    mov src7q, src7m
%endif
    mov%3 m6, [srcq+src6q]
    mov%3 m7, [srcq+src7q]

%if ARCH_X86_64
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8

    %7 m0,m1,x,x,m9,m8
    %7 m2,m3,x,x,m9,m8
    %7 m4,m5,x,x,m9,m8
    %7 m6,m7,x,x,m9,m8

    mov%3 [dstq], m0
%else
    mov dstq, dstm

    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1

    %7 m0,m1,x,x,m9,m2
    mova m2, [rsp]
    mov%3 [dstq], m0
    %7 m2,m3,x,x,m9,m0
    %7 m4,m5,x,x,m9,m0
    %7 m6,m7,x,x,m9,m0

%endif

    mov%3 [dstq+16], m1
    mov%3 [dstq+32], m2
    mov%3 [dstq+48], m3
    mov%3 [dstq+64], m4
    mov%3 [dstq+80], m5
    mov%3 [dstq+96], m6
    mov%3 [dstq+112], m7

    add srcq, mmsize
    add dstq, mmsize*8
%if ARCH_X86_32
    mov dstm, dstq
    mov src1q, src1m
%endif
    sub lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

%macro INT16_TO_INT32_N 6
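    ; widen the packed s16 samples in m0/m1 to s32 in m0..m3 by interleaving
    ; them into the high half of zeroed words, i.e. each output dword is sample<<16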
    pxor m2, m2
    pxor m3, m3
    punpcklwd m2, m1
    punpckhwd m3, m1
    SWAP 4,0
    pxor m0, m0
    pxor m1, m1
    punpcklwd m0, m4
    punpckhwd m1, m4
%endmacro

%macro INT32_TO_INT16_N 6
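    ; narrow s32 (m0..m3) back to s16: the arithmetic shift keeps the top 16
    ; bits of each sample, and packssdw repacks the dwords into words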
    psrad m0, 16
    psrad m1, 16
    psrad m2, 16
    psrad m3, 16
    packssdw m0, m1
    packssdw m2, m3
    SWAP 1,2
%endmacro

%macro INT32_TO_FLOAT_INIT 6
    mova %5, [flt2pm31]
%endmacro
%macro INT32_TO_FLOAT_N 6
    cvtdq2ps %1, %1
    cvtdq2ps %2, %2
    mulps %1, %1, %5
    mulps %2, %2, %5
%endmacro

%macro FLOAT_TO_INT32_INIT 6
    mova %5, [flt2p31]
%endmacro
%macro FLOAT_TO_INT32_N 6
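    ; cvtps2dq saturates out-of-range inputs to INT32_MIN (0x80000000);
    ; cmpps with predicate 5 (not-less-than, i.e. >= 2^31) produces an
    ; all-ones mask exactly for positive overflow, so the paddd turns
    ; INT32_MIN into INT32_MAX there and is a no-op everywhere else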
    mulps %1, %5
    mulps %2, %5
    cvtps2dq %6, %1
    cmpps %1, %1, %5, 5
    paddd %1, %6
    cvtps2dq %6, %2
    cmpps %2, %2, %5, 5
    paddd %2, %6
%endmacro

%macro INT16_TO_FLOAT_INIT 6
    mova m5, [flt2pm31]
%endmacro
%macro INT16_TO_FLOAT_N 6
    INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    mulps m0, m0, m5
    mulps m1, m1, m5
    mulps m2, m2, m5
    mulps m3, m3, m5
%endmacro

%macro FLOAT_TO_INT16_INIT 6
    mova m5, [flt2p15]
%endmacro
%macro FLOAT_TO_INT16_N 6
    mulps m0, m5
    mulps m1, m5
    mulps m2, m5
    mulps m3, m5
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    packssdw m0, m1
    cvtps2dq m1, m2
    cvtps2dq m3, m3
    packssdw m1, m3
%endmacro

%macro NOP_N 0-6
%endmacro

INIT_MMX mmx
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_6CH float, float, u, 2, 2, 0, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, 0, NOP_N, NOP_N

INIT_XMM sse
PACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N

INIT_XMM sse2
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N

PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N

PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N

PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

INIT_YMM avx
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
%endif
250
trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/rematrix.asm
vendored
Normal file
@ -0,0 +1,250 @@
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
dw1: times 8  dd 1
w1:  times 16 dw 1

SECTION .text

%macro MIX2_FLT 1
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
    jne mix_2_1_float_u_int %+ SUFFIX
    test in2q, mmsize-1
    jne mix_2_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
    jne mix_2_1_float_u_int %+ SUFFIX
%else
mix_2_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m4, [coeffpq + 4*index1q]
    VBROADCASTSS m5, [coeffpq + 4*index2q]
    shl lend, 2
    add in1q, lenq
    add in2q, lenq
    add outq, lenq
    neg lenq
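    ; lenq counts up from -len*4 to 0, so the data pointers need no separate
    ; counter; each pass emits 2*mmsize bytes of
    ; out[i] = coeff[index1]*in1[i] + coeff[index2]*in2[i]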
.next:
%ifidn %1, a
    mulps m0, m4, [in1q + lenq         ]
    mulps m1, m5, [in2q + lenq         ]
    mulps m2, m4, [in1q + lenq + mmsize]
    mulps m3, m5, [in2q + lenq + mmsize]
%else
    movu m0, [in1q + lenq         ]
    movu m1, [in2q + lenq         ]
    movu m2, [in1q + lenq + mmsize]
    movu m3, [in2q + lenq + mmsize]
    mulps m0, m0, m4
    mulps m1, m1, m5
    mulps m2, m2, m4
    mulps m3, m3, m5
%endif
    addps m0, m0, m1
    addps m2, m2, m3
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m2
    add lenq, mmsize*2
    jl .next
    REP_RET
%endmacro

%macro MIX1_FLT 1
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
%ifidn %1, a
    test inq, mmsize-1
    jne mix_1_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
    jne mix_1_1_float_u_int %+ SUFFIX
%else
mix_1_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m2, [coeffpq + 4*indexq]
    shl lenq, 2
    add inq, lenq
    add outq, lenq
    neg lenq
.next:
%ifidn %1, a
    mulps m0, m2, [inq + lenq         ]
    mulps m1, m2, [inq + lenq + mmsize]
%else
    movu m0, [inq + lenq         ]
    movu m1, [inq + lenq + mmsize]
    mulps m0, m0, m2
    mulps m1, m1, m2
%endif
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m1
    add lenq, mmsize*2
    jl .next
    REP_RET
%endmacro

%macro MIX1_INT16 1
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
%ifidn %1, a
    test inq, mmsize-1
    jne mix_1_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
    jne mix_1_1_int16_u_int %+ SUFFIX
%else
mix_1_1_int16_u_int %+ SUFFIX:
%endif
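    ; the coefficient dword is fixed-point: bits 0-15 hold the s16 factor and
    ; bits 16-31 the right-shift amount; m5 is filled with
    ; (factor, 1<<(shift-1)) word pairs so that one pmaddwd against
    ; (sample, 1) pairs multiplies and rounds in a single step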
    movd m4, [coeffpq + 4*indexq]
    SPLATW m5, m4
    psllq m4, 32
    psrlq m4, 48
    mova m0, [w1]
    psllw m0, m4
    psrlw m0, 1
    punpcklwd m5, m0
    add lenq, lenq
    add inq, lenq
    add outq, lenq
    neg lenq
.next:
    mov%1 m0, [inq + lenq         ]
    mov%1 m2, [inq + lenq + mmsize]
    mova m1, m0
    mova m3, m2
    punpcklwd m0, [w1]
    punpckhwd m1, [w1]
    punpcklwd m2, [w1]
    punpckhwd m3, [w1]
    pmaddwd m0, m5
    pmaddwd m1, m5
    pmaddwd m2, m5
    pmaddwd m3, m5
    psrad m0, m4
    psrad m1, m4
    psrad m2, m4
    psrad m3, m4
    packssdw m0, m1
    packssdw m2, m3
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m2
    add lenq, mmsize*2
    jl .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

%macro MIX2_INT16 1
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
    jne mix_2_1_int16_u_int %+ SUFFIX
    test in2q, mmsize-1
    jne mix_2_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
    jne mix_2_1_int16_u_int %+ SUFFIX
%else
mix_2_1_int16_u_int %+ SUFFIX:
%endif
    movd m4, [coeffpq + 4*index1q]
    movd m6, [coeffpq + 4*index2q]
    SPLATW m5, m4
    SPLATW m6, m6
    psllq m4, 32
    psrlq m4, 48
    mova m7, [dw1]
    pslld m7, m4
    psrld m7, 1
    punpcklwd m5, m6
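    ; m5 now holds (coeff1, coeff2) word pairs, so pmaddwd against
    ; (in1, in2) sample pairs evaluates both taps at once; m7 carries the
    ; dword rounding bias 1<<(shift-1) added before the psrad below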
    add lend, lend
    add in1q, lenq
    add in2q, lenq
    add outq, lenq
    neg lenq
.next:
    mov%1 m0, [in1q + lenq         ]
    mov%1 m2, [in2q + lenq         ]
    mova m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2

    mov%1 m2, [in1q + lenq + mmsize]
    mov%1 m6, [in2q + lenq + mmsize]
    mova m3, m2
    punpcklwd m2, m6
    punpckhwd m3, m6

    pmaddwd m0, m5
    pmaddwd m1, m5
    pmaddwd m2, m5
    pmaddwd m3, m5
    paddd m0, m7
    paddd m1, m7
    paddd m2, m7
    paddd m3, m7
    psrad m0, m4
    psrad m1, m4
    psrad m2, m4
    psrad m3, m4
    packssdw m0, m1
    packssdw m2, m3
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m2
    add lenq, mmsize*2
    jl .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro


INIT_MMX mmx
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

INIT_XMM sse
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a

INIT_XMM sse2
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
%endif
619
trunk/3rdparty/ffmpeg-4.2-fit/libswresample/x86/resample.asm
vendored
Normal file
@ -0,0 +1,619 @@
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc ResampleContext
    .av_class:              pointer 1
    .filter_bank:           pointer 1
    .filter_length:         resd 1
    .filter_alloc:          resd 1
    .ideal_dst_incr:        resd 1
    .dst_incr:              resd 1
    .dst_incr_div:          resd 1
    .dst_incr_mod:          resd 1
    .index:                 resd 1
    .frac:                  resd 1
    .src_incr:              resd 1
    .compensation_distance: resd 1
    .phase_count:           resd 1

    ; there are a few more fields after these, but we only care about the ones above
endstruc

SECTION_RODATA

pf_1:      dd 1.0
pdbl_1:    dq 1.0
pd_0x4000: dd 0x4000

SECTION .text

; FIXME remove unneeded variables (index_incr, phase_mask)
%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
; int resample_common_$format(ResampleContext *ctx, $format *dst,
;                             const $format *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_count, index, frac, \
                                      dst_incr_mod, size, min_filter_count_x4, \
                                      min_filter_len_x4, dst_incr_div, src_incr, \
                                      phase_mask, dst_end, filter_bank

    ; use red-zone for variable storage
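    ; (the 128-byte red zone below rsp is a unix64 ABI guarantee; the code
    ; relies on the slots surviving because the function body makes no calls)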
    %define ctx_stackq            [rsp-0x8]
    %define src_stackq            [rsp-0x10]
%if WIN64
    %define update_context_stackd r4m
%else ; unix64
    %define update_context_stackd [rsp-0x14]
%endif

    ; load as many variables in registers as possible; for the rest, store
    ; on stack so that we have 'ctx' available as one extra register
    mov sized, r3d
%if UNIX64
    mov update_context_stackd, r4d
%endif
    mov indexd, [ctxq+ResampleContext.index]
    mov fracd, [ctxq+ResampleContext.frac]
    mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
    mov filter_bankq, [ctxq+ResampleContext.filter_bank]
    mov src_incrd, [ctxq+ResampleContext.src_incr]
    mov ctx_stackq, ctxq
    mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
    mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
    shl min_filter_len_x4d, %3
    lea dst_endq, [dstq+sizeq*%2]

%if UNIX64
    mov ecx, [ctxq+ResampleContext.phase_count]
    mov edi, [ctxq+ResampleContext.filter_alloc]

    DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
    mov R9d, [ctxq+ResampleContext.filter_alloc]
    mov ecx, [ctxq+ResampleContext.phase_count]

    DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%endif

    neg min_filter_len_x4q
    sub filter_bankq, min_filter_len_x4q
    sub srcq, min_filter_len_x4q
    mov src_stackq, srcq
%else ; x86-32
cglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \
                                     index, min_filter_length_x4, filter_bank

    ; push temp variables to stack
    %define ctx_stackq            r0mp
    %define src_stackq            r2mp
    %define update_context_stackd r4m

    mov dstq, r1mp
    mov r3, r3mp
    lea r3, [dstq+r3*%2]
    PUSH dword [ctxq+ResampleContext.dst_incr_div]
    PUSH dword [ctxq+ResampleContext.dst_incr_mod]
    PUSH dword [ctxq+ResampleContext.filter_alloc]
    PUSH r3
    PUSH dword [ctxq+ResampleContext.phase_count] ; unneeded replacement for phase_mask
    PUSH dword [ctxq+ResampleContext.src_incr]
    mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
    mov indexd, [ctxq+ResampleContext.index]
    shl min_filter_length_x4d, %3
    mov fracd, [ctxq+ResampleContext.frac]
    neg min_filter_length_x4q
    mov filter_bankq, [ctxq+ResampleContext.filter_bank]
    sub r2mp, min_filter_length_x4q
    sub filter_bankq, min_filter_length_x4q
    PUSH min_filter_length_x4q
    PUSH filter_bankq
    mov phase_countd, [ctxq+ResampleContext.phase_count]

    DEFINE_ARGS src, phase_count, dst, frac, index, min_filter_count_x4, filter

    %define filter_bankq          dword [rsp+0x0]
    %define min_filter_length_x4q dword [rsp+0x4]
    %define src_incrd             dword [rsp+0x8]
    %define phase_maskd           dword [rsp+0xc]
    %define dst_endq              dword [rsp+0x10]
    %define filter_allocd         dword [rsp+0x14]
    %define dst_incr_modd         dword [rsp+0x18]
    %define dst_incr_divd         dword [rsp+0x1c]

    mov srcq, r2mp
%endif

.loop:
    mov filterd, filter_allocd
    imul filterd, indexd
%if ARCH_X86_64
    mov min_filter_count_x4q, min_filter_len_x4q
    lea filterq, [filter_bankq+filterq*%2]
%else ; x86-32
    mov min_filter_count_x4q, filter_bankq
    lea filterq, [min_filter_count_x4q+filterq*%2]
    mov min_filter_count_x4q, min_filter_length_x4q
%endif
%ifidn %1, int16
    movd m0, [pd_0x4000]
%else ; float/double
    xorps m0, m0, m0
%endif

    align 16
.inner_loop:
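    ; dot product of the source window with the current polyphase filter;
    ; min_filter_count_x4q runs from -filter_length*bytes_per_sample up to 0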
    movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
    vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
    pmaddwd m1, [filterq+min_filter_count_x4q*1]
    paddd m0, m1
%endif
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
    fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
    mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
    addp%4 m0, m0, m1
%endif ; cpuflag
%endif
    add min_filter_count_x4q, mmsize
    js .inner_loop

%ifidn %1, int16
    HADDD m0, m1
    psrad m0, 15
    add fracd, dst_incr_modd
    packssdw m0, m0
    add indexd, dst_incr_divd
    movd [dstq], m0
%else ; float/double
    ; horizontal sum & store
%if mmsize == 32
    vextractf128 xm1, m0, 0x1
    addp%4 xm0, xm1
%endif
    movhlps xm1, xm0
%ifidn %1, float
    addps xm0, xm1
    shufps xm1, xm0, xm0, q0001
%endif
    add fracd, dst_incr_modd
    addp%4 xm0, xm1
    add indexd, dst_incr_divd
    movs%4 [dstq], xm0
%endif
    cmp fracd, src_incrd
    jl .skip
    sub fracd, src_incrd
    inc indexd

%if UNIX64
    DEFINE_ARGS filter_alloc, dst, src, phase_count, index, frac, dst_incr_mod, \
                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
    DEFINE_ARGS phase_count, dst, src, filter_alloc, index, frac, dst_incr_mod, \
                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%else ; x86-32
    DEFINE_ARGS src, phase_count, dst, frac, index, index_incr
%endif

.skip:
    add dstq, %2
    cmp indexd, phase_countd
    jb .index_skip
.index_while:
    sub indexd, phase_countd
    lea srcq, [srcq+%2]
    cmp indexd, phase_countd
    jnb .index_while
.index_skip:
    cmp dstq, dst_endq
    jne .loop

%if ARCH_X86_64
    DEFINE_ARGS ctx, dst, src, phase_count, index, frac
%else ; x86-32
    DEFINE_ARGS src, ctx, update_context, frac, index
%endif

    cmp dword update_context_stackd, 0
    jz .skip_store
    ; strictly speaking, the function should always return the consumed
    ; number of bytes; however, we only use the value if update_context
    ; is true, so let's just leave it uninitialized otherwise
    mov ctxq, ctx_stackq
    movifnidn rax, srcq
    mov [ctxq+ResampleContext.frac ], fracd
    sub rax, src_stackq
    mov [ctxq+ResampleContext.index], indexd
    shr rax, %3

.skip_store:
%if ARCH_X86_32
    ADD rsp, 0x20
%endif
    RET

; int resample_linear_$format(ResampleContext *ctx, float *dst,
;                             const float *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
%if UNIX64
cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_count, index, frac, \
                                      size, dst_incr_mod, min_filter_count_x4, \
                                      min_filter_len_x4, dst_incr_div, src_incr, \
                                      src, dst_end, filter_bank

    mov srcq, r2mp
%else ; win64
cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_count, index, frac, \
                                      size, dst_incr_mod, min_filter_count_x4, \
                                      min_filter_len_x4, dst_incr_div, src_incr, \
                                      dst, dst_end, filter_bank

    mov dstq, r1mp
%endif

    ; use red-zone for variable storage
    %define ctx_stackq            [rsp-0x8]
    %define src_stackq            [rsp-0x10]
    %define phase_mask_stackd     [rsp-0x14]
%if WIN64
    %define update_context_stackd r4m
%else ; unix64
    %define update_context_stackd [rsp-0x18]
%endif

    ; load as many variables in registers as possible; for the rest, store
    ; on stack so that we have 'ctx' available as one extra register
    mov sized, r3d
%if UNIX64
    mov update_context_stackd, r4d
%endif
    mov indexd, [ctxq+ResampleContext.index]
    mov fracd, [ctxq+ResampleContext.frac]
    mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
    mov filter_bankq, [ctxq+ResampleContext.filter_bank]
    mov src_incrd, [ctxq+ResampleContext.src_incr]
    mov ctx_stackq, ctxq
    mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
%ifidn %1, int16
    movd m4, [pd_0x4000]
%else ; float/double
    cvtsi2s%4 xm0, src_incrd
    movs%4 xm4, [%5]
    divs%4 xm4, xm0
%endif
    mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
    shl min_filter_len_x4d, %3
    lea dst_endq, [dstq+sizeq*%2]

%if UNIX64
    mov ecx, [ctxq+ResampleContext.phase_count]
    mov edi, [ctxq+ResampleContext.filter_alloc]

    DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, filter1, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
    mov R9d, [ctxq+ResampleContext.filter_alloc]
    mov ecx, [ctxq+ResampleContext.phase_count]

    DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, filter1, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, dst, dst_end, filter_bank
%endif

    neg min_filter_len_x4q
    sub filter_bankq, min_filter_len_x4q
    sub srcq, min_filter_len_x4q
    mov src_stackq, srcq
%else ; x86-32
cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
                                     frac, index, dst, filter_bank

    ; push temp variables to stack
    %define ctx_stackq            r0mp
    %define src_stackq            r2mp
    %define update_context_stackd r4m

    mov dstq, r1mp
    mov r3, r3mp
    lea r3, [dstq+r3*%2]
    PUSH dword [ctxq+ResampleContext.dst_incr_div]
    PUSH r3
    mov r3, dword [ctxq+ResampleContext.filter_alloc]
    PUSH dword [ctxq+ResampleContext.dst_incr_mod]
    PUSH r3
    shl r3, %3
    PUSH r3
    mov r3, dword [ctxq+ResampleContext.src_incr]
    PUSH dword [ctxq+ResampleContext.phase_count] ; unneeded replacement of phase_mask
    PUSH r3d
%ifidn %1, int16
    movd m4, [pd_0x4000]
%else ; float/double
    cvtsi2s%4 xm0, r3d
    movs%4 xm4, [%5]
    divs%4 xm4, xm0
%endif
    mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
    mov indexd, [ctxq+ResampleContext.index]
    shl min_filter_length_x4d, %3
    mov fracd, [ctxq+ResampleContext.frac]
    neg min_filter_length_x4q
    mov filter_bankq, [ctxq+ResampleContext.filter_bank]
    sub r2mp, min_filter_length_x4q
    sub filter_bankq, min_filter_length_x4q
    PUSH min_filter_length_x4q
    PUSH filter_bankq
    PUSH dword [ctxq+ResampleContext.phase_count]

    DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src

    %define phase_count_stackd    dword [rsp+0x0]
    %define filter_bankq          dword [rsp+0x4]
    %define min_filter_length_x4q dword [rsp+0x8]
    %define src_incrd             dword [rsp+0xc]
    %define phase_mask_stackd     dword [rsp+0x10]
    %define filter_alloc_x4q      dword [rsp+0x14]
    %define filter_allocd         dword [rsp+0x18]
    %define dst_incr_modd         dword [rsp+0x1c]
    %define dst_endq              dword [rsp+0x20]
    %define dst_incr_divd         dword [rsp+0x24]

    mov srcq, r2mp
%endif

.loop:
    mov filter1d, filter_allocd
    imul filter1d, indexd
%if ARCH_X86_64
    mov min_filter_count_x4q, min_filter_len_x4q
    lea filter1q, [filter_bankq+filter1q*%2]
    lea filter2q, [filter1q+filter_allocq*%2]
%else ; x86-32
    mov min_filter_count_x4q, filter_bankq
    lea filter1q, [min_filter_count_x4q+filter1q*%2]
    mov min_filter_count_x4q, min_filter_length_x4q
    mov filter2q, filter1q
    add filter2q, filter_alloc_x4q
%endif
%ifidn %1, int16
    mova m0, m4
    mova m2, m4
%else ; float/double
    xorps m0, m0, m0
    xorps m2, m2, m2
%endif

    align 16
.inner_loop:
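    ; two dot products at once: filter1 is the current phase and filter2 the
    ; next one; their results are blended by frac below for linear interpolation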
    movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
    vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2
    vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
    pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
    pmaddwd m1, [filter1q+min_filter_count_x4q*1]
    paddd m2, m3
    paddd m0, m1
%endif ; cpuflag
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
    fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
    fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
    mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
    mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
    addp%4 m2, m2, m3
    addp%4 m0, m0, m1
%endif ; cpuflag
%endif
    add min_filter_count_x4q, mmsize
    js .inner_loop

%ifidn %1, int16
%if mmsize == 16
%if cpuflag(xop)
    vphadddq m2, m2
    vphadddq m0, m0
%endif
    pshufd m3, m2, q0032
    pshufd m1, m0, q0032
    paddd m2, m3
    paddd m0, m1
%endif
%if notcpuflag(xop)
    PSHUFLW m3, m2, q0032
    PSHUFLW m1, m0, q0032
    paddd m2, m3
    paddd m0, m1
%endif
    psubd m2, m0
    ; This is probably a really bad idea on Atom and other machines with a
    ; long transfer latency between GPRs and XMMs. However, it does
    ; make the clip a lot simpler...
    movd eax, m2
    add indexd, dst_incr_divd
    imul fracd
    idiv src_incrd
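    ; edx:eax = (v2 - val) * frac from the one-operand imul, divided by
    ; src_incr; the quotient in eax is the linear-interpolation correction term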
    movd m1, eax
    add fracd, dst_incr_modd
    paddd m0, m1
    psrad m0, 15
    packssdw m0, m0
    movd [dstq], m0

    ; note that for imul/idiv, I need to move filter to edx/eax for each:
    ; - 32bit: eax=r0[filter1], edx=r2[filter2]
    ; - win64: eax=r6[filter1], edx=r1[todo]
    ; - unix64: eax=r6[filter1], edx=r2[todo]
%else ; float/double
    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
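    ; xm4 was precomputed as 1.0/src_incr, so the division becomes a
    ; multiply: val += (v2 - val) * frac * (1/src_incr)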
%if mmsize == 32
    vextractf128 xm1, m0, 0x1
    vextractf128 xm3, m2, 0x1
    addp%4 xm0, xm1
    addp%4 xm2, xm3
%endif
    cvtsi2s%4 xm1, fracd
    subp%4 xm2, xm0
    mulp%4 xm1, xm4
    shufp%4 xm1, xm1, q0000
%if cpuflag(fma4) || cpuflag(fma3)
    fmaddp%4 xm0, xm2, xm1, xm0
%else
    mulp%4 xm2, xm1
    addp%4 xm0, xm2
%endif ; cpuflag

    ; horizontal sum & store
    movhlps xm1, xm0
%ifidn %1, float
    addps xm0, xm1
    shufps xm1, xm0, xm0, q0001
%endif
    add fracd, dst_incr_modd
    addp%4 xm0, xm1
    add indexd, dst_incr_divd
    movs%4 [dstq], xm0
%endif
    cmp fracd, src_incrd
    jl .skip
    sub fracd, src_incrd
    inc indexd

%if UNIX64
    DEFINE_ARGS filter_alloc, dst, filter2, phase_count, index, frac, index_incr, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
    DEFINE_ARGS phase_count, filter2, src, filter_alloc, index, frac, index_incr, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
    DEFINE_ARGS filter1, phase_count, index_incr, frac, index, dst, src
%endif

.skip:
%if ARCH_X86_32
    mov phase_countd, phase_count_stackd
%endif
    add dstq, %2
    cmp indexd, phase_countd
    jb .index_skip
.index_while:
    sub indexd, phase_countd
    lea srcq, [srcq+%2]
    cmp indexd, phase_countd
    jnb .index_while
.index_skip:
    cmp dstq, dst_endq
    jne .loop

%if UNIX64
    DEFINE_ARGS ctx, dst, filter2, phase_count, index, frac, index_incr, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
    DEFINE_ARGS ctx, filter2, src, phase_count, index, frac, index_incr, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
    DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
%endif

    cmp dword update_context_stackd, 0
    jz .skip_store
    ; strictly speaking, the function should always return the consumed
    ; number of bytes; however, we only use the value if update_context
    ; is true, so let's just leave it uninitialized otherwise
    mov ctxq, ctx_stackq
    movifnidn rax, srcq
    mov [ctxq+ResampleContext.frac ], fracd
    sub rax, src_stackq
    mov [ctxq+ResampleContext.index], indexd
    shr rax, %3

.skip_store:
%if ARCH_X86_32
    ADD rsp, 0x28
%endif
    RET
%endmacro

INIT_XMM sse
RESAMPLE_FNS float, 4, 2, s, pf_1

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif

%if ARCH_X86_32
INIT_MMX mmxext
RESAMPLE_FNS int16, 2, 1
%endif

INIT_XMM sse2
RESAMPLE_FNS int16, 2, 1
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
RESAMPLE_FNS int16, 2, 1
%endif

INIT_XMM sse2
RESAMPLE_FNS double, 8, 3, d, pdbl_1

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FNS double, 8, 3, d, pdbl_1
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS double, 8, 3, d, pdbl_1
%endif
@ -429,48 +429,14 @@ if [ $SRS_EXPORT_LIBRTMP_PROJECT = NO ]; then
        ln -sf ../3rdparty/ffmpeg-4.2-fit && cd ffmpeg-4.2-fit &&
        PKG_CONFIG_PATH=$ABS_OBJS/opus/lib/pkgconfig ./configure \
            --prefix=`pwd`/_release \
            --pkg-config-flags="--static" \
            --extra-libs=-lpthread \
            --extra-libs=-lm \
            --disable-programs \
            --disable-doc \
            --disable-htmlpages \
            --disable-manpages \
            --disable-podpages \
            --disable-txtpages \
            --disable-avdevice \
            --disable-avformat \
            --disable-swscale \
            --disable-postproc \
            --disable-avfilter \
            --disable-network \
            --disable-dct \
            --disable-dwt \
            --disable-error-resilience \
            --disable-lsp \
            --disable-lzo \
            --disable-faan \
            --disable-pixelutils \
            --disable-hwaccels \
            --disable-devices \
            --disable-audiotoolbox \
            --disable-videotoolbox \
            --disable-appkit \
            --disable-coreimage \
            --disable-avfoundation \
            --disable-securetransport \
            --disable-iconv \
            --disable-lzma \
            --disable-sdl2 \
            --disable-everything \
            --enable-decoder=aac \
            --enable-decoder=aac_fixed \
            --enable-decoder=aac_latm \
            --enable-decoder=libopus \
            --enable-encoder=aac \
            --enable-encoder=opus \
            --enable-encoder=libopus \
            --enable-libopus &&
            --pkg-config-flags="--static" --extra-libs=-lpthread --extra-libs=-lm \
            --disable-programs --disable-doc --disable-htmlpages --disable-manpages --disable-podpages --disable-txtpages \
            --disable-avdevice --disable-avformat --disable-swscale --disable-postproc --disable-avfilter --disable-network \
            --disable-dct --disable-dwt --disable-error-resilience --disable-lsp --disable-lzo --disable-faan --disable-pixelutils \
            --disable-hwaccels --disable-devices --disable-audiotoolbox --disable-videotoolbox --disable-appkit --disable-coreimage \
            --disable-avfoundation --disable-securetransport --disable-iconv --disable-lzma --disable-sdl2 --disable-everything \
            --enable-decoder=aac --enable-decoder=aac_fixed --enable-decoder=aac_latm --enable-decoder=libopus --enable-encoder=aac \
            --enable-encoder=opus --enable-encoder=libopus --enable-libopus &&
        make ${SRS_JOBS} && make install
        cd .. && rm -rf ffmpeg && ln -sf ffmpeg-4.2-fit/_release ffmpeg
    )
5
trunk/configure
vendored
@ -151,7 +151,8 @@ if [[ $SRS_SHARED_ST == YES ]]; then LibSTfile="-lst"; fi
# srtp
LibSrtpRoot="${SRS_OBJS_DIR}/srtp2/include"; LibSrtpFile="${SRS_OBJS_DIR}/srtp2/lib/libsrtp2.a"
# ffmpeg
LibFfmpegRoot="${SRS_OBJS_DIR}/ffmpeg/include"; LibFfmpegFile="${SRS_OBJS_DIR}/ffmpeg/lib/libavcodec.a ${SRS_OBJS_DIR}/ffmpeg/lib/libswresample.a ${SRS_OBJS_DIR}/ffmpeg/lib/libavutil.a ${SRS_OBJS_DIR}/ffmpeg/lib/libopus.a -lpthread"
LibFfmpegRoot="${SRS_OBJS_DIR}/ffmpeg/include"; LibFfmpegFile="${SRS_OBJS_DIR}/ffmpeg/lib/libavcodec.a ${SRS_OBJS_DIR}/ffmpeg/lib/libswresample.a ${SRS_OBJS_DIR}/ffmpeg/lib/libavutil.a -lpthread"
LibFfmpegRoot="${LibFfmpegRoot} ${SRS_OBJS_DIR}/opus/include"; LibFfmpegFile="${LibFfmpegFile} ${SRS_OBJS_DIR}/opus/lib/libopus.a"
# openssl-1.1.0e, for the RTMP complex handshake.
LibSSLRoot="";LibSSLfile=""
if [[ $SRS_SSL == YES && $SRS_USE_SYS_SSL == NO ]]; then

@ -173,7 +174,7 @@ fi
# the link options, always use static link
SrsLinkOptions="-ldl";
if [[ $SRS_SRT == YES ]]; then
    SrsLinkOptions="${SrsLinkOptions} -pthread";
    SrsLinkOptions="${SrsLinkOptions} -lpthread";
fi
if [[ $SRS_SSL == YES && $SRS_USE_SYS_SSL == YES ]]; then
    SrsLinkOptions="${SrsLinkOptions} -lssl -lcrypto";