Mirror of https://github.com/ossrs/srs.git (synced 2025-03-09 15:49:59 +00:00)

Commit 27712fdda7: Rename ffmpeg-4.2-fit to ffmpeg-4-fit
Parent: b19074721c
720 changed files with 14 additions and 14 deletions
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/Makefile (vendored, normal file, 199 lines)
@@ -0,0 +1,199 @@
OBJS += x86/constants.o \

# subsystems
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
        x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
        x86/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
        x86/mpegvideoencdsp_init.o
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o

# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
        x86/sbrdsp_init.o
OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
        x86/vp9dsp_init_10bpp.o \
        x86/vp9dsp_init_12bpp.o \
        x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o


# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o

# decoders/encoders
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o

# subsystems
X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
        x86/ac3dsp_downmix.o
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
        x86/h264_chromamc_10bit.o
X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
        x86/h264_deblock_10bit.o \
        x86/h264_idct.o \
        x86/h264_idct_10bit.o \
        x86/h264_weight.o \
        x86/h264_weight_10bit.o
X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
        x86/h264_intrapred_10bit.o
X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
        x86/h264_qpel_10bit.o \
        x86/fpel.o \
        x86/qpel.o
X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
        x86/hpeldsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
        x86/fpel.o \
        x86/qpel.o
X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
        x86/vc1dsp_mc.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \
        x86/simple_idct.o
X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
        x86/vp8dsp_loopfilter.o

# decoders/encoders
X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
        x86/sbrdsp.o
X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
        x86/dirac_dwt.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
ifdef CONFIG_GPL
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
endif
X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
        x86/hevc_deblock.o \
        x86/hevc_idct.o \
        x86/hevc_mc.o \
        x86/hevc_sao.o \
        x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o
X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
        x86/vp9intrapred_16bpp.o \
        x86/vp9itxfm.o \
        x86/vp9itxfm_16bpp.o \
        x86/vp9lpf.o \
        x86/vp9lpf_16bpp.o \
        x86/vp9mc.o \
        x86/vp9mc_16bpp.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/aacencdsp.asm (vendored, normal file, 86 lines)
@@ -0,0 +1,86 @@
;******************************************************************************
;* SIMD optimized AAC encoder DSP functions
;*
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

float_abs_mask: times 4 dd 0x7fffffff

SECTION .text

;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
    mova      m2, [float_abs_mask]
    shl       sizeq, 2
    add       inq, sizeq
    add       outq, sizeq
    neg       sizeq
.loop:
    andps     m0, m2, [inq+sizeq]
    sqrtps    m1, m0
    mulps     m0, m1
    sqrtps    m0, m0
    mova      [outq+sizeq], m0
    add       sizeq, mmsize
    jl        .loop
    RET

;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
;                           int size, int is_signed, int maxval, const float Q34,
;                           const float rounding)
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
%if UNIX64 == 0
    movss     m0, Q34m
    movss     m1, roundingm
    cvtsi2ss  m3, dword maxvalm
%else
    cvtsi2ss  m3, maxvald
%endif
    shufps    m0, m0, 0
    shufps    m1, m1, 0
    shufps    m3, m3, 0
    shl       is_signedd, 31
    movd      m4, is_signedd
    shufps    m4, m4, 0
    shl       sized, 2
    add       inq, sizeq
    add       outq, sizeq
    add       scaledq, sizeq
    neg       sizeq
.loop:
    mulps     m2, m0, [scaledq+sizeq]
    addps     m2, m1
    minps     m2, m3
    andps     m5, m4, [inq+sizeq]
    orps      m2, m5
    cvttps2dq m2, m2
    mova      [outq+sizeq], m2
    add       sizeq, mmsize
    jl        .loop
    RET
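
The abs_pow34 loop above avoids a pow() call entirely: masking off the sign bit gives |x|, and two square roots with a multiply in between yield |x|^(3/4), since sqrt(|x| * sqrt(|x|)) = (|x|^1.5)^0.5 = |x|^0.75. A scalar C equivalent as an illustrative sketch (the function name and layout are mine, not FFmpeg's):

#include <math.h>

/* Scalar sketch of the SSE loop above: out[i] = |in[i]|^(3/4),
 * computed as sqrt(|x| * sqrt(|x|)). */
static void abs_pow34_c(float *out, const float *in, int size)
{
    for (int i = 0; i < size; i++) {
        float a = fabsf(in[i]);       /* the andps with the sign mask */
        out[i] = sqrtf(a * sqrtf(a)); /* sqrtps, mulps, sqrtps */
    }
}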
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/aacencdsp_init.c (vendored, normal file, 43 lines)
@@ -0,0 +1,43 @@
/*
 * AAC encoder assembly optimizations
 * Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/float_dsp.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/aacenc.h"

void ff_abs_pow34_sse(float *out, const float *in, const int size);

void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
                                int size, int is_signed, int maxval, const float Q34,
                                const float rounding);

av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags))
        s->abs_pow34 = ff_abs_pow34_sse;

    if (EXTERNAL_SSE2(cpu_flags))
        s->quant_bands = ff_aac_quantize_bands_sse2;
}
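
One practical consequence of this runtime dispatch: the C fallbacks and the SIMD versions can be exercised from the same binary. A sketch of a test harness (the harness function is hypothetical; av_force_cpu_flags() is libavutil's API for overriding CPU detection):

#include "libavutil/cpu.h"

/* Hypothetical harness: av_force_cpu_flags() makes av_get_cpu_flags()
 * report the given mask, so re-running the init installs (or skips)
 * the SSE/SSE2 pointers accordingly. Passing -1 restores detection. */
static void init_with_and_without_simd(AACEncContext *s)
{
    av_force_cpu_flags(0);   /* pretend no SIMD: the C defaults stay */
    ff_aac_dsp_init_x86(s);

    av_force_cpu_flags(-1);  /* back to real CPU detection */
    ff_aac_dsp_init_x86(s);  /* the SIMD versions win again */
}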
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/aacpsdsp.asm (vendored, normal file, 487 lines)
@@ -0,0 +1,487 @@
;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

SECTION .text

;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
    shl       nd, 3
    add       srcq, nq
    neg       nq

align 16
.loop:
    movaps    m0, [srcq+nq]
    movaps    m1, [srcq+nq+mmsize]
    mulps     m0, m0
    mulps     m1, m1
    HADDPS    m0, m1, m2
    addps     m0, [dstq]
    movaps    [dstq], m0
    add       dstq, mmsize
    add       nq, mmsize*2
    jl        .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3

;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
;                               float *src1, int n);
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
    shl       nd, 3
    add       src1q, nq
    add       dstq, nq
    neg       nq

align 16
.loop:
    movu      m0, [src1q+nq]
    movu      m1, [src1q+nq+mmsize]
    mova      m2, [src2q]
    mova      m3, m2
    unpcklps  m2, m2
    unpckhps  m3, m3
    mulps     m0, m2
    mulps     m1, m3
    mova      [dstq+nq], m0
    mova      [dstq+nq+mmsize], m1
    add       src2q, mmsize
    add       nq, mmsize*2
    jl        .loop
    REP_RET

;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
;                                   float h[2][4], float h_step[2][4],
;                                   int len);
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
    movaps    m0, [hq]
    movaps    m1, [h_stepq]
    unpcklps  m4, m0, m0
    unpckhps  m0, m0
    unpcklps  m5, m1, m1
    unpckhps  m1, m1
    shl       nd, 3
    add       lq, nq
    add       rq, nq
    neg       nq

align 16
.loop:
    addps     m4, m5
    addps     m0, m1
    movddup   m2, [lq+nq]
    movddup   m3, [rq+nq]
    mulps     m2, m4
    mulps     m3, m0
    addps     m2, m3
    movsd     [lq+nq], m2
    movhps    [rq+nq], m2
    add       nq, 8
    jl        .loop
    REP_RET

;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
;                                       float h[2][4], float h_step[2][4],
;                                       int len);
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
    movaps    m0, [hq]
    movaps    m1, [hq+mmsize]
%if ARCH_X86_64
    movaps    m8, [h_stepq]
    movaps    m9, [h_stepq+mmsize]
    %define H_STEP0 m8
    %define H_STEP1 m9
%else
    %define H_STEP0 [h_stepq]
    %define H_STEP1 [h_stepq+mmsize]
%endif
    shl       nd, 3
    add       lq, nq
    add       rq, nq
    neg       nq

align 16
.loop:
    addps     m0, H_STEP0
    addps     m1, H_STEP1
    movddup   m2, [lq+nq]
    movddup   m3, [rq+nq]
    shufps    m4, m2, m2, q2301
    shufps    m5, m3, m3, q2301
    unpcklps  m6, m0, m0
    unpckhps  m7, m0, m0
    mulps     m2, m6
    mulps     m3, m7
    unpcklps  m6, m1, m1
    unpckhps  m7, m1, m1
    mulps     m4, m6
    mulps     m5, m7
    addps     m2, m3
    addsubps  m2, m4
    addsubps  m2, m5
    movsd     [lq+nq], m2
    movhps    [rq+nq], m2
    add       nq, 8
    jl        .loop
    REP_RET

;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
;                                   float (*in)[32][2],
;                                   int i, int len)
;**********************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
    movsxdifnidn iq, id
    mov       lend, 32 << 3
    lea       inq, [inq+iq*4]
    mov       tmpd, id
    shl       tmpd, 8
    add       outq, tmpq
    mov       tmpd, 64
    sub       tmpd, id
    mov       id, tmpd

    test      id, 1
    jne       .loop4
    test      id, 2
    jne       .loop8

align 16
.loop16:
    mov       in0q, inq
    mov       in1q, 38*64*4
    add       in1q, in0q
    mov       tmpd, lend

.inner_loop16:
    movaps    m0, [in0q]
    movaps    m1, [in1q]
    movaps    m2, [in0q+lenq]
    movaps    m3, [in1q+lenq]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps    [outq], m0
    movaps    [outq+lenq], m1
    movaps    [outq+lenq*2], m2
    movaps    [outq+3*32*2*4], m3
    lea       in0q, [in0q+lenq*2]
    lea       in1q, [in1q+lenq*2]
    add       outq, mmsize
    sub       tmpd, mmsize
    jg        .inner_loop16
    add       inq, 16
    add       outq, 3*32*2*4
    sub       id, 4
    jg        .loop16
    RET

align 16
.loop8:
    mov       in0q, inq
    mov       in1q, 38*64*4
    add       in1q, in0q
    mov       tmpd, lend

.inner_loop8:
    movlps    m0, [in0q]
    movlps    m1, [in1q]
    movhps    m0, [in0q+lenq]
    movhps    m1, [in1q+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    movaps    [outq], m0
    movaps    [outq+lenq], m1
    lea       in0q, [in0q+lenq*2]
    lea       in1q, [in1q+lenq*2]
    add       outq, mmsize
    sub       tmpd, mmsize
    jg        .inner_loop8
    add       inq, 8
    add       outq, lenq
    sub       id, 2
    jg        .loop16
    RET

align 16
.loop4:
    mov       in0q, inq
    mov       in1q, 38*64*4
    add       in1q, in0q
    mov       tmpd, lend

.inner_loop4:
    movss     m0, [in0q]
    movss     m1, [in1q]
    movss     m2, [in0q+lenq]
    movss     m3, [in1q+lenq]
    movlhps   m0, m1
    movlhps   m2, m3
    shufps    m0, m2, q2020
    movaps    [outq], m0
    lea       in0q, [in0q+lenq*2]
    lea       in1q, [in1q+lenq*2]
    add       outq, mmsize
    sub       tmpd, mmsize
    jg        .inner_loop4
    add       inq, 4
    sub       id, 1
    test      id, 2
    jne       .loop8
    cmp       id, 4
    jge       .loop16
    RET

;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
;                                    float (*in)[32][2],
;                                    int i, int len)
;***********************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
    %define MOVH movsd
%else
    %define MOVH movlps
%endif
    movsxdifnidn iq, id
    mov       lend, 32 << 3
    lea       outq, [outq+iq*4]
    mov       tmpd, id
    shl       tmpd, 8
    add       inq, tmpq
    mov       tmpd, 64
    sub       tmpd, id
    mov       id, tmpd

    test      id, 1
    jne       .loop4
    test      id, 2
    jne       .loop8

align 16
.loop16:
    mov       out0q, outq
    mov       out1q, 38*64*4
    add       out1q, out0q
    mov       tmpd, lend

.inner_loop16:
    movaps    m0, [inq]
    movaps    m1, [inq+lenq]
    movaps    m2, [inq+lenq*2]
    movaps    m3, [inq+3*32*2*4]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps    [out0q], m0
    movaps    [out1q], m1
    movaps    [out0q+lenq], m2
    movaps    [out1q+lenq], m3
    lea       out0q, [out0q+lenq*2]
    lea       out1q, [out1q+lenq*2]
    add       inq, mmsize
    sub       tmpd, mmsize
    jg        .inner_loop16
    add       outq, 16
    add       inq, 3*32*2*4
    sub       id, 4
    jg        .loop16
    RET

align 16
.loop8:
    mov       out0q, outq
    mov       out1q, 38*64*4
    add       out1q, out0q
    mov       tmpd, lend

.inner_loop8:
    movaps    m0, [inq]
    movaps    m1, [inq+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    MOVH      [out0q], m0
    MOVH      [out1q], m1
    movhps    [out0q+lenq], m0
    movhps    [out1q+lenq], m1
    lea       out0q, [out0q+lenq*2]
    lea       out1q, [out1q+lenq*2]
    add       inq, mmsize
    sub       tmpd, mmsize
    jg        .inner_loop8
    add       outq, 8
    add       inq, lenq
    sub       id, 2
    jg        .loop16
    RET

align 16
.loop4:
    mov       out0q, outq
    mov       out1q, 38*64*4
    add       out1q, out0q
    mov       tmpd, lend

.inner_loop4:
    movaps    m0, [inq]
    movss     [out0q], m0
%if cpuflag(sse4)
    extractps [out1q], m0, 1
    extractps [out0q+lenq], m0, 2
    extractps [out1q+lenq], m0, 3
%else
    movhlps   m1, m0
    movss     [out0q+lenq], m1
    shufps    m0, m0, 0xb1
    movss     [out1q], m0
    movhlps   m1, m0
    movss     [out1q+lenq], m1
%endif
    lea       out0q, [out0q+lenq*2]
    lea       out1q, [out1q+lenq*2]
    add       inq, mmsize
    sub       tmpd, mmsize
    jg        .inner_loop4
    add       outq, 4
    sub       id, 1
    test      id, 2
    jne       .loop8
    cmp       id, 4
    jge       .loop16
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT

;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
;                                 const float (*filter)[8][2],
;                                 ptrdiff_t stride, int n);
;*******************************************************************
%macro PS_HYBRID_ANALYSIS_LOOP 3
    movu      %1, [inq+mmsize*%3]
    movu      m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
    pshufd    %2, %1, q2301
    pshufd    m4, m1, q0123
    pshufd    m1, m1, q1032
    pshufd    m2, [filterq+nq+mmsize*%3], q2301
    addsubps  %2, m4
    addsubps  %1, m1
%else
    mova      m2, [filterq+nq+mmsize*%3]
    mova      %2, %1
    mova      m4, m1
    shufps    %2, %2, q2301
    shufps    m4, m4, q0123
    shufps    m1, m1, q1032
    shufps    m2, m2, q2301
    xorps     m4, m7
    xorps     m1, m7
    subps     %2, m4
    subps     %1, m1
%endif
    mulps     %2, m2
    mulps     %1, m2
%if %3
    addps     m3, %2
    addps     m0, %1
%endif
%endmacro

%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
    %define MOVH movsd
%else
    %define MOVH movlps
%endif
    shl       strideq, 3
    shl       nd, 6
    add       filterq, nq
    neg       nq
    mova      m7, [ps_p1m1p1m1]

align 16
.loop:
    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2

%if cpuflag(sse3)
    pshufd    m3, m3, q2301
    xorps     m0, m7
    hsubps    m3, m0
    pshufd    m1, m3, q0020
    pshufd    m3, m3, q0031
    addps     m1, m3
    movsd     m2, [inq+6*8]
%else
    mova      m1, m3
    mova      m2, m0
    shufps    m1, m1, q2301
    shufps    m2, m2, q2301
    subps     m1, m3
    addps     m2, m0
    unpcklps  m3, m1, m2
    unpckhps  m1, m2
    addps     m1, m3
    movu      m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
    movss     m3, [filterq+nq+8*6]
    SPLATD    m3
    mulps     m2, m3
    addps     m1, m2
    MOVH      [outq], m1
    add       outq, strideq
    add       nq, 64
    jl        .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS
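
Nearly every loop in this file uses the same addressing idiom: the base pointers are advanced past the end of the data, the byte count is negated, and the loop adds to the negative index until it reaches zero (the `add nq, ...` / `jl .loop` pairs). This saves a separate compare each iteration, because the add already sets the flags the branch tests. A scalar C rendering of ps_add_squares written in that style, as a sketch (the _c suffix is mine):

#include <stddef.h>

/* dst[i] += re^2 + im^2, using the end-pointer/negative-index idiom
 * from the asm: the counter climbs toward zero, so the increment
 * doubles as the termination test. */
static void ps_add_squares_c(float *dst, const float (*src)[2], int n)
{
    src += n;
    dst += n;
    for (ptrdiff_t i = -(ptrdiff_t)n; i < 0; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}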
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/aacpsdsp_init.c (vendored, normal file, 72 lines)
@@ -0,0 +1,72 @@
/*
 * SIMD optimized MPEG-4 Parametric Stereo decoding functions
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>

#include "config.h"

#include "libavutil/x86/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/aacpsdsp.h"

void ff_ps_add_squares_sse (float *dst, const float (*src)[2], int n);
void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
                                float *src1, int n);
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
                                const float (*filter)[8][2],
                                ptrdiff_t stride, int n);
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
                                const float (*filter)[8][2],
                                ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
                                   float h[2][4], float h_step[2][4],
                                   int len);
void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
                                          float h[2][4], float h_step[2][4],
                                          int len);
void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
                                      int i, int len);
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
                                       int i, int len);
void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
                                      int i, int len);

av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags)) {
        s->add_squares            = ff_ps_add_squares_sse;
        s->mul_pair_single        = ff_ps_mul_pair_single_sse;
        s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->add_squares           = ff_ps_add_squares_sse3;
        s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
        s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
        s->hybrid_analysis       = ff_ps_hybrid_analysis_sse3;
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
    }
}
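
Note the ordering in ff_psdsp_init_x86(): the SSE block runs first and the SSE3 block afterwards, so on an SSE3-capable CPU the later assignments overwrite the earlier ones and the most capable implementation wins. A minimal sketch of the pattern with hypothetical names (DSPCtx, op_c, and the flag constants are illustrative, not FFmpeg API):

typedef struct DSPCtx {
    void (*op)(float *dst, int n);
} DSPCtx;

static void op_c(float *dst, int n)    { (void)dst; (void)n; /* portable fallback */ }
static void op_sse(float *dst, int n)  { (void)dst; (void)n; /* SSE version */ }
static void op_sse3(float *dst, int n) { (void)dst; (void)n; /* SSE3 version */ }

enum { FLAG_SSE = 1, FLAG_SSE3 = 2 };  /* illustrative flag bits */

static void dsp_init(DSPCtx *ctx, int cpu_flags)
{
    ctx->op = op_c;            /* always start with the C version */
    if (cpu_flags & FLAG_SSE)
        ctx->op = op_sse;      /* a better version replaces it */
    if (cpu_flags & FLAG_SSE3)
        ctx->op = op_sse3;     /* the best available wins */
}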
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/ac3dsp_init.c (vendored, normal file, 164 lines)
@@ -0,0 +1,164 @@
/*
 * x86-optimized AC-3 DSP functions
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"

void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);

void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
        c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
        c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
    }
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        if (!bit_exact) {
            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
        }
    }
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
        if (bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_mmxext;
        } else {
            c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
        }
    }
    if (EXTERNAL_SSE(cpu_flags)) {
        c->float_to_fixed24 = ff_float_to_fixed24_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
        c->extract_exponents = ff_ac3_extract_exponents_sse2;
        if (bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_sse2;
        }
    }

    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
        c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
        c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
        if (!bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_round_sse2;
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
        if (cpu_flags & AV_CPU_FLAG_ATOM) {
            c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
        } else {
            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
            c->apply_window_int16 = ff_apply_window_int16_ssse3;
        }
    }
}

#define DOWNMIX_FUNC_OPT(ch, opt)                                     \
void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples,          \
                                            float **matrix, int len); \
void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples,          \
                                            float **matrix, int len);

#define DOWNMIX_FUNCS(opt)   \
    DOWNMIX_FUNC_OPT(3, opt) \
    DOWNMIX_FUNC_OPT(4, opt) \
    DOWNMIX_FUNC_OPT(5, opt) \
    DOWNMIX_FUNC_OPT(6, opt)

DOWNMIX_FUNCS(sse)
DOWNMIX_FUNCS(avx)
DOWNMIX_FUNCS(fma3)

void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

#define SET_DOWNMIX(ch, suf, SUF)                                        \
    if (ch == c->in_channels) {                                          \
        if (EXTERNAL_ ## SUF (cpu_flags)) {                              \
            if (c->out_channels == 1)                                    \
                c->downmix = ff_ac3_downmix_ ## ch ## _to_1_ ## suf;     \
            else                                                         \
                c->downmix = ff_ac3_downmix_ ## ch ## _to_2_ ## suf;     \
        }                                                                \
    }

#define SET_DOWNMIX_ALL(suf, SUF) \
    SET_DOWNMIX(3, suf, SUF)      \
    SET_DOWNMIX(4, suf, SUF)      \
    SET_DOWNMIX(5, suf, SUF)      \
    SET_DOWNMIX(6, suf, SUF)

    SET_DOWNMIX_ALL(sse, SSE)
    if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
        SET_DOWNMIX_ALL(avx, AVX)
        SET_DOWNMIX_ALL(fma3, FMA3)
    }
}
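
The DOWNMIX_FUNC_OPT/DOWNMIX_FUNCS macros above use ## token pasting to stamp out one declaration per channel count and per instruction set instead of writing 24 prototypes by hand. For reference, this is the expansion one step produces (the declarations below are exactly what the preprocessor emits for that invocation):

/* DOWNMIX_FUNC_OPT(3, sse) pastes "3" and "sse" into the names: */
void ff_ac3_downmix_3_to_1_sse(float **samples, float **matrix, int len);
void ff_ac3_downmix_3_to_2_sse(float **samples, float **matrix, int len);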
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/alacdsp_init.c (vendored, normal file, 44 lines)
@@ -0,0 +1,44 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/alacdsp.h"
#include "config.h"

void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
                                     int decorr_shift, int decorr_left_weight);
void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
                                           int extra_bits, int channels, int nb_samples);
void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
                                         int extra_bits, int channels, int nb_samples);

av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
        c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        c->decorrelate_stereo = ff_alac_decorrelate_stereo_sse4;
    }
#endif /* HAVE_X86ASM */
}
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/audiodsp_init.c (vendored, normal file, 66 lines)
@@ -0,0 +1,66 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/audiodsp.h"

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);

void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
                              int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
void ff_vector_clipf_sse(float *dst, const float *src,
                         int len, float min, float max);

av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags))
        c->vector_clip_int32 = ff_vector_clip_int32_mmx;

    if (EXTERNAL_MMXEXT(cpu_flags))
        c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;

    if (EXTERNAL_SSE(cpu_flags))
        c->vector_clipf = ff_vector_clipf_sse;

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
        if (cpu_flags & AV_CPU_FLAG_ATOM)
            c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
        else
            c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }

    if (EXTERNAL_SSE4(cpu_flags))
        c->vector_clip_int32 = ff_vector_clip_int32_sse4;
}
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/blockdsp_init.c (vendored, normal file, 60 lines)
@@ -0,0 +1,60 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/blockdsp.h"
#include "libavcodec/version.h"

void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);
void ff_clear_block_avx(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
void ff_clear_blocks_avx(int16_t *blocks);

av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
                                  AVCodecContext *avctx)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->clear_block = ff_clear_block_mmx;
        c->clear_blocks = ff_clear_blocks_mmx;
    }

    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
        return;

    if (EXTERNAL_SSE(cpu_flags)) {
        c->clear_block = ff_clear_block_sse;
        c->clear_blocks = ff_clear_blocks_sse;
    }
    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        c->clear_block = ff_clear_block_avx;
        c->clear_blocks = ff_clear_blocks_avx;
    }
#endif /* HAVE_X86ASM */
}
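
The early return above exists because the SSE and AVX clear_block versions use aligned stores, which fault if the coefficient block does not sit on a 16- or 32-byte boundary, and XvMC-provided blocks cannot be trusted to be aligned. A sketch of the caller-side rule (av_malloc() is the real libavutil allocator and returns SIMD-aligned memory; the wrapper function is illustrative):

#include <stdint.h>
#include "libavutil/mem.h"
#include "libavcodec/blockdsp.h"

/* Illustrative: blocks handed to clear_block()/clear_blocks() must be
 * SIMD-aligned. av_malloc() guarantees that; a raw offset into an
 * unaligned buffer does not, and the aligned stores would fault. */
static int clear_one_block(BlockDSPContext *c)
{
    int16_t *block = av_malloc(64 * sizeof(*block));  /* one 8x8 block */
    if (!block)
        return -1;
    c->clear_block(block);  /* safe: aligned allocation */
    av_freep(&block);
    return 0;
}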
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/bswapdsp_init.c (vendored, normal file, 40 lines)
@@ -0,0 +1,40 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/bswapdsp.h"

void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);

av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags))
        c->bswap_buf = ff_bswap32_buf_sse2;
    if (EXTERNAL_SSSE3(cpu_flags))
        c->bswap_buf = ff_bswap32_buf_ssse3;
    if (EXTERNAL_AVX2_FAST(cpu_flags))
        c->bswap_buf = ff_bswap32_buf_avx2;
}
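
For reference, the job these routines accelerate is plain 32-bit byte swapping over a buffer. A scalar sketch using libavutil's av_bswap32() (the _ref name is mine):

#include <stdint.h>
#include "libavutil/bswap.h"

/* Scalar reference for ff_bswap32_buf_*: endianness-swap w words. */
static void bswap32_buf_ref(uint32_t *dst, const uint32_t *src, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}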
301
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/cabac.h
vendored
Normal file
301
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/cabac.h
vendored
Normal file
|
@ -0,0 +1,301 @@
|
|||
/*
|
||||
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_X86_CABAC_H
|
||||
#define AVCODEC_X86_CABAC_H
|
||||
|
||||
#include "libavcodec/cabac.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/macros.h"
|
||||
#include "libavutil/x86/asm.h"
|
||||
#include "config.h"
|
||||
|
||||
#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
|
||||
|| ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
|
||||
|| (defined(__INTEL_COMPILER) && defined(_MSC_VER))
|
||||
# define BROKEN_COMPILER 1
|
||||
#else
|
||||
# define BROKEN_COMPILER 0
|
||||
#endif
|
||||
|
||||
#if HAVE_INLINE_ASM
|
||||
|
||||
#ifndef UNCHECKED_BITSTREAM_READER
|
||||
#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
|
||||
#endif
|
||||
|
||||
#if UNCHECKED_BITSTREAM_READER
|
||||
#define END_CHECK(end) ""
|
||||
#else
|
||||
#define END_CHECK(end) \
|
||||
"cmp "end" , %%"FF_REG_c" \n\t"\
|
||||
"jge 1f \n\t"
|
||||
#endif
|
||||
|
||||
#ifdef BROKEN_RELOCATIONS
|
||||
#define TABLES_ARG , "r"(tables)
|
||||
|
||||
#if HAVE_FAST_CMOV
|
||||
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
|
||||
"cmp "low" , "tmp" \n\t"\
|
||||
"cmova %%ecx , "range" \n\t"\
|
||||
"sbb %%rcx , %%rcx \n\t"\
|
||||
"and %%ecx , "tmp" \n\t"\
|
||||
"xor %%rcx , "retq" \n\t"\
|
||||
"sub "tmp" , "low" \n\t"
|
||||
#else /* HAVE_FAST_CMOV */
|
||||
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
|
||||
/* P4 Prescott has crappy cmov,sbb,64-bit shift so avoid them */ \
|
||||
"sub "low" , "tmp" \n\t"\
|
||||
"sar $31 , "tmp" \n\t"\
|
||||
"sub %%ecx , "range" \n\t"\
|
||||
"and "tmp" , "range" \n\t"\
|
||||
"add %%ecx , "range" \n\t"\
|
||||
"shl $17 , %%ecx \n\t"\
|
||||
"and "tmp" , %%ecx \n\t"\
|
||||
"sub %%ecx , "low" \n\t"\
|
||||
"xor "tmp" , "ret" \n\t"\
|
||||
"movslq "ret" , "retq" \n\t"
|
||||
#endif /* HAVE_FAST_CMOV */
|
||||
|
||||
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
|
||||
"movzbl "statep" , "ret" \n\t"\
|
||||
"mov "range" , "tmp" \n\t"\
|
||||
"and $0xC0 , "range" \n\t"\
|
||||
"lea ("ret", "range", 2), %%ecx \n\t"\
|
||||
"movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
|
||||
"sub "range" , "tmp" \n\t"\
|
||||
"mov "tmp" , %%ecx \n\t"\
|
||||
"shl $17 , "tmp" \n\t"\
|
||||
BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
|
||||
"movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
|
||||
"shl %%cl , "range" \n\t"\
|
||||
"movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
|
||||
"shl %%cl , "low" \n\t"\
|
||||
"mov "tmpbyte" , "statep" \n\t"\
|
||||
"test "lowword" , "lowword" \n\t"\
|
||||
"jnz 2f \n\t"\
|
||||
"mov "byte" , %%"FF_REG_c" \n\t"\
|
||||
END_CHECK(end)\
|
||||
"add"FF_OPSIZE" $2 , "byte" \n\t"\
|
||||
"1: \n\t"\
|
||||
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
|
||||
"lea -1("low") , %%ecx \n\t"\
|
||||
"xor "low" , %%ecx \n\t"\
|
||||
"shr $15 , %%ecx \n\t"\
|
||||
"bswap "tmp" \n\t"\
|
||||
"shr $15 , "tmp" \n\t"\
|
||||
"movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
|
||||
"sub $0xFFFF , "tmp" \n\t"\
|
||||
"neg %%ecx \n\t"\
|
||||
"add $7 , %%ecx \n\t"\
|
||||
"shl %%cl , "tmp" \n\t"\
|
||||
"add "tmp" , "low" \n\t"\
|
||||
"2: \n\t"
|
||||
|
||||
#else /* BROKEN_RELOCATIONS */
|
||||
#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
|
||||
#define RIP_ARG
|
||||
|
||||
#if HAVE_FAST_CMOV
|
||||
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
|
||||
"mov "tmp" , %%ecx \n\t"\
|
||||
"shl $17 , "tmp" \n\t"\
|
||||
"cmp "low" , "tmp" \n\t"\
|
||||
"cmova %%ecx , "range" \n\t"\
|
||||
"sbb %%ecx , %%ecx \n\t"\
|
||||
"and %%ecx , "tmp" \n\t"\
|
||||
"xor %%ecx , "ret" \n\t"\
|
||||
"sub "tmp" , "low" \n\t"
|
||||
#else /* HAVE_FAST_CMOV */
|
||||
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
|
||||
"mov "tmp" , %%ecx \n\t"\
|
||||
"shl $17 , "tmp" \n\t"\
|
||||
"sub "low" , "tmp" \n\t"\
|
||||
"sar $31 , "tmp" \n\t" /*lps_mask*/\
|
||||
"sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
|
||||
"and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
|
||||
"add %%ecx , "range" \n\t" /*new range*/\
|
||||
"shl $17 , %%ecx \n\t"\
|
||||
"and "tmp" , %%ecx \n\t"\
|
||||
"sub %%ecx , "low" \n\t"\
|
||||
"xor "tmp" , "ret" \n\t"
|
||||
#endif /* HAVE_FAST_CMOV */
|
||||
|
||||
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
|
||||
"movzbl "statep" , "ret" \n\t"\
|
||||
"mov "range" , "tmp" \n\t"\
|
||||
"and $0xC0 , "range" \n\t"\
|
||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
|
||||
"sub "range" , "tmp" \n\t"\
|
||||
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
|
||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
|
||||
"shl %%cl , "range" \n\t"\
|
||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
|
||||
"shl %%cl , "low" \n\t"\
|
||||
"mov "tmpbyte" , "statep" \n\t"\
|
||||
"test "lowword" , "lowword" \n\t"\
|
||||
" jnz 2f \n\t"\
|
||||
"mov "byte" , %%"FF_REG_c" \n\t"\
|
||||
END_CHECK(end)\
|
||||
"add"FF_OPSIZE" $2 , "byte" \n\t"\
|
||||
"1: \n\t"\
|
||||
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
|
||||
"lea -1("low") , %%ecx \n\t"\
|
||||
"xor "low" , %%ecx \n\t"\
|
||||
"shr $15 , %%ecx \n\t"\
|
||||
"bswap "tmp" \n\t"\
|
||||
"shr $15 , "tmp" \n\t"\
|
||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
|
||||
"sub $0xFFFF , "tmp" \n\t"\
|
||||
"neg %%ecx \n\t"\
|
||||
"add $7 , %%ecx \n\t"\
|
||||
"shl %%cl , "tmp" \n\t"\
|
||||
"add "tmp" , "low" \n\t"\
|
||||
"2: \n\t"
|
||||
|
||||
#endif /* BROKEN_RELOCATIONS */
|
||||
|
||||
#if HAVE_7REGS && !BROKEN_COMPILER
|
||||
#define get_cabac_inline get_cabac_inline_x86
|
||||
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
|
||||
uint8_t *const state)
|
||||
{
|
||||
int bit, tmp;
|
||||
#ifdef BROKEN_RELOCATIONS
|
||||
void *tables;
|
||||
|
||||
__asm__ volatile(
|
||||
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
|
||||
: "=&r"(tables)
|
||||
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
|
||||
);
|
||||
#endif
|
||||
|
||||
__asm__ volatile(
|
||||
BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
|
||||
"%2", "%q2", "%3", "%b3",
|
||||
"%c6(%5)", "%c7(%5)",
|
||||
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
|
||||
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
|
||||
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
|
||||
"%8")
|
||||
: "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
|
||||
: "r"(state), "r"(c),
|
||||
"i"(offsetof(CABACContext, bytestream)),
|
||||
"i"(offsetof(CABACContext, bytestream_end))
|
||||
TABLES_ARG
|
||||
,"1"(c->low), "2"(c->range)
|
||||
: "%"FF_REG_c, "memory"
|
||||
);
|
||||
return bit & 1;
|
||||
}
|
||||
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
|
||||
|
||||
#if !BROKEN_COMPILER
|
||||
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
|
||||
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
|
||||
{
|
||||
x86_reg tmp;
|
||||
__asm__ volatile(
|
||||
"movl %c6(%2), %k1 \n\t"
|
||||
"movl %c3(%2), %%eax \n\t"
|
||||
"shl $17, %k1 \n\t"
|
||||
"add %%eax, %%eax \n\t"
|
||||
"sub %k1, %%eax \n\t"
|
||||
"cdq \n\t"
|
||||
"and %%edx, %k1 \n\t"
|
||||
"add %k1, %%eax \n\t"
|
||||
"xor %%edx, %%ecx \n\t"
|
||||
"sub %%edx, %%ecx \n\t"
|
||||
"test %%ax, %%ax \n\t"
|
||||
"jnz 1f \n\t"
|
||||
"mov %c4(%2), %1 \n\t"
|
||||
"subl $0xFFFF, %%eax \n\t"
|
||||
"movzwl (%1), %%edx \n\t"
|
||||
"bswap %%edx \n\t"
|
||||
"shrl $15, %%edx \n\t"
|
||||
#if UNCHECKED_BITSTREAM_READER
|
||||
"add $2, %1 \n\t"
|
||||
"addl %%edx, %%eax \n\t"
|
||||
"mov %1, %c4(%2) \n\t"
|
||||
#else
|
||||
"addl %%edx, %%eax \n\t"
|
||||
"cmp %c5(%2), %1 \n\t"
|
||||
"jge 1f \n\t"
|
||||
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
|
||||
#endif
|
||||
"1: \n\t"
|
||||
"movl %%eax, %c3(%2) \n\t"
|
||||
|
||||
: "+c"(val), "=&r"(tmp)
|
||||
: "r"(c),
|
||||
"i"(offsetof(CABACContext, low)),
|
||||
"i"(offsetof(CABACContext, bytestream)),
|
||||
"i"(offsetof(CABACContext, bytestream_end)),
|
||||
"i"(offsetof(CABACContext, range))
|
||||
: "%eax", "%edx", "memory"
|
||||
);
|
||||
return val;
|
||||
}
|
||||
|
||||
#define get_cabac_bypass get_cabac_bypass_x86
|
||||
static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
|
||||
{
|
||||
x86_reg tmp;
|
||||
int res;
|
||||
__asm__ volatile(
|
||||
"movl %c6(%2), %k1 \n\t"
|
||||
"movl %c3(%2), %%eax \n\t"
|
||||
"shl $17, %k1 \n\t"
|
||||
"add %%eax, %%eax \n\t"
|
||||
"sub %k1, %%eax \n\t"
|
||||
"cdq \n\t"
|
||||
"and %%edx, %k1 \n\t"
|
||||
"add %k1, %%eax \n\t"
|
||||
"inc %%edx \n\t"
|
||||
"test %%ax, %%ax \n\t"
|
||||
"jnz 1f \n\t"
|
||||
"mov %c4(%2), %1 \n\t"
|
||||
"subl $0xFFFF, %%eax \n\t"
|
||||
"movzwl (%1), %%ecx \n\t"
|
||||
"bswap %%ecx \n\t"
|
||||
"shrl $15, %%ecx \n\t"
|
||||
"addl %%ecx, %%eax \n\t"
|
||||
"cmp %c5(%2), %1 \n\t"
|
||||
"jge 1f \n\t"
|
||||
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
|
||||
"1: \n\t"
|
||||
"movl %%eax, %c3(%2) \n\t"
|
||||
|
||||
: "=&d"(res), "=&r"(tmp)
|
||||
: "r"(c),
|
||||
"i"(offsetof(CABACContext, low)),
|
||||
"i"(offsetof(CABACContext, bytestream)),
|
||||
"i"(offsetof(CABACContext, bytestream_end)),
|
||||
"i"(offsetof(CABACContext, range))
|
||||
: "%eax", "%ecx", "memory"
|
||||
);
|
||||
return res;
|
||||
}
|
||||
#endif /* !BROKEN_COMPILER */
|
||||
|
||||
#endif /* HAVE_INLINE_ASM */
|
||||
#endif /* AVCODEC_X86_CABAC_H */
463
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/cavsdsp.c
vendored
Normal file
@@ -0,0 +1,463 @@
/*
 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
 * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
 *
 * MMX-optimized DSP functions, based on H.264 optimizations by
 * Michael Niedermayer and Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/cavsdsp.h"
#include "libavcodec/idctdsp.h"
#include "constants.h"
#include "fpel.h"
#include "idctdsp.h"
#include "config.h"


#if HAVE_MMX_EXTERNAL

void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);

static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
    LOCAL_ALIGNED(16, int16_t, b2, [64]);
    ff_cavs_idct8_mmx(b2, block);
    ff_add_pixels_clamped_mmx(b2, dst, stride);
}

void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);

static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
    LOCAL_ALIGNED(16, int16_t, b2, [64]);
    ff_cavs_idct8_sse2(b2, block);
    ff_add_pixels_clamped_sse2(b2, dst, stride);
}

#endif /* HAVE_MMX_EXTERNAL */

#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)

/*****************************************************************************
 *
 * motion compensation
 *
 ****************************************************************************/

/* vertical filter [-1 -2 96 42 -7 0] */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
    "movd (%0), "#F" \n\t"\
    "movq "#C", %%mm6 \n\t"\
    "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
    "movq "#D", %%mm7 \n\t"\
    "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
    "psllw $3, "#E" \n\t"\
    "psubw "#E", %%mm6 \n\t"\
    "psraw $3, "#E" \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw "#E", %%mm6 \n\t"\
    "paddw "#B", "#B" \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "add %2, %0 \n\t"\
    "punpcklbw %%mm7, "#F" \n\t"\
    "psubw "#B", %%mm6 \n\t"\
    "psraw $1, "#B" \n\t"\
    "psubw "#A", %%mm6 \n\t"\
    "paddw "MANGLE(ADD)", %%mm6 \n\t"\
    "psraw $7, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm6 \n\t"\
    OP(%%mm6, (%1), A, d) \
    "add %3, %1 \n\t"

/* vertical filter [ 0 -1 5 5 -1 0] */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
    "movd (%0), "#F" \n\t"\
    "movq "#C", %%mm6 \n\t"\
    "paddw "#D", %%mm6 \n\t"\
    "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
    "add %2, %0 \n\t"\
    "punpcklbw %%mm7, "#F" \n\t"\
    "psubw "#B", %%mm6 \n\t"\
    "psubw "#E", %%mm6 \n\t"\
    "paddw "MANGLE(ADD)", %%mm6 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm6 \n\t"\
    OP(%%mm6, (%1), A, d) \
    "add %3, %1 \n\t"

/* vertical filter [ 0 -7 42 96 -2 -1] */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
    "movd (%0), "#F" \n\t"\
    "movq "#C", %%mm6 \n\t"\
    "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
    "movq "#D", %%mm7 \n\t"\
    "pmullw "MANGLE(MUL1)", %%mm7\n\t"\
    "psllw $3, "#B" \n\t"\
    "psubw "#B", %%mm6 \n\t"\
    "psraw $3, "#B" \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw "#B", %%mm6 \n\t"\
    "paddw "#E", "#E" \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "add %2, %0 \n\t"\
    "punpcklbw %%mm7, "#F" \n\t"\
    "psubw "#E", %%mm6 \n\t"\
    "psraw $1, "#E" \n\t"\
    "psubw "#F", %%mm6 \n\t"\
    "paddw "MANGLE(ADD)", %%mm6 \n\t"\
    "psraw $7, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm6 \n\t"\
    OP(%%mm6, (%1), A, d) \
    "add %3, %1 \n\t"


#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7 \n\t"\
            "movd (%0), %%mm0 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm1 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm3 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
              NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
            : "memory"\
        );\
        if(h==16){\
            __asm__ volatile(\
                VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
                VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
                VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
                VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
                VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
                VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
                VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
                VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
                \
                : "+a"(src), "+c"(dst)\
                : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
                  NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
                : "memory"\
            );\
        }\
        src += 4-(h+5)*srcStride;\
        dst += 4-h*dstStride;\
    }

#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 1(%0), %%mm2 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm3, %%mm1 \n\t"\
        "pmullw %%mm6, %%mm0 \n\t"\
        "pmullw %%mm6, %%mm1 \n\t"\
        "movq -1(%0), %%mm2 \n\t"\
        "movq 2(%0), %%mm4 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "movq %%mm4, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm3, %%mm5 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm5, %%mm1 \n\t"\
        "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
        "paddw %%mm5, %%mm0 \n\t"\
        "paddw %%mm5, %%mm1 \n\t"\
        "psraw $3, %%mm0 \n\t"\
        "psraw $3, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm5, q) \
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
          NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
        : "memory"\
    );\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
    QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
    QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
    QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define CAVS_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgusb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"
#define AVG_MMXEXT_OP(a, b, temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"

#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */

#if HAVE_MMX_EXTERNAL
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}

static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride)
{
    ff_avg_pixels8_mmx(dst, src, stride, 8);
}

static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    ff_avg_pixels8_mmxext(dst, src, stride, 8);
}

static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
{
    ff_put_pixels16_mmx(dst, src, stride, 16);
}

static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
{
    ff_avg_pixels16_mmx(dst, src, stride, 16);
}

static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride)
{
    ff_avg_pixels16_mmxext(dst, src, stride, 16);
}

static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
{
    ff_put_pixels16_sse2(dst, src, stride, 16);
}

static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
{
    ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#endif

static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
                                     AVCodecContext *avctx)
{
#if HAVE_MMX_EXTERNAL
    c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
    c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;

    c->cavs_idct8_add = cavs_idct8_add_mmx;
    c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
#endif /* HAVE_MMX_EXTERNAL */
}

#define DSPFUNC(PFX, IDX, NUM, EXT) \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \

#if HAVE_MMXEXT_INLINE
QPEL_CAVS(put_, PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)

CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
#endif /* HAVE_MMXEXT_INLINE */

#if HAVE_AMD3DNOW_INLINE
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)

CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)

static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
                                       AVCodecContext *avctx)
{
    DSPFUNC(put, 0, 16, 3dnow);
    DSPFUNC(put, 1, 8, 3dnow);
    DSPFUNC(avg, 0, 16, 3dnow);
    DSPFUNC(avg, 1, 8, 3dnow);
}
#endif /* HAVE_AMD3DNOW_INLINE */

av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
    av_unused int cpu_flags = av_get_cpu_flags();

    if (X86_MMX(cpu_flags))
        cavsdsp_init_mmx(c, avctx);

#if HAVE_AMD3DNOW_INLINE
    if (INLINE_AMD3DNOW(cpu_flags))
        cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
#if HAVE_MMXEXT_INLINE
    if (INLINE_MMXEXT(cpu_flags)) {
        DSPFUNC(put, 0, 16, mmxext);
        DSPFUNC(put, 1, 8, mmxext);
        DSPFUNC(avg, 0, 16, mmxext);
        DSPFUNC(avg, 1, 8, mmxext);
    }
#endif
#if HAVE_MMX_EXTERNAL
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
        c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
    }
#endif
#if HAVE_SSE2_EXTERNAL
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;

        c->cavs_idct8_add = cavs_idct8_add_sse2;
        c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
    }
#endif
}
43
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/celt_pvq_init.c
vendored
Normal file
@@ -0,0 +1,43 @@
/*
 * Opus encoder assembly optimizations
 * Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/x86/cpu.h"
#include "libavcodec/opus_pvq.h"

extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
extern float ff_pvq_search_exact_avx  (float *X, int *y, int K, int N);

av_cold void ff_celt_pvq_init_x86(CeltPVQ *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags))
        s->pvq_search = ff_pvq_search_approx_sse2;

    if (EXTERNAL_SSE4(cpu_flags))
        s->pvq_search = ff_pvq_search_approx_sse4;

    if (EXTERNAL_AVX_FAST(cpu_flags))
        s->pvq_search = ff_pvq_search_exact_avx;
}
385
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/celt_pvq_search.asm
vendored
Normal file
@@ -0,0 +1,385 @@
;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "config.asm"
%include "libavutil/x86/x86util.asm"

%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif

SECTION_RODATA 64

const_float_abs_mask:  times 8 dd 0x7fffffff
const_align_abs_edge:  times 8 dd 0

const_float_0_5:       times 8 dd 0.5
const_float_1:         times 8 dd 1.0
const_float_sign_mask: times 8 dd 0x80000000

const_int32_offsets:
%rep 8
        dd $-const_int32_offsets
%endrep
SECTION .text

;
; Set up a high register to be used
; for holding memory constants
;
; %1 - the register to be used, assumes it is >= mm8
; %2 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32 bit arch or
; "addps m0, m8" on 64 bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
    %define mm_%3 %2
    %{1}    %2, [%3]   ; movaps m8, [const_name]
%else
    %define mm_%3 [%3]
%endif
%endmacro

;
; Set Position Independent Code
;     Base address of a constant
; %1 - the register to be used, if PIC is set
; %2 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if textrels are used
%macro SET_PIC_BASE 3; reg, const_label
%ifdef PIC
    %{1}    %2, [%3]   ; lea r5, [rip+const]
    %define pic_base_%3 %2
%else
    %define pic_base_%3 %3
%endif
%endmacro

%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
    addps    m6, mm_const_float_0_5  ; Syy_norm += 1.0/2
    pxor     m1, m1                  ; max_idx
    xorps    m3, m3                  ; p_max
    xor      r4d, r4d
align 16
%%distortion_search:
    movd     xm2, dword r4d          ; movd zero extends
%ifidn %1,add
    movaps   m4, [tmpY + r4]         ; y[i]
    movaps   m5, [tmpX + r4]         ; X[i]

  %if USE_APPROXIMATION == 1
    xorps    m0, m0
    cmpps    m0, m0, m5, 4           ; m0 = (X[i] != 0.0)
  %endif

    addps    m4, m6                  ; m4 = Syy_new = y[i] + Syy_norm
    addps    m5, m7                  ; m5 = Sxy_new = X[i] + Sxy_norm

  %if USE_APPROXIMATION == 1
    andps    m5, m0                  ; if (X[i] == 0) Sxy_new = 0; prevent approximation error from setting pulses in array padding.
  %endif

%else
    movaps   m5, [tmpY + r4]         ; m5 = y[i]

    xorps    m0, m0                  ; m0 = 0;
    cmpps    m0, m0, m5, 1           ; m0 = (0<y)

    subps    m4, m6, m5              ; m4 = Syy_new = Syy_norm - y[i]
    subps    m5, m7, [tmpX + r4]     ; m5 = Sxy_new = Sxy_norm - X[i]
    andps    m5, m0                  ; (0<y)?m5:0
%endif

%if USE_APPROXIMATION == 1
    rsqrtps  m4, m4
    mulps    m5, m4                  ; m5 = p = Sxy_new*approx(1/sqrt(Syy))
%else
    mulps    m5, m5
    divps    m5, m4                  ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
    VPBROADCASTD m2, xm2             ; m2=i (all lanes get same values, we add the offset-per-lane, later)

    cmpps    m0, m3, m5, 1           ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
    maxps    m3, m5                  ; m3 = max(p_max, p)
                                     ; maxps here is faster than blendvps, despite blend having lower latency.

    pand     m2, m0                  ; This version seems faster than sse41 pblendvb
    pmaxsw   m1, m2                  ; SSE2 signed word, so it would work for N < 32768/4

    add      r4d, mmsize
    cmp      r4d, Nd
    jb       %%distortion_search

    por      m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
    movdqa   m4, m1                  ; needed for the aligned y[max_idx]+=1; processing

%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)

    vextractf128 xm5, ym3, 1         ; xmm5 = ymm3[1x128] = ymm3[255..128b]
    cmpps    xm0, xm3, xm5, 1        ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

    vextracti128 xm2, ym1, 1         ; xmm2 = ymm1[1x128] = ymm1[255..128b]
    BLENDVPS xm3, xm5, xm0           ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
    PBLENDVB xm1, xm2, xm0           ; p       = m0 ? p[1x128]       : p[0x128]
%endif

; Merge parallel maximums round 4 (2 vs 2)
                                     ; m3=p[3210]
    movhlps  xm5, xm3                ; m5=p[xx32]
    cmpps    xm0, xm3, xm5, 1        ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

    pshufd   xm2, xm1, q3232
    BLENDVPS xm3, xm5, xm0           ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
    PBLENDVB xm1, xm2, xm0           ; p       = m0 ? p[3,2]       : p[1,0]

; Merge parallel maximums final round (1 vs 1)
    shufps   xm0, xm3, xm3, q1111    ; m0 = m3[1] = p[1]
    cmpss    xm0, xm3, 5             ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )

    pshufd   xm2, xm1, q1111
    PBLENDVB xm1, xm2, xm0

    movd     dword r4d, xm1          ; zero extends to the rest of r4q

    VBROADCASTSS m3, [tmpX + r4]
    %{1}ps   m7, m3                  ; Sxy += X[max_idx]

    VBROADCASTSS m5, [tmpY + r4]
    %{1}ps   m6, m5                  ; Syy += Y[max_idx]

; We have to update a single element in Y[i].
; However writing 4 bytes and then doing 16 byte load in the inner loop
; could cause a stall due to breaking write forwarding.
    VPBROADCASTD m1, xm1
    pcmpeqd  m1, m1, m4              ; exactly 1 element matches max_idx and this finds it

    and      r4d, ~(mmsize-1)        ; align address down, so the value pointed by max_idx is inside a mmsize load
    movaps   m5, [tmpY + r4]         ; m5 = Y[y3...ym...y0]
    andps    m1, mm_const_float_1    ; m1 =  [ 0...1.0...0]
    %{1}ps   m5, m1                  ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
    movaps   [tmpY + r4], m5         ; Y[max_idx] +-= 1.0;
%endmacro

;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal
;
%ifdef PIC
  %define num_pic_regs 1
%else
  %define num_pic_regs 0
%endif

;
; Pyramid Vector Quantization Search implementation
;
; float * inX   - Unaligned (SIMD) access, it will be overread,
;                 but extra data is masked away.
; int32 * outY  - Should be aligned and padded buffer.
;                 It is used as temp buffer.
; uint32 K      - Number of pulses to have after quantizations.
; uint32 N      - Number of vector elements. Must be 0 < N < 256
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq

    movaps   m0, [const_float_abs_mask]
    shl      Nd, 2                   ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
    mov      r4d, Nd

    neg      r4d
    and      r4d, mmsize-1

    SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
    movups   m2, [pic_base_const_align_abs_edge + r4 - mmsize]

    add      Nd, r4d                 ; N = align(N, mmsize)

    lea      r4d, [Nd - mmsize]      ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
    movups   m1, [inXq + r4]
    andps    m1, m2
    movaps   [tmpX + r4], m1         ; Sx = abs( X[N-1] )

align 16
%%loop_abs_sum:
    sub      r4d, mmsize
    jc       %%end_loop_abs_sum

    movups   m2, [inXq + r4]
    andps    m2, m0

    movaps   [tmpX + r4], m2         ; tmpX[i]=abs(X[i])
    addps    m1, m2                  ; Sx += abs(X[i])
    jmp      %%loop_abs_sum

align 16
%%end_loop_abs_sum:

    HSUMPS   m1, m2                  ; m1 = Sx

    xorps    m0, m0
    comiss   xm0, xm1                ;
    jz       %%zero_input            ; if (Sx==0) goto zero_input

    cvtsi2ss xm0, dword Kd           ; m0 = K
%if USE_APPROXIMATION == 1
    rcpss    xm1, xm1                ; m1 = approx(1/Sx)
    mulss    xm0, xm1                ; m0 = K*(1/Sx)
%else
    divss    xm0, xm1                ; b = K/Sx
                                     ; b = K/max_x
%endif

    VBROADCASTSS m0, xm0

    lea      r4d, [Nd - mmsize]
    pxor     m5, m5                  ; Sy  ( Sum of abs( y[i]) )
    xorps    m6, m6                  ; Syy ( Sum of y[i]*y[i]  )
    xorps    m7, m7                  ; Sxy ( Sum of X[i]*y[i]  )
align 16
%%loop_guess:
    movaps   m1, [tmpX + r4]         ; m1   = X[i]
    mulps    m2, m0, m1              ; m2   = res*X[i]
    cvtps2dq m2, m2                  ; yt   = (int)lrintf( res*X[i] )
    paddd    m5, m2                  ; Sy  += yt
    cvtdq2ps m2, m2                  ; yt   = (float)yt
    mulps    m1, m2                  ; m1   = X[i]*yt
    movaps   [tmpY + r4], m2         ; y[i] = m2
    addps    m7, m1                  ; Sxy += m1;
    mulps    m2, m2                  ; m2   = yt*yt
    addps    m6, m2                  ; Syy += m2

    sub      r4d, mmsize
    jnc      %%loop_guess

    HSUMPS   m6, m1                  ; Syy_norm
    HADDD    m5, m4                  ; pulses

    movd     dword r4d, xm5          ; zero extends to the rest of r4q

    sub      Kd, r4d                 ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
    jz       %%finish                ; K - pulses == 0

    SET_HI_REG_MM_CONSTANT movaps, m8,  const_float_0_5
    SET_HI_REG_MM_CONSTANT movaps, m9,  const_float_1
    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in distortion parameter calculations.
; Saves pre- and post-calculation to correct Y[] values.
; Same precision, since float mantissa is normalized.
; The SQRT approximation does differ.
    HSUMPS   m7, m0                  ; Sxy_norm
    mulps    m6, mm_const_float_0_5

    jc       %%remove_pulses_loop    ; K - pulses < 0

align 16                             ; K - pulses > 0
%%add_pulses_loop:

    PULSES_SEARCH add                ; m6 Syy_norm ; m7 Sxy_norm

    sub      Kd, 1
    jnz      %%add_pulses_loop

    addps    m6, m6                  ; Syy*=2

    jmp      %%finish

align 16
%%remove_pulses_loop:

    PULSES_SEARCH sub                ; m6 Syy_norm ; m7 Sxy_norm

    add      Kd, 1
    jnz      %%remove_pulses_loop

    addps    m6, m6                  ; Syy*=2

align 16
%%finish:
    lea      r4d, [Nd - mmsize]
    movaps   m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
    movaps   m0, [tmpY + r4]         ; m0 = Y[i]
    movups   m1, [inXq + r4]         ; m1 = X[i]
    andps    m1, m2                  ; m1 = sign(X[i])
    orps     m0, m1                  ; m0 = Y[i]*sign
    cvtps2dq m3, m0                  ; m3 = (int)m0
    movaps   [outYq + r4], m3

    sub      r4d, mmsize
    jnc      %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0    ; sbrdsp
    movss    r0m, xm6                ; return (float)Syy_norm
    fld      dword r0m
%else
    movaps   m0, m6                  ; return (float)Syy_norm
%endif

    RET

align 16
%%zero_input:
    lea      r4d, [Nd - mmsize]
    xorps    m0, m0
%%zero_loop:
    movaps   [outYq + r4], m0
    sub      r4d, mmsize
    jnc      %%zero_loop

    movaps   m6, [const_float_1]
    jmp      %%return
%endmacro

; if 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full precision code about 2% slower.
; Opus also uses rsqrt approximation in their intrinsics code.
%define USE_APPROXIMATION 1

INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION 0

INIT_XMM avx
PVQ_FAST_SEARCH _exact
94
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/constants.c
vendored
Normal file
@@ -0,0 +1,94 @@
/*
 * MMX/SSE/AVX constants used across x86 dsp optimizations.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"

DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL,
                                                        0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL,
                                                        0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ASM_ALIGNED(32, const ymm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL,
                                                        0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,      const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_20)   = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ASM_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ASM_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_255)  = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
                                                        0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_256)  = { 0x0100010001000100ULL, 0x0100010001000100ULL,
                                                        0x0100010001000100ULL, 0x0100010001000100ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL,
                                                        0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16,     const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
                                                        0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
                                                        0x0400040004000400ULL, 0x0400040004000400ULL};
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
                                                        0x0800080008000800ULL, 0x0800080008000800ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
                                                        0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
                                                        0x1000100010001000ULL, 0x1000100010001000ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
                                                        0x2000200020002000ULL, 0x2000200020002000ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
                                                        0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };

DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL,
                                                        0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL,
                                                        0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_2)    = { 0x0202020202020202ULL, 0x0202020202020202ULL,
                                                        0x0202020202020202ULL, 0x0202020202020202ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL,
                                                        0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(32,     const xmm_reg,  ff_pb_15)   = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL,
                                                        0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
                                                        0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8,      const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED(16,     const xmm_reg,  ff_ps_neg)  = { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_1)    = { 0x0000000100000001ULL, 0x0000000100000001ULL,
                                                        0x0000000100000001ULL, 0x0000000100000001ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_16)   = { 0x0000001000000010ULL, 0x0000001000000010ULL,
                                                        0x0000001000000010ULL, 0x0000001000000010ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 0x0000002000000020ULL,
                                                        0x0000002000000020ULL, 0x0000002000000020ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
                                                        0x0000200000002000ULL, 0x0000200000002000ULL };
DECLARE_ALIGNED(32,     const ymm_reg,  ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
                                                        0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
72
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/constants.h
vendored
Normal file
@@ -0,0 +1,72 @@
/*
 * MMX/SSE constants used across x86 dsp optimizations.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_CONSTANTS_H
#define AVCODEC_X86_CONSTANTS_H

#include <stdint.h>

#include "libavutil/x86/asm.h"

extern const ymm_reg  ff_pw_1;
extern const ymm_reg  ff_pw_2;
extern const xmm_reg  ff_pw_3;
extern const ymm_reg  ff_pw_4;
extern const xmm_reg  ff_pw_5;
extern const xmm_reg  ff_pw_8;
extern const xmm_reg  ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg  ff_pw_16;
extern const xmm_reg  ff_pw_18;
extern const xmm_reg  ff_pw_20;
extern const xmm_reg  ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg  ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const ymm_reg  ff_pw_255;
extern const ymm_reg  ff_pw_256;
extern const ymm_reg  ff_pw_512;
extern const ymm_reg  ff_pw_1023;
extern const ymm_reg  ff_pw_1024;
extern const ymm_reg  ff_pw_2048;
extern const ymm_reg  ff_pw_4095;
extern const ymm_reg  ff_pw_4096;
extern const ymm_reg  ff_pw_8192;
extern const ymm_reg  ff_pw_m1;

extern const ymm_reg  ff_pb_0;
extern const ymm_reg  ff_pb_1;
extern const ymm_reg  ff_pb_2;
extern const ymm_reg  ff_pb_3;
extern const ymm_reg  ff_pb_80;
extern const ymm_reg  ff_pb_FE;
extern const uint64_t ff_pb_FC;

extern const xmm_reg  ff_ps_neg;

extern const ymm_reg  ff_pd_1;
extern const ymm_reg  ff_pd_16;
extern const ymm_reg  ff_pd_32;
extern const ymm_reg  ff_pd_8192;
extern const ymm_reg  ff_pd_65535;

#endif /* AVCODEC_X86_CONSTANTS_H */
52
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/dcadsp_init.c
vendored
Normal file
@@ -0,0 +1,52 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"

#define LFE_FIR_FLOAT_FUNC(opt) \
void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
                             const float *filter_coeff, ptrdiff_t npcmblocks); \
void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
                             const float *filter_coeff, ptrdiff_t npcmblocks);

LFE_FIR_FLOAT_FUNC(sse)
LFE_FIR_FLOAT_FUNC(sse2)
LFE_FIR_FLOAT_FUNC(sse3)
LFE_FIR_FLOAT_FUNC(avx)
LFE_FIR_FLOAT_FUNC(fma3)

av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
    if (EXTERNAL_SSE2(cpu_flags))
        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
    if (EXTERNAL_SSE3(cpu_flags))
        s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
    if (EXTERNAL_AVX(cpu_flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
        s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
    }
    if (EXTERNAL_FMA3(cpu_flags))
        s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}
41
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/dct_init.c
vendored
Normal file
@@ -0,0 +1,41 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"

void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);

av_cold void ff_dct_init_x86(DCTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags))
        s->dct32 = ff_dct32_float_sse;
#endif
    if (EXTERNAL_SSE2(cpu_flags))
        s->dct32 = ff_dct32_float_sse2;
    if (EXTERNAL_AVX_FAST(cpu_flags))
        s->dct32 = ff_dct32_float_avx;
}
229
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/dirac_dwt_init.c
vendored
Normal file
@@ -0,0 +1,229 @@
/*
 * x86 optimized discrete wavelet transform
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dirac_dwt.h"

#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
\
static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    for(i=width_align; i<width; i++) \
        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    for(i=width_align; i<width; i++) \
        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                           uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    for(i=width_align; i<width; i++) \
        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                          uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    for(i=width_align; i<width; i++) \
        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
{ \
    int i, width_align = width&~(align-1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
\
    for(i=width_align; i<width; i++) { \
        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
    } \
\
    ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
    int w2= w>>1;\
    int x= w2 - (w2&(align-1));\
    int16_t *b   = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
\
    for (; x < w2; x++) {\
        b[2*x  ] = tmp[x];\
        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
    }\
}\
static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
    int w2= w>>1;\
    int x= w2 - (w2&(align-1));\
    int16_t *b   = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
\
    for (; x < w2; x++) {\
        b[2*x  ] = (tmp[x] + 1)>>1;\
        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
    }\
}\
\

#if HAVE_X86ASM
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)


void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);

static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
{
    int w2= w>>1;
    int x= w2 - (w2&7);
    int16_t *b   = (int16_t *)_b;
    int16_t *tmp = (int16_t *)_tmp;

    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);

    for (; x < w2; x++) {
        b[2*x  ] = (tmp[x] + 1)>>1;
        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
    }
}
#endif

void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
{
#if HAVE_X86ASM
    int mm_flags = av_get_cpu_flags();

#if !ARCH_X86_64
    if (!(mm_flags & AV_CPU_FLAG_MMX))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
        break;
    case DWT_DIRAC_HAAR0:
        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
        d->horizontal_compose = horizontal_compose_haar0i_mmx;
        break;
    case DWT_DIRAC_HAAR1:
        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
        d->horizontal_compose = horizontal_compose_haar1i_mmx;
        break;
    }
#endif

    if (!(mm_flags & AV_CPU_FLAG_SSE2))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
        break;
    case DWT_DIRAC_HAAR0:
        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
        d->horizontal_compose = horizontal_compose_haar0i_sse2;
        break;
    case DWT_DIRAC_HAAR1:
        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
        d->horizontal_compose = horizontal_compose_haar1i_sse2;
        break;
    }

    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
        break;
    }
#endif // HAVE_X86ASM
}
|
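/* [Editor's note] ff_spatial_idwt_init_x86() above follows FFmpeg's usual
 * run-time dispatch pattern: query the CPU feature flags once, then
 * overwrite generic function pointers with progressively better SIMD
 * versions, so the last supported level wins. Below is a minimal,
 * self-contained sketch of that pattern; MyDSPContext and the process_*
 * names are hypothetical illustrations, not part of this commit. */

#include <stdint.h>
#include "libavutil/cpu.h"

typedef struct MyDSPContext {
    void (*process)(int16_t *buf, int len);
} MyDSPContext;

static void process_c(int16_t *buf, int len)    { /* portable C fallback */ }
static void process_sse2(int16_t *buf, int len) { /* SSE2 kernel (stub)  */ }
static void process_avx2(int16_t *buf, int len) { /* AVX2 kernel (stub)  */ }

static void my_dsp_init(MyDSPContext *c)
{
    int flags = av_get_cpu_flags();

    c->process = process_c;            /* always-valid baseline        */
    if (flags & AV_CPU_FLAG_SSE2)
        c->process = process_sse2;     /* overwrite when supported     */
    if (flags & AV_CPU_FLAG_AVX2)
        c->process = process_avx2;     /* later (better) checks win    */
}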
195
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/diracdsp_init.c
vendored
Normal file
@@ -0,0 +1,195 @@
/*
 * Copyright (C) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/cpu.h"
#include "libavcodec/diracdsp.h"
#include "fpel.h"

DECL_DIRAC_PIXOP(put, mmx);
DECL_DIRAC_PIXOP(avg, mmx);
DECL_DIRAC_PIXOP(avg, mmxext);

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);

void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);

void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);

void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);

void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);

void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);

#if HAVE_X86ASM

#define HPEL_FILTER(MMSIZE, EXT) \
    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
    \
    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
                                          const uint8_t *src, int stride, int width, int height) \
    { \
        while( height-- ) \
        { \
            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
            \
            dsth += stride; \
            dstv += stride; \
            dstc += stride; \
            src  += stride; \
        } \
    }

#define PIXFUNC(PFX, IDX, EXT) \
    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT

#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h&3)\
        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
    else\
        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h&3)\
        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
    else\
        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h&3) {\
        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
    } else {\
        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
    }\
}

DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
DIRAC_PIXOP(avg, ff_avg, mmxext)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3)
        ff_put_dirac_pixels16_c(dst, src, stride, h);
    else
        ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3)
        ff_avg_dirac_pixels16_c(dst, src, stride, h);
    else
        ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3) {
        ff_put_dirac_pixels32_c(dst, src, stride, h);
    } else {
        ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
        ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
    }
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3) {
        ff_avg_dirac_pixels32_c(dst, src, stride, h);
    } else {
        ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
        ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
    }
}

#else // HAVE_X86ASM

#define HPEL_FILTER(MMSIZE, EXT) \
    void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
                                   const uint8_t *src, int stride, int width, int height);

#define PIXFUNC(PFX, IDX, EXT) do {} while (0)

#endif // HAVE_X86ASM

#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)

void ff_diracdsp_init_x86(DiracDSPContext* c)
{
    int mm_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(mm_flags)) {
        c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
#if !ARCH_X86_64
        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
        c->dirac_hpel_filter = dirac_hpel_filter_mmx;
        c->add_rect_clamped = ff_add_rect_clamped_mmx;
        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
#endif
        PIXFUNC(put, 0, mmx);
        PIXFUNC(avg, 0, mmx);
    }

    if (EXTERNAL_MMXEXT(mm_flags)) {
        PIXFUNC(avg, 0, mmxext);
    }

    if (EXTERNAL_SSE2(mm_flags)) {
        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
        c->add_rect_clamped = ff_add_rect_clamped_sse2;
        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;

        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;

        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
    }

    if (EXTERNAL_SSE4(mm_flags)) {
        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
    }
}
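/* [Editor's note] the DIRAC_PIXOP wrappers above all share one guard: the
 * SIMD pixel kernels only handle block heights that are a multiple of 4,
 * so any other height falls back to the C reference. A minimal sketch of
 * that guard; copy_block, copy_block_c and copy_block_simd are hypothetical
 * stand-ins, not part of this commit. */

#include <stdint.h>

static void copy_block_c(uint8_t *dst, const uint8_t *src, int stride, int h)
{
    /* scalar reference: copy h rows of 16 bytes */
    for (int y = 0; y < h; y++)
        for (int x = 0; x < 16; x++)
            dst[y * stride + x] = src[y * stride + x];
}

static void copy_block_simd(uint8_t *dst, const uint8_t *src, int stride, int h)
{
    copy_block_c(dst, src, stride, h); /* stand-in for a 4-rows-at-a-time kernel */
}

static void copy_block(uint8_t *dst, const uint8_t *src, int stride, int h)
{
    if (h & 3)                              /* height not a multiple of 4 */
        copy_block_c(dst, src, stride, h);
    else
        copy_block_simd(dst, src, stride, h);
}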
37
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/dnxhdenc_init.c
vendored
Normal file
@@ -0,0 +1,37 @@
/*
 * VC3/DNxHD SIMD functions
 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 *
 * VC-3 encoder funded by the British Broadcasting Corporation
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dnxhdenc.h"

void ff_get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size);

av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
    if (EXTERNAL_SSE2(av_get_cpu_flags())) {
        if (ctx->cid_table->bit_depth == 8)
            ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_sse2;
    }
}
52
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/exrdsp_init.c
vendored
Normal file
@@ -0,0 +1,52 @@
/*
 * OpenEXR (.exr) image decoder
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/exrdsp.h"

void ff_reorder_pixels_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);

void ff_reorder_pixels_avx2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);

void ff_predictor_ssse3(uint8_t *src, ptrdiff_t size);

void ff_predictor_avx(uint8_t *src, ptrdiff_t size);

void ff_predictor_avx2(uint8_t *src, ptrdiff_t size);

av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags)) {
        dsp->reorder_pixels = ff_reorder_pixels_sse2;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        dsp->predictor = ff_predictor_ssse3;
    }
    if (EXTERNAL_AVX(cpu_flags)) {
        dsp->predictor = ff_predictor_avx;
    }
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        dsp->reorder_pixels = ff_reorder_pixels_avx2;
        dsp->predictor      = ff_predictor_avx2;
    }
}
594
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/fdct.c
vendored
Normal file
@@ -0,0 +1,594 @@
/*
 * SIMD-optimized forward DCT
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
 *
 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
 *
 * Intel Application Note AP-922 - fast, precise implementation of DCT
 * http://developer.intel.com/vtune/cbts/appnotes.htm
 *
 * Also of inspiration:
 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
 * Skal's fdct at http://skal.planet-d.net/coding/dct.html
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "fdct.h"

#if HAVE_MMX_INLINE

//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries! Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////

#define BITS_FRW_ACC   3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL  BITS_FRW_ACC
#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))

#define X8(x) x,x,x,x,x,x,x,x

//concatenated table, for forward DCT transformation
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
    X8(13036),  // tg * (2<<16) + 0.5
    X8(27146),  // tg * (2<<16) + 0.5
    X8(-21746)  // tg * (2<<16) + 0.5
};

DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
    X8(23170)   //cos * (2<<15) + 0.5
};

DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };

DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };

static const struct
{
    DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
{{
    RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};

DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = {  // forward_dct coeff table
    16384,  16384,  22725,  19266,
    16384,  16384,  12873,   4520,
    21407,   8867,  19266,  -4520,
    -8867, -21407, -22725, -12873,
    16384, -16384,  12873, -22725,
   -16384,  16384,   4520,  19266,
     8867, -21407,   4520, -12873,
    21407,  -8867,  19266, -22725,

    22725,  22725,  31521,  26722,
    22725,  22725,  17855,   6270,
    29692,  12299,  26722,  -6270,
   -12299, -29692, -31521, -17855,
    22725, -22725,  17855, -31521,
   -22725,  22725,   6270,  26722,
    12299, -29692,   6270, -17855,
    29692, -12299,  26722, -31521,

    21407,  21407,  29692,  25172,
    21407,  21407,  16819,   5906,
    27969,  11585,  25172,  -5906,
   -11585, -27969, -29692, -16819,
    21407, -21407,  16819, -29692,
   -21407,  21407,   5906,  25172,
    11585, -27969,   5906, -16819,
    27969, -11585,  25172, -29692,

    19266,  19266,  26722,  22654,
    19266,  19266,  15137,   5315,
    25172,  10426,  22654,  -5315,
   -10426, -25172, -26722, -15137,
    19266, -19266,  15137, -26722,
   -19266,  19266,   5315,  22654,
    10426, -25172,   5315, -15137,
    25172, -10426,  22654, -26722,

    16384,  16384,  22725,  19266,
    16384,  16384,  12873,   4520,
    21407,   8867,  19266,  -4520,
    -8867, -21407, -22725, -12873,
    16384, -16384,  12873, -22725,
   -16384,  16384,   4520,  19266,
     8867, -21407,   4520, -12873,
    21407,  -8867,  19266, -22725,

    19266,  19266,  26722,  22654,
    19266,  19266,  15137,   5315,
    25172,  10426,  22654,  -5315,
   -10426, -25172, -26722, -15137,
    19266, -19266,  15137, -26722,
   -19266,  19266,   5315,  22654,
    10426, -25172,   5315, -15137,
    25172, -10426,  22654, -26722,

    21407,  21407,  29692,  25172,
    21407,  21407,  16819,   5906,
    27969,  11585,  25172,  -5906,
   -11585, -27969, -29692, -16819,
    21407, -21407,  16819, -29692,
   -21407,  21407,   5906,  25172,
    11585, -27969,   5906, -16819,
    27969, -11585,  25172, -29692,

    22725,  22725,  31521,  26722,
    22725,  22725,  17855,   6270,
    29692,  12299,  26722,  -6270,
   -12299, -29692, -31521, -17855,
    22725, -22725,  17855, -31521,
   -22725,  22725,   6270,  26722,
    12299, -29692,   6270, -17855,
    29692, -12299,  26722, -31521,
};

static const struct
{
    DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = {  // forward_dct coeff table
#define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
                   C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
                  -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
                   C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};

#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long

#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
    __asm__ volatile (\
        #mov"      16(%0),  %%"#mm"0 \n\t" \
        #mov"      96(%0),  %%"#mm"1 \n\t" \
        #mov"    %%"#mm"0,  %%"#mm"2 \n\t" \
        #mov"      32(%0),  %%"#mm"3 \n\t" \
        "paddsw  %%"#mm"1,  %%"#mm"0 \n\t" \
        #mov"      80(%0),  %%"#mm"4 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
        #mov"        (%0),  %%"#mm"5 \n\t" \
        "paddsw  %%"#mm"3,  %%"#mm"4 \n\t" \
        "paddsw   112(%0),  %%"#mm"5 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
        #mov"    %%"#mm"0,  %%"#mm"6 \n\t" \
        "psubsw  %%"#mm"1,  %%"#mm"2 \n\t" \
        #mov"      16(%1),  %%"#mm"1 \n\t" \
        "psubsw  %%"#mm"4,  %%"#mm"0 \n\t" \
        #mov"      48(%0),  %%"#mm"7 \n\t" \
        "pmulhw  %%"#mm"0,  %%"#mm"1 \n\t" \
        "paddsw    64(%0),  %%"#mm"7 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
        "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
        #mov"    %%"#mm"5,  %%"#mm"4 \n\t" \
        "psubsw  %%"#mm"7,  %%"#mm"5 \n\t" \
        "paddsw  %%"#mm"5,  %%"#mm"1 \n\t" \
        "paddsw  %%"#mm"7,  %%"#mm"4 \n\t" \
        "por         (%2),  %%"#mm"1 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
        "pmulhw    16(%1),  %%"#mm"5 \n\t" \
        #mov"    %%"#mm"4,  %%"#mm"7 \n\t" \
        "psubsw    80(%0),  %%"#mm"3 \n\t" \
        "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
        #mov"    %%"#mm"1,  32(%3) \n\t" \
        "paddsw  %%"#mm"6,  %%"#mm"7 \n\t" \
        #mov"      48(%0),  %%"#mm"1 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
        "psubsw    64(%0),  %%"#mm"1 \n\t" \
        #mov"    %%"#mm"2,  %%"#mm"6 \n\t" \
        #mov"    %%"#mm"4,  64(%3) \n\t" \
        "paddsw  %%"#mm"3,  %%"#mm"2 \n\t" \
        "pmulhw      (%4),  %%"#mm"2 \n\t" \
        "psubsw  %%"#mm"3,  %%"#mm"6 \n\t" \
        "pmulhw      (%4),  %%"#mm"6 \n\t" \
        "psubsw  %%"#mm"0,  %%"#mm"5 \n\t" \
        "por         (%2),  %%"#mm"5 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
        "por         (%2),  %%"#mm"2 \n\t" \
        #mov"    %%"#mm"1,  %%"#mm"4 \n\t" \
        #mov"        (%0),  %%"#mm"3 \n\t" \
        "paddsw  %%"#mm"6,  %%"#mm"1 \n\t" \
        "psubsw   112(%0),  %%"#mm"3 \n\t" \
        "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
        #mov"        (%1),  %%"#mm"0 \n\t" \
        "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
        #mov"      32(%1),  %%"#mm"6 \n\t" \
        "pmulhw  %%"#mm"1,  %%"#mm"0 \n\t" \
        #mov"    %%"#mm"7,  (%3) \n\t" \
        "pmulhw  %%"#mm"4,  %%"#mm"6 \n\t" \
        #mov"    %%"#mm"5,  96(%3) \n\t" \
        #mov"    %%"#mm"3,  %%"#mm"7 \n\t" \
        #mov"      32(%1),  %%"#mm"5 \n\t" \
        "psubsw  %%"#mm"2,  %%"#mm"7 \n\t" \
        "paddsw  %%"#mm"2,  %%"#mm"3 \n\t" \
        "pmulhw  %%"#mm"7,  %%"#mm"5 \n\t" \
        "paddsw  %%"#mm"3,  %%"#mm"0 \n\t" \
        "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
        "pmulhw      (%1),  %%"#mm"3 \n\t" \
        "por         (%2),  %%"#mm"0 \n\t" \
        "paddsw  %%"#mm"7,  %%"#mm"5 \n\t" \
        "psubsw  %%"#mm"6,  %%"#mm"7 \n\t" \
        #mov"    %%"#mm"0,  16(%3) \n\t" \
        "paddsw  %%"#mm"4,  %%"#mm"5 \n\t" \
        #mov"    %%"#mm"7,  48(%3) \n\t" \
        "psubsw  %%"#mm"1,  %%"#mm"3 \n\t" \
        #mov"    %%"#mm"5,  80(%3) \n\t" \
        #mov"    %%"#mm"3, 112(%3) \n\t" \
        : \
        : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
          "r" (out + offset), "r" (ocos_4_16)); \
}

FDCT_COL(mmx, mm, movq)
FDCT_COL(sse2, xmm, movdqa)

static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
    __asm__ volatile(
#define FDCT_ROW_SSE2_H1(i,t)                    \
        "movq      " #i "(%0), %%xmm2      \n\t" \
        "movq      " #i "+8(%0), %%xmm0    \n\t" \
        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
        "movdqa    " #t "+48(%1), %%xmm7   \n\t" \
        "movdqa    " #t "(%1), %%xmm4      \n\t" \
        "movdqa    " #t "+16(%1), %%xmm5   \n\t"

#define FDCT_ROW_SSE2_H2(i,t)                    \
        "movq      " #i "(%0), %%xmm2      \n\t" \
        "movq      " #i "+8(%0), %%xmm0    \n\t" \
        "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
        "movdqa    " #t "+48(%1), %%xmm7   \n\t"

#define FDCT_ROW_SSE2(i)                         \
        "movq      %%xmm2, %%xmm1          \n\t" \
        "pshuflw   $27, %%xmm0, %%xmm0     \n\t" \
        "paddsw    %%xmm0, %%xmm1          \n\t" \
        "psubsw    %%xmm0, %%xmm2          \n\t" \
        "punpckldq %%xmm2, %%xmm1          \n\t" \
        "pshufd    $78, %%xmm1, %%xmm2     \n\t" \
        "pmaddwd   %%xmm2, %%xmm3          \n\t" \
        "pmaddwd   %%xmm1, %%xmm7          \n\t" \
        "pmaddwd   %%xmm5, %%xmm2          \n\t" \
        "pmaddwd   %%xmm4, %%xmm1          \n\t" \
        "paddd     %%xmm7, %%xmm3          \n\t" \
        "paddd     %%xmm2, %%xmm1          \n\t" \
        "paddd     %%xmm6, %%xmm3          \n\t" \
        "paddd     %%xmm6, %%xmm1          \n\t" \
        "psrad     %3, %%xmm3              \n\t" \
        "psrad     %3, %%xmm1              \n\t" \
        "packssdw  %%xmm3, %%xmm1          \n\t" \
        "movdqa    %%xmm1, " #i "(%4)      \n\t"

        "movdqa    (%2), %%xmm6            \n\t"
        FDCT_ROW_SSE2_H1(0,0)
        FDCT_ROW_SSE2(0)
        FDCT_ROW_SSE2_H2(64,0)
        FDCT_ROW_SSE2(64)

        FDCT_ROW_SSE2_H1(16,64)
        FDCT_ROW_SSE2(16)
        FDCT_ROW_SSE2_H2(112,64)
        FDCT_ROW_SSE2(112)

        FDCT_ROW_SSE2_H1(32,128)
        FDCT_ROW_SSE2(32)
        FDCT_ROW_SSE2_H2(96,128)
        FDCT_ROW_SSE2(96)

        FDCT_ROW_SSE2_H1(48,192)
        FDCT_ROW_SSE2(48)
        FDCT_ROW_SSE2_H2(80,192)
        FDCT_ROW_SSE2(80)
        :
        : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
          "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
    );
}

static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
                                             const int16_t *table)
{
    __asm__ volatile (
        "pshufw    $0x1B, 8(%0), %%mm5 \n\t"
        "movq       (%0), %%mm0 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "paddsw    %%mm5, %%mm0 \n\t"
        "psubsw    %%mm5, %%mm1 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "punpckldq %%mm1, %%mm0 \n\t"
        "punpckhdq %%mm1, %%mm2 \n\t"
        "movq       (%1), %%mm1 \n\t"
        "movq      8(%1), %%mm3 \n\t"
        "movq     16(%1), %%mm4 \n\t"
        "movq     24(%1), %%mm5 \n\t"
        "movq     32(%1), %%mm6 \n\t"
        "movq     40(%1), %%mm7 \n\t"
        "pmaddwd   %%mm0, %%mm1 \n\t"
        "pmaddwd   %%mm2, %%mm3 \n\t"
        "pmaddwd   %%mm0, %%mm4 \n\t"
        "pmaddwd   %%mm2, %%mm5 \n\t"
        "pmaddwd   %%mm0, %%mm6 \n\t"
        "pmaddwd   %%mm2, %%mm7 \n\t"
        "pmaddwd  48(%1), %%mm0 \n\t"
        "pmaddwd  56(%1), %%mm2 \n\t"
        "paddd     %%mm1, %%mm3 \n\t"
        "paddd     %%mm4, %%mm5 \n\t"
        "paddd     %%mm6, %%mm7 \n\t"
        "paddd     %%mm0, %%mm2 \n\t"
        "movq       (%2), %%mm0 \n\t"
        "paddd     %%mm0, %%mm3 \n\t"
        "paddd     %%mm0, %%mm5 \n\t"
        "paddd     %%mm0, %%mm7 \n\t"
        "paddd     %%mm0, %%mm2 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
        "packssdw  %%mm5, %%mm3 \n\t"
        "packssdw  %%mm2, %%mm7 \n\t"
        "movq      %%mm3,  (%3) \n\t"
        "movq      %%mm7, 8(%3) \n\t"
        :
        : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}

static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
    //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
    __asm__ volatile(
        "movd     12(%0), %%mm1 \n\t"
        "punpcklwd 8(%0), %%mm1 \n\t"
        "movq      %%mm1, %%mm2 \n\t"
        "psrlq     $0x20, %%mm1 \n\t"
        "movq      0(%0), %%mm0 \n\t"
        "punpcklwd %%mm2, %%mm1 \n\t"
        "movq      %%mm0, %%mm5 \n\t"
        "paddsw    %%mm1, %%mm0 \n\t"
        "psubsw    %%mm1, %%mm5 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "punpckldq %%mm5, %%mm0 \n\t"
        "punpckhdq %%mm5, %%mm2 \n\t"
        "movq      0(%1), %%mm1 \n\t"
        "movq      8(%1), %%mm3 \n\t"
        "movq     16(%1), %%mm4 \n\t"
        "movq     24(%1), %%mm5 \n\t"
        "movq     32(%1), %%mm6 \n\t"
        "movq     40(%1), %%mm7 \n\t"
        "pmaddwd   %%mm0, %%mm1 \n\t"
        "pmaddwd   %%mm2, %%mm3 \n\t"
        "pmaddwd   %%mm0, %%mm4 \n\t"
        "pmaddwd   %%mm2, %%mm5 \n\t"
        "pmaddwd   %%mm0, %%mm6 \n\t"
        "pmaddwd   %%mm2, %%mm7 \n\t"
        "pmaddwd  48(%1), %%mm0 \n\t"
        "pmaddwd  56(%1), %%mm2 \n\t"
        "paddd     %%mm1, %%mm3 \n\t"
        "paddd     %%mm4, %%mm5 \n\t"
        "paddd     %%mm6, %%mm7 \n\t"
        "paddd     %%mm0, %%mm2 \n\t"
        "movq       (%2), %%mm0 \n\t"
        "paddd     %%mm0, %%mm3 \n\t"
        "paddd     %%mm0, %%mm5 \n\t"
        "paddd     %%mm0, %%mm7 \n\t"
        "paddd     %%mm0, %%mm2 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
        "packssdw  %%mm5, %%mm3 \n\t"
        "packssdw  %%mm2, %%mm7 \n\t"
        "movq      %%mm3, 0(%3) \n\t"
        "movq      %%mm7, 8(%3) \n\t"
        :
        : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}

void ff_fdct_mmx(int16_t *block)
{
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t * block1= (int16_t*)align_tmp;
    const int16_t *table= tab_frw_01234567;
    int i;

    fdct_col_mmx(block, block1, 0);
    fdct_col_mmx(block, block1, 4);

    for(i=8;i>0;i--) {
        fdct_row_mmx(block1, block, table);
        block1 += 8;
        table += 32;
        block += 8;
    }
}

#endif /* HAVE_MMX_INLINE */

#if HAVE_MMXEXT_INLINE

void ff_fdct_mmxext(int16_t *block)
{
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t *block1= (int16_t*)align_tmp;
    const int16_t *table= tab_frw_01234567;
    int i;

    fdct_col_mmx(block, block1, 0);
    fdct_col_mmx(block, block1, 4);

    for(i=8;i>0;i--) {
        fdct_row_mmxext(block1, block, table);
        block1 += 8;
        table += 32;
        block += 8;
    }
}

#endif /* HAVE_MMXEXT_INLINE */

#if HAVE_SSE2_INLINE

void ff_fdct_sse2(int16_t *block)
{
    DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
    int16_t * const block1= (int16_t*)align_tmp;

    fdct_col_sse2(block, block1, 0);
    fdct_row_sse2(block1, block);
}

#endif /* HAVE_SSE2_INLINE */
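/* [Editor's note] the comment block at the top of fdct.c insists that the
 * DCT constants land on aligned boundaries, because movq/movdqa operands
 * must be 8/16-byte aligned. FFmpeg guarantees this with DECLARE_ALIGNED,
 * as the tables above do; a minimal sketch (in ffmpeg-4 the macro should
 * come from libavutil/mem.h, an assumption worth double-checking): */

#include "libavutil/mem.h"

/* 16-byte alignment, safe for SSE2 movdqa loads */
DECLARE_ALIGNED(16, static const int16_t, example_consts)[8] = {
    1, 1, 1, 1, 1, 1, 1, 1
};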
28
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/fdct.h
vendored
Normal file
@@ -0,0 +1,28 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_FDCT_H
#define AVCODEC_X86_FDCT_H

#include <stdint.h>

void ff_fdct_mmx(int16_t *block);
void ff_fdct_mmxext(int16_t *block);
void ff_fdct_sse2(int16_t *block);

#endif /* AVCODEC_X86_FDCT_H */
44
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/fdctdsp_init.c
vendored
Normal file
@@ -0,0 +1,44 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fdctdsp.h"
#include "fdct.h"

av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();
    const int dct_algo = avctx->dct_algo;

    if (!high_bit_depth) {
        if ((dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) {
            if (INLINE_MMX(cpu_flags))
                c->fdct = ff_fdct_mmx;

            if (INLINE_MMXEXT(cpu_flags))
                c->fdct = ff_fdct_mmxext;

            if (INLINE_SSE2(cpu_flags))
                c->fdct = ff_fdct_sse2;
        }
    }
}
1085
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/fft.asm
vendored
Normal file
File diff suppressed because it is too large
38
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/fft.h
vendored
Normal file
@@ -0,0 +1,38 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H

#include "libavcodec/fft.h"

void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);

void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);

#endif /* AVCODEC_X86_FFT_H */
61
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/fft_init.c
vendored
Normal file
@@ -0,0 +1,61 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"

#include "fft.h"

av_cold void ff_fft_init_x86(FFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (s->nbits > 16)
        return;

#if ARCH_X86_32
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        s->imdct_calc = ff_imdct_calc_3dnow;
        s->imdct_half = ff_imdct_half_3dnow;
        s->fft_calc   = ff_fft_calc_3dnow;
    }

    if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
        s->imdct_calc = ff_imdct_calc_3dnowext;
        s->imdct_half = ff_imdct_half_3dnowext;
        s->fft_calc   = ff_fft_calc_3dnowext;
    }
#endif /* ARCH_X86_32 */

    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct_calc  = ff_imdct_calc_sse;
        s->imdct_half  = ff_imdct_half_sse;
        s->fft_permute = ff_fft_permute_sse;
        s->fft_calc    = ff_fft_calc_sse;
        s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
    }

    if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
        s->imdct_half      = ff_imdct_half_avx;
        s->fft_calc        = ff_fft_calc_avx;
        s->fft_permutation = FF_FFT_PERM_AVX;
    }
}
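/* [Editor's note] ff_fft_init_x86() only swaps SIMD kernels into an
 * FFTContext; callers normally reach it through the public avfft.h
 * wrappers. A minimal usage sketch against the FFmpeg 4.x avfft API,
 * offered as an illustration rather than part of this commit: */

#include "libavcodec/avfft.h"
#include "libavutil/mem.h"

static void run_forward_fft(void)
{
    const int nbits = 10;                     /* 1 << 10 = 1024 points  */
    FFTContext *ctx = av_fft_init(nbits, 0);  /* 0 = forward transform  */
    FFTComplex *z   = av_malloc(sizeof(*z) << nbits);

    /* ... fill z[i].re / z[i].im with input samples ... */

    av_fft_permute(ctx, z);                   /* reorder input as the
                                                 selected kernel expects */
    av_fft_calc(ctx, z);                      /* in-place transform      */

    av_free(z);
    av_fft_end(ctx);
}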
115
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/flacdsp_init.c
vendored
Normal file
@@ -0,0 +1,115 @@
/*
 * Copyright (c) 2014 James Almer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/flacdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"

void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
                        int qlevel, int len);

void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);

#define DECORRELATE_FUNCS(fmt, opt) \
void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                          int len, int shift); \
void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                          int len, int shift); \
void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                          int len, int shift); \
void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift); \
void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift); \
void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift); \
void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
                                              int len, int shift)

DECORRELATE_FUNCS(16, sse2);
DECORRELATE_FUNCS(16, avx);
DECORRELATE_FUNCS(32, sse2);
DECORRELATE_FUNCS(32, avx);

av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                                 int bps)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

#if CONFIG_FLAC_DECODER
    if (EXTERNAL_SSE2(cpu_flags)) {
        if (fmt == AV_SAMPLE_FMT_S16) {
            if (channels == 2)
                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
            else if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
        } else if (fmt == AV_SAMPLE_FMT_S32) {
            if (channels == 2)
                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
            else if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
        }
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        c->lpc32 = ff_flac_lpc_32_sse4;
    }
    if (EXTERNAL_AVX(cpu_flags)) {
        if (fmt == AV_SAMPLE_FMT_S16) {
            if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
        } else if (fmt == AV_SAMPLE_FMT_S32) {
            if (channels == 4)
                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
            else if (channels == 6)
                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
            else if (ARCH_X86_64 && channels == 8)
                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
        }
    }
    if (EXTERNAL_XOP(cpu_flags)) {
        c->lpc32 = ff_flac_lpc_32_xop;
    }
#endif

#if CONFIG_FLAC_ENCODER
    if (EXTERNAL_SSE4(cpu_flags)) {
        if (CONFIG_GPL)
            c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
    }
#endif
#endif /* HAVE_X86ASM */
}
55
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/fmtconvert_init.c
vendored
Normal file
@@ -0,0 +1,55 @@
/*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/fmtconvert.h"

#if HAVE_X86ASM

void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
                                        const float *mul, int len);
void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
                                        const float *mul, int len);

#endif /* HAVE_X86ASM */

av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags)) {
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
    }
#endif /* HAVE_X86ASM */
}
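/* [Editor's note] the kernels selected above implement a very small
 * contract: int32_to_float_fmul_scalar converts int32 samples to float and
 * scales them. A sketch of the equivalent scalar loop, matching (to the
 * best of our reading) the C reference in libavcodec/fmtconvert.c: */

#include <stdint.h>

static void int32_to_float_fmul_scalar_c(float *dst, const int32_t *src,
                                         float mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;   /* int32 -> float, scaled by mul */
}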
49
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/fpel.h
vendored
Normal file
@@ -0,0 +1,49 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_FPEL_H
#define AVCODEC_X86_FPEL_H

#include <stddef.h>
#include <stdint.h>

void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);


#endif /* AVCODEC_X86_FPEL_H */
35
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/g722dsp_init.c
vendored
Normal file
@@ -0,0 +1,35 @@
/*
 * Copyright (c) 2014 James Almer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/g722dsp.h"

void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);

av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags))
        dsp->apply_qmf = ff_g722_apply_qmf_sse2;
}
39
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/h263dsp_init.c
vendored
Normal file
@@ -0,0 +1,39 @@
/*
 * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h263dsp.h"

void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);

av_cold void ff_h263dsp_init_x86(H263DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
    }
}
208
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/h264_cabac.c
vendored
Normal file
@@ -0,0 +1,208 @@
/*
|
||||
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
|
||||
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* H.264 / AVC / MPEG-4 part10 codec.
|
||||
* non-SIMD x86-specific optimizations for H.264
|
||||
* @author Michael Niedermayer <michaelni@gmx.at>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "libavcodec/cabac.h"
|
||||
#include "cabac.h"
|
||||
|
||||
#if HAVE_INLINE_ASM
|
||||
|
||||
#if ARCH_X86_64
|
||||
#define REG64 "r"
|
||||
#else
|
||||
#define REG64 "m"
|
||||
#endif
|
||||
|
||||
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
|
||||
//as that would make optimization work hard)
|
||||
#if HAVE_7REGS && !BROKEN_COMPILER
|
||||
#define decode_significance decode_significance_x86
|
||||
static int decode_significance_x86(CABACContext *c, int max_coeff,
|
||||
uint8_t *significant_coeff_ctx_base,
|
||||
int *index, x86_reg last_off){
|
||||
void *end= significant_coeff_ctx_base + max_coeff - 1;
|
||||
int minusstart= -(intptr_t)significant_coeff_ctx_base;
|
||||
int minusindex= 4-(intptr_t)index;
|
||||
int bit;
|
||||
x86_reg coeff_count;
|
||||
|
||||
#ifdef BROKEN_RELOCATIONS
|
||||
void *tables;
|
||||
|
||||
__asm__ volatile(
|
||||
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
|
||||
: "=&r"(tables)
|
||||
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
|
||||
);
|
||||
#endif
|
||||
|
||||
__asm__ volatile(
|
||||
"3: \n\t"
|
||||
|
||||
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
|
||||
"%5", "%q5", "%k0", "%b0",
|
||||
"%c11(%6)", "%c12(%6)",
|
||||
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
|
||||
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
|
||||
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
|
||||
"%13")
|
||||
|
||||
"test $1, %4 \n\t"
|
||||
" jz 4f \n\t"
|
||||
"add %10, %1 \n\t"
|
||||
|
||||
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
|
||||
"%5", "%q5", "%k0", "%b0",
|
||||
"%c11(%6)", "%c12(%6)",
|
||||
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
|
||||
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
|
||||
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
|
||||
"%13")
|
||||
|
||||
"sub %10, %1 \n\t"
|
||||
"mov %2, %0 \n\t"
|
||||
"movl %7, %%ecx \n\t"
|
||||
"add %1, %%"FF_REG_c" \n\t"
|
||||
"movl %%ecx, (%0) \n\t"
|
||||
|
||||
"test $1, %4 \n\t"
|
||||
" jnz 5f \n\t"
|
||||
|
||||
"add"FF_OPSIZE" $4, %2 \n\t"
|
||||
|
||||
"4: \n\t"
|
||||
"add $1, %1 \n\t"
|
||||
"cmp %8, %1 \n\t"
|
||||
" jb 3b \n\t"
|
||||
"mov %2, %0 \n\t"
|
||||
"movl %7, %%ecx \n\t"
|
||||
"add %1, %%"FF_REG_c" \n\t"
|
||||
"movl %%ecx, (%0) \n\t"
|
||||
"5: \n\t"
|
||||
"add %9, %k0 \n\t"
|
||||
"shr $2, %k0 \n\t"
|
||||
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
|
||||
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
|
||||
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
|
||||
"i"(offsetof(CABACContext, bytestream)),
|
||||
"i"(offsetof(CABACContext, bytestream_end))
|
||||
TABLES_ARG
|
||||
: "%"FF_REG_c, "memory"
|
||||
);
|
||||
return coeff_count;
|
||||
}
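
/*
 * Semantics of decode_significance_x86, as a C sketch (not the exact
 * portable fallback in libavcodec/h264_cabac.c): walk the significance map
 * of a residual block, and for each significant coefficient also decode the
 * "last" flag whose context lives last_off bytes further on:
 *
 *     int i = 0, coeff_count = 0;
 *     while (i < max_coeff - 1) {
 *         if (get_cabac(c, significant_coeff_ctx_base + i)) {
 *             index[coeff_count++] = i;
 *             if (get_cabac(c, significant_coeff_ctx_base + i + last_off))
 *                 return coeff_count;       // that was the last one
 *         }
 *         i++;
 *     }
 *     index[coeff_count++] = max_coeff - 1; // implicitly significant
 *     return coeff_count;
 *
 * The asm keeps everything in registers and uses the branchless
 * BRANCHLESS_GET_CABAC kernel for the two context reads per position.
 */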

#define decode_significance_8x8 decode_significance_8x8_x86
static int decode_significance_8x8_x86(CABACContext *c,
                                       uint8_t *significant_coeff_ctx_base,
                                       int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
    int minusindex= 4-(intptr_t)index;
    int bit;
    x86_reg coeff_count;
    x86_reg last=0;
    x86_reg state;

#ifdef BROKEN_RELOCATIONS
    void *tables;

    __asm__ volatile(
        "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
        : "=&r"(tables)
        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
    );
#endif

    __asm__ volatile(
        "mov %1, %6                             \n\t"
        "3:                                     \n\t"

        "mov %10, %0                            \n\t"
        "movzb (%0, %6), %6                     \n\t"
        "add %9, %6                             \n\t"

        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
                             "%5", "%q5", "%k0", "%b0",
                             "%c12(%7)", "%c13(%7)",
                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%15")

        "mov %1, %6                             \n\t"
        "test $1, %4                            \n\t"
        " jz 4f                                 \n\t"

#ifdef BROKEN_RELOCATIONS
        "movzb %c14(%15, %q6), %6\n\t"
#else
        "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
#endif
        "add %11, %6                            \n\t"

        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
                             "%5", "%q5", "%k0", "%b0",
                             "%c12(%7)", "%c13(%7)",
                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%15")

        "mov %2, %0                             \n\t"
        "mov %1, %6                             \n\t"
        "mov %k6, (%0)                          \n\t"

        "test $1, %4                            \n\t"
        " jnz 5f                                \n\t"

        "add"FF_OPSIZE" $4, %2                  \n\t"

        "4:                                     \n\t"
        "add $1, %6                             \n\t"
        "mov %6, %1                             \n\t"
        "cmp $63, %6                            \n\t"
        " jb 3b                                 \n\t"
        "mov %2, %0                             \n\t"
        "mov %k6, (%0)                          \n\t"
        "5:                                     \n\t"
        "addl %8, %k0                           \n\t"
        "shr $2, %k0                            \n\t"
        : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
          "=&r"(bit), "+&r"(c->range), "=&r"(state)
        : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
          REG64(sig_off), REG64(last_coeff_ctx_base),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end)),
          "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
        : "%"FF_REG_c, "memory"
    );
    return coeff_count;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */

#endif /* HAVE_INLINE_ASM */
410
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/h264_intrapred_init.c
vendored
Normal file
@@ -0,0 +1,410 @@
/*
 * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"

#define PRED4x4(TYPE, DEPTH, OPT) \
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                    const uint8_t *topright, \
                                                    ptrdiff_t stride);
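
/*
 * Each PRED* invocation below only declares the prototype of an assembly
 * implementation; for example PRED4x4(dc, 10, mmxext) expands to
 *
 *     void ff_pred4x4_dc_10_mmxext(uint8_t *src, const uint8_t *topright,
 *                                  ptrdiff_t stride);
 *
 * i.e. the naming scheme is ff_pred<size>_<mode>_<bitdepth>_<cpu>.
 */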

PRED4x4(dc, 10, mmxext)
PRED4x4(down_left, 10, sse2)
PRED4x4(down_left, 10, avx)
PRED4x4(down_right, 10, sse2)
PRED4x4(down_right, 10, ssse3)
PRED4x4(down_right, 10, avx)
PRED4x4(vertical_left, 10, sse2)
PRED4x4(vertical_left, 10, avx)
PRED4x4(vertical_right, 10, sse2)
PRED4x4(vertical_right, 10, ssse3)
PRED4x4(vertical_right, 10, avx)
PRED4x4(horizontal_up, 10, mmxext)
PRED4x4(horizontal_down, 10, sse2)
PRED4x4(horizontal_down, 10, ssse3)
PRED4x4(horizontal_down, 10, avx)

#define PRED8x8(TYPE, DEPTH, OPT) \
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                    ptrdiff_t stride);

PRED8x8(dc, 10, mmxext)
PRED8x8(dc, 10, sse2)
PRED8x8(top_dc, 10, sse2)
PRED8x8(plane, 10, sse2)
PRED8x8(vertical, 10, sse2)
PRED8x8(horizontal, 10, sse2)

#define PRED8x8L(TYPE, DEPTH, OPT)\
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                     int has_topleft, \
                                                     int has_topright, \
                                                     ptrdiff_t stride);

PRED8x8L(dc, 10, sse2)
PRED8x8L(dc, 10, avx)
PRED8x8L(128_dc, 10, mmxext)
PRED8x8L(128_dc, 10, sse2)
PRED8x8L(top_dc, 10, sse2)
PRED8x8L(top_dc, 10, avx)
PRED8x8L(vertical, 10, sse2)
PRED8x8L(vertical, 10, avx)
PRED8x8L(horizontal, 10, sse2)
PRED8x8L(horizontal, 10, ssse3)
PRED8x8L(horizontal, 10, avx)
PRED8x8L(down_left, 10, sse2)
PRED8x8L(down_left, 10, ssse3)
PRED8x8L(down_left, 10, avx)
PRED8x8L(down_right, 10, sse2)
PRED8x8L(down_right, 10, ssse3)
PRED8x8L(down_right, 10, avx)
PRED8x8L(vertical_right, 10, sse2)
PRED8x8L(vertical_right, 10, ssse3)
PRED8x8L(vertical_right, 10, avx)
PRED8x8L(horizontal_up, 10, sse2)
PRED8x8L(horizontal_up, 10, ssse3)
PRED8x8L(horizontal_up, 10, avx)

#define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                      ptrdiff_t stride);

PRED16x16(dc, 10, mmxext)
PRED16x16(dc, 10, sse2)
PRED16x16(top_dc, 10, mmxext)
PRED16x16(top_dc, 10, sse2)
PRED16x16(128_dc, 10, mmxext)
PRED16x16(128_dc, 10, sse2)
PRED16x16(left_dc, 10, mmxext)
PRED16x16(left_dc, 10, sse2)
PRED16x16(vertical, 10, mmxext)
PRED16x16(vertical, 10, sse2)
PRED16x16(horizontal, 10, mmxext)
PRED16x16(horizontal, 10, sse2)

/* 8-bit versions */
PRED16x16(vertical, 8, mmx)
PRED16x16(vertical, 8, sse)
PRED16x16(horizontal, 8, mmx)
PRED16x16(horizontal, 8, mmxext)
PRED16x16(horizontal, 8, ssse3)
PRED16x16(dc, 8, mmxext)
PRED16x16(dc, 8, sse2)
PRED16x16(dc, 8, ssse3)
PRED16x16(plane_h264, 8, mmx)
PRED16x16(plane_h264, 8, mmxext)
PRED16x16(plane_h264, 8, sse2)
PRED16x16(plane_h264, 8, ssse3)
PRED16x16(plane_rv40, 8, mmx)
PRED16x16(plane_rv40, 8, mmxext)
PRED16x16(plane_rv40, 8, sse2)
PRED16x16(plane_rv40, 8, ssse3)
PRED16x16(plane_svq3, 8, mmx)
PRED16x16(plane_svq3, 8, mmxext)
PRED16x16(plane_svq3, 8, sse2)
PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
PRED16x16(tm_vp8, 8, avx2)

PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
PRED8x8(dc, 8, mmxext)
PRED8x8(vertical, 8, mmx)
PRED8x8(horizontal, 8, mmx)
PRED8x8(horizontal, 8, mmxext)
PRED8x8(horizontal, 8, ssse3)
PRED8x8(plane, 8, mmx)
PRED8x8(plane, 8, mmxext)
PRED8x8(plane, 8, sse2)
PRED8x8(plane, 8, ssse3)
PRED8x8(tm_vp8, 8, mmx)
PRED8x8(tm_vp8, 8, mmxext)
PRED8x8(tm_vp8, 8, sse2)
PRED8x8(tm_vp8, 8, ssse3)

PRED8x8L(top_dc, 8, mmxext)
PRED8x8L(top_dc, 8, ssse3)
PRED8x8L(dc, 8, mmxext)
PRED8x8L(dc, 8, ssse3)
PRED8x8L(horizontal, 8, mmxext)
PRED8x8L(horizontal, 8, ssse3)
PRED8x8L(vertical, 8, mmxext)
PRED8x8L(vertical, 8, ssse3)
PRED8x8L(down_left, 8, mmxext)
PRED8x8L(down_left, 8, sse2)
PRED8x8L(down_left, 8, ssse3)
PRED8x8L(down_right, 8, mmxext)
PRED8x8L(down_right, 8, sse2)
PRED8x8L(down_right, 8, ssse3)
PRED8x8L(vertical_right, 8, mmxext)
PRED8x8L(vertical_right, 8, sse2)
PRED8x8L(vertical_right, 8, ssse3)
PRED8x8L(vertical_left, 8, sse2)
PRED8x8L(vertical_left, 8, ssse3)
PRED8x8L(horizontal_up, 8, mmxext)
PRED8x8L(horizontal_up, 8, ssse3)
PRED8x8L(horizontal_down, 8, mmxext)
PRED8x8L(horizontal_down, 8, sse2)
PRED8x8L(horizontal_down, 8, ssse3)

PRED4x4(dc, 8, mmxext)
PRED4x4(down_left, 8, mmxext)
PRED4x4(down_right, 8, mmxext)
PRED4x4(vertical_left, 8, mmxext)
PRED4x4(vertical_right, 8, mmxext)
PRED4x4(horizontal_up, 8, mmxext)
PRED4x4(horizontal_down, 8, mmxext)
PRED4x4(tm_vp8, 8, mmx)
PRED4x4(tm_vp8, 8, mmxext)
PRED4x4(tm_vp8, 8, ssse3)
PRED4x4(vertical_vp8, 8, mmxext)

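/*
 * In the dispatcher below, "chroma_format_idc <= 1" selects 4:0:0 and 4:2:0,
 * the formats whose chroma prediction blocks are 8x8; 4:2:2 (8x16 chroma
 * blocks) and 4:4:4 are not covered by these pred8x8 pointers.
 */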
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                                   const int bit_depth,
                                   const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
        if (EXTERNAL_MMX(cpu_flags)) {
            h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
            h->pred16x16[HOR_PRED8x8  ] = ff_pred16x16_horizontal_8_mmx;
            if (chroma_format_idc <= 1) {
                h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
                h->pred8x8 [HOR_PRED8x8  ] = ff_pred8x8_horizontal_8_mmx;
            }
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
                h->pred8x8  [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
                h->pred4x4  [TM_VP8_PRED   ] = ff_pred4x4_tm_vp8_8_mmx;
            } else {
                if (chroma_format_idc <= 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    if (cpu_flags & AV_CPU_FLAG_CMOV)
                        h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
                }
            }
        }

        if (EXTERNAL_MMXEXT(cpu_flags)) {
            h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
            h->pred16x16[DC_PRED8x8  ] = ff_pred16x16_dc_8_mmxext;
            if (chroma_format_idc <= 1)
                h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
            h->pred8x8l [TOP_DC_PRED          ] = ff_pred8x8l_top_dc_8_mmxext;
            h->pred8x8l [DC_PRED              ] = ff_pred8x8l_dc_8_mmxext;
            h->pred8x8l [HOR_PRED             ] = ff_pred8x8l_horizontal_8_mmxext;
            h->pred8x8l [VERT_PRED            ] = ff_pred8x8l_vertical_8_mmxext;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
            h->pred8x8l [VERT_RIGHT_PRED      ] = ff_pred8x8l_vertical_right_8_mmxext;
            h->pred8x8l [HOR_UP_PRED          ] = ff_pred8x8l_horizontal_up_8_mmxext;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_mmxext;
            h->pred8x8l [HOR_DOWN_PRED        ] = ff_pred8x8l_horizontal_down_8_mmxext;
            h->pred4x4  [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
            h->pred4x4  [VERT_RIGHT_PRED      ] = ff_pred4x4_vertical_right_8_mmxext;
            h->pred4x4  [HOR_DOWN_PRED        ] = ff_pred4x4_horizontal_down_8_mmxext;
            h->pred4x4  [DC_PRED              ] = ff_pred4x4_dc_8_mmxext;
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 ||
                codec_id == AV_CODEC_ID_H264) {
                h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
            }
            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
                h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
            }
            if (codec_id != AV_CODEC_ID_RV40) {
                h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
            }
            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
                if (chroma_format_idc <= 1) {
                    h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
                    h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_8_mmxext;
                }
            }
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
                h->pred8x8  [DC_PRED8x8    ] = ff_pred8x8_dc_rv40_8_mmxext;
                h->pred8x8  [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
                h->pred4x4  [TM_VP8_PRED   ] = ff_pred4x4_tm_vp8_8_mmxext;
                h->pred4x4  [VERT_PRED     ] = ff_pred4x4_vertical_vp8_8_mmxext;
            } else {
                if (chroma_format_idc <= 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
                } else {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
                }
            }
        }

        if (EXTERNAL_SSE(cpu_flags)) {
            h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
        }

        if (EXTERNAL_SSE2(cpu_flags)) {
            h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_sse2;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
            h->pred8x8l [VERT_RIGHT_PRED      ] = ff_pred8x8l_vertical_right_8_sse2;
            h->pred8x8l [VERT_LEFT_PRED       ] = ff_pred8x8l_vertical_left_8_sse2;
            h->pred8x8l [HOR_DOWN_PRED        ] = ff_pred8x8l_horizontal_down_8_sse2;
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
                h->pred8x8  [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
            } else {
                if (chroma_format_idc <= 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
                }
            }
        }

        if (EXTERNAL_SSSE3(cpu_flags)) {
            h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
            h->pred16x16[DC_PRED8x8  ] = ff_pred16x16_dc_8_ssse3;
            if (chroma_format_idc <= 1)
                h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
            h->pred8x8l [TOP_DC_PRED          ] = ff_pred8x8l_top_dc_8_ssse3;
            h->pred8x8l [DC_PRED              ] = ff_pred8x8l_dc_8_ssse3;
            h->pred8x8l [HOR_PRED             ] = ff_pred8x8l_horizontal_8_ssse3;
            h->pred8x8l [VERT_PRED            ] = ff_pred8x8l_vertical_8_ssse3;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_ssse3;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
            h->pred8x8l [VERT_RIGHT_PRED      ] = ff_pred8x8l_vertical_right_8_ssse3;
            h->pred8x8l [VERT_LEFT_PRED       ] = ff_pred8x8l_vertical_left_8_ssse3;
            h->pred8x8l [HOR_UP_PRED          ] = ff_pred8x8l_horizontal_up_8_ssse3;
            h->pred8x8l [HOR_DOWN_PRED        ] = ff_pred8x8l_horizontal_down_8_ssse3;
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
                h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
                h->pred4x4 [TM_VP8_PRED   ] = ff_pred4x4_tm_vp8_8_ssse3;
            } else {
                if (chroma_format_idc <= 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
                }
            }
        }

        if (EXTERNAL_AVX2(cpu_flags)) {
            if (codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
            }
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext;
            h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext;

            if (chroma_format_idc <= 1)
                h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_mmxext;

            h->pred8x8l[DC_128_PRED        ] = ff_pred8x8l_128_dc_10_mmxext;

            h->pred16x16[DC_PRED8x8        ] = ff_pred16x16_dc_10_mmxext;
            h->pred16x16[TOP_DC_PRED8x8    ] = ff_pred16x16_top_dc_10_mmxext;
            h->pred16x16[DC_128_PRED8x8    ] = ff_pred16x16_128_dc_10_mmxext;
            h->pred16x16[LEFT_DC_PRED8x8   ] = ff_pred16x16_left_dc_10_mmxext;
            h->pred16x16[VERT_PRED8x8      ] = ff_pred16x16_vertical_10_mmxext;
            h->pred16x16[HOR_PRED8x8       ] = ff_pred16x16_horizontal_10_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_sse2;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_sse2;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_sse2;

            if (chroma_format_idc <= 1) {
                h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_sse2;
                h->pred8x8[TOP_DC_PRED8x8  ] = ff_pred8x8_top_dc_10_sse2;
                h->pred8x8[PLANE_PRED8x8   ] = ff_pred8x8_plane_10_sse2;
                h->pred8x8[VERT_PRED8x8    ] = ff_pred8x8_vertical_10_sse2;
                h->pred8x8[HOR_PRED8x8     ] = ff_pred8x8_horizontal_10_sse2;
            }

            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_sse2;
            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_sse2;
            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_sse2;
            h->pred8x8l[DC_128_PRED         ] = ff_pred8x8l_128_dc_10_sse2;
            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_sse2;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_sse2;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_sse2;

            h->pred16x16[DC_PRED8x8         ] = ff_pred16x16_dc_10_sse2;
            h->pred16x16[TOP_DC_PRED8x8     ] = ff_pred16x16_top_dc_10_sse2;
            h->pred16x16[DC_128_PRED8x8     ] = ff_pred16x16_128_dc_10_sse2;
            h->pred16x16[LEFT_DC_PRED8x8    ] = ff_pred16x16_left_dc_10_sse2;
            h->pred16x16[VERT_PRED8x8       ] = ff_pred16x16_vertical_10_sse2;
            h->pred16x16[HOR_PRED8x8        ] = ff_pred16x16_horizontal_10_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_ssse3;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_ssse3;

            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_ssse3;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_ssse3;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_ssse3;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_avx;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_avx;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_avx;

            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_avx;
            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_avx;
            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_avx;
            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_avx;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_avx;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_avx;
        }
    }
}
634
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/h264_qpel.c
vendored
Normal file
@@ -0,0 +1,634 @@
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 * Copyright (c) 2011 Daniel Kang
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dec.h"
#include "libavcodec/h264qpel.h"
#include "libavcodec/pixels.h"
#include "fpel.h"

#if HAVE_X86ASM
void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
#define ff_put_pixels8_l2_sse2  ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext   ff_put_pixels8_mmx
#define ff_put_pixels4_mmxext   ff_put_pixels4_mmx

#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);

DEF_QPEL(avg)
DEF_QPEL(put)

static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_mmxext(int16_t *tmp, const uint8_t *src, int tmpStride, int srcStride, int size)
{
    int w = (size + 8) >> 2;
    src -= 2 * srcStride + 2;
    while (w--) {
        ff_put_h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);
        tmp += 4;
        src += 4;
    }
}
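
/*
 * Background for the lowpass helpers: H.264 half-sample luma positions use
 * a separable 6-tap filter with coefficients (1, -5, 20, 20, -5, 1). The
 * "hv" pair applies it in two passes: hv1 filters vertically into a 16-bit
 * tmp buffer (hence the src -= 2*srcStride + 2 adjustment, which provides
 * the two-row/two-column margin the 6-tap support needs), and hv2 filters
 * that buffer horizontally and normalizes the result back to pixel range.
 */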

#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
        tmp += 4;\
        src += 4;\
    }\
    tmp -= 3*4;\
    ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
    src += 4;\
    dst += 4;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
        ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
        tmp += 8;\
        dst += 8;\
    }while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst, src, dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\


#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\

void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);

#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64

#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst, src, dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
                                                                 const uint8_t *src,
                                                                 int tmpStride,
                                                                 int srcStride,
                                                                 int size)
{
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
        tmp += 8;
        src += 8;
    }
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define ff_put_h264_qpel8_h_lowpass_l2_sse2  ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2  ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext

#define ff_put_h264_qpel8_v_lowpass_ssse3  ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3  ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2

#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext

#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext

#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    av_assert2(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

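/*
 * Naming: mcXY is the motion-compensation function for quarter-sample
 * offset (X/4, Y/4). mc00 is the plain copy, mc20/mc02 are the horizontal/
 * vertical half-sample positions, and the remaining quarter positions are
 * produced by averaging two intermediates via the *_l2 helpers (e.g. mc10
 * averages the full-sample plane with the horizontal half-sample plane).
 */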
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

QPEL_H264(put_, PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)

H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)


//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

#define LUMA_MC_816(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)

LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)

#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride){\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
    src += 8*stride;\
    dst += 8*stride;\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}

#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)

#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)

#if ARCH_X86_32 // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif

#endif /* HAVE_X86ASM */

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                              \
    do {                                                                         \
        c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
    } while (0)
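
/*
 * The tables are indexed by quarter-pel position: slot x + y*4 holds the
 * function for offset (x, y) in quarter samples, so slot 0 is the plain copy
 * and slot 10 (x=2, y=2) is the full 2D half-sample filter. A caller in the
 * motion-compensation loop then dispatches with something like:
 *
 *     // mx, my: quarter-sample components of the motion vector
 *     c->put_h264_qpel_pixels_tab[0][(mx & 3) + (my & 3) * 4](dst, src, stride);
 */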

av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
{
#if HAVE_X86ASM
    int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
        } else if (bit_depth == 10) {
#if ARCH_X86_32
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 1, sse2);
            H264_QPEL_FUNCS(0, 2, sse2);
            H264_QPEL_FUNCS(0, 3, sse2);
            H264_QPEL_FUNCS(1, 1, sse2);
            H264_QPEL_FUNCS(1, 2, sse2);
            H264_QPEL_FUNCS(1, 3, sse2);
            H264_QPEL_FUNCS(2, 1, sse2);
            H264_QPEL_FUNCS(2, 2, sse2);
            H264_QPEL_FUNCS(2, 3, sse2);
            H264_QPEL_FUNCS(3, 1, sse2);
            H264_QPEL_FUNCS(3, 2, sse2);
            H264_QPEL_FUNCS(3, 3, sse2);
        }

        if (bit_depth == 10) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
    }

    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(1, 0, ssse3);
            H264_QPEL_FUNCS(1, 1, ssse3);
            H264_QPEL_FUNCS(1, 2, ssse3);
            H264_QPEL_FUNCS(1, 3, ssse3);
            H264_QPEL_FUNCS(2, 0, ssse3);
            H264_QPEL_FUNCS(2, 1, ssse3);
            H264_QPEL_FUNCS(2, 2, ssse3);
            H264_QPEL_FUNCS(2, 3, ssse3);
            H264_QPEL_FUNCS(3, 0, ssse3);
            H264_QPEL_FUNCS(3, 1, ssse3);
            H264_QPEL_FUNCS(3, 2, ssse3);
            H264_QPEL_FUNCS(3, 3, ssse3);
        }

        if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
            H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
            H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
        }
    }

    if (EXTERNAL_AVX(cpu_flags)) {
        /* AVX implies 64 byte cache lines without the need to avoid unaligned
         * memory accesses that cross the boundary between two cache lines.
         * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
         * having to treat SSE2 functions with such properties as AVX. */
        if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }
    }
#endif
}
117
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/h264chroma_init.c
vendored
Normal file
@@ -0,0 +1,117 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264chroma.h"

void ff_put_h264_chroma_mc8_rnd_mmx   (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext    (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT)                              \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT  \
                                      (uint8_t *dst, uint8_t *src,  \
                                       ptrdiff_t stride, int h, int x, int y);
|
||||
|
||||
CHROMA_MC(put, 2, 10, mmxext)
|
||||
CHROMA_MC(avg, 2, 10, mmxext)
|
||||
CHROMA_MC(put, 4, 10, mmxext)
|
||||
CHROMA_MC(avg, 4, 10, mmxext)
|
||||
CHROMA_MC(put, 8, 10, sse2)
|
||||
CHROMA_MC(avg, 8, 10, sse2)
|
||||
CHROMA_MC(put, 8, 10, avx)
|
||||
CHROMA_MC(avg, 8, 10, avx)
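
/* Illustration (not part of the upstream file): each CHROMA_MC(OP, NUM,
 * DEPTH, OPT) line above expands to one prototype via token pasting. For
 * example, CHROMA_MC(put, 8, 10, sse2) declares
 *
 *     void ff_put_h264_chroma_mc8_10_sse2(uint8_t *dst, uint8_t *src,
 *                                         ptrdiff_t stride, int h, int x, int y);
 *
 * which is exactly the 10-bit SSE2 name stored into the function tables by
 * ff_h264chroma_init_x86() below. */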

av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
{
    int high_bit_depth = bit_depth > 8;
    int cpu_flags      = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }

    if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
    }

    if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
    }

    if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }

    if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
    }
}
448 trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/h264dsp_init.c vendored Normal file
@@ -0,0 +1,448 @@
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"

/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT)                                  \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                       int16_t *block,  \
                                                       int stride);

IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 8, sse2)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 8, sse2)
IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
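
/* Illustration (not part of the upstream file): the NUM argument is pasted
 * directly into the function name, so IDCT_ADD_FUNC(8_dc, 10, avx) declares
 *
 *     void ff_h264_idct8_dc_add_10_avx(uint8_t *dst, int16_t *block,
 *                                      int stride);
 *
 * i.e. the DC-only 8x8 IDCT-and-add for 10-bit content in its AVX variant. */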


#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT)                         \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT       \
    (uint8_t *dst, const int *block_offset,                             \
     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)


#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT)                        \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT       \
    (uint8_t **dst, const int *block_offset,                            \
     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)

IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)

IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)

void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);

/***********************************/
/* deblocking */

void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
                                         int8_t ref[2][40],
                                         int16_t mv[2][40][2],
                                         int bidir, int edges, int step,
                                         int mask_mv0, int mask_mv1, int field);

#define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               ptrdiff_t stride, \
                                                               int alpha,     \
                                                               int beta,      \
                                                               int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT)                                       \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               ptrdiff_t stride, \
                                                               int alpha,     \
                                                               int beta);

#define LF_FUNCS(type, depth)                   \
    LF_FUNC(h,  chroma,          depth, mmxext) \
    LF_IFUNC(h, chroma_intra,    depth, mmxext) \
    LF_FUNC(h,  chroma422,       depth, mmxext) \
    LF_IFUNC(h, chroma422_intra, depth, mmxext) \
    LF_FUNC(v,  chroma,          depth, mmxext) \
    LF_IFUNC(v, chroma_intra,    depth, mmxext) \
    LF_FUNC(h,  luma,            depth, mmxext) \
    LF_IFUNC(h, luma_intra,      depth, mmxext) \
    LF_FUNC(h,  luma,            depth, sse2)   \
    LF_IFUNC(h, luma_intra,      depth, sse2)   \
    LF_FUNC(v,  luma,            depth, sse2)   \
    LF_IFUNC(v, luma_intra,      depth, sse2)   \
    LF_FUNC(h,  chroma,          depth, sse2)   \
    LF_IFUNC(h, chroma_intra,    depth, sse2)   \
    LF_FUNC(h,  chroma422,       depth, sse2)   \
    LF_IFUNC(h, chroma422_intra, depth, sse2)   \
    LF_FUNC(v,  chroma,          depth, sse2)   \
    LF_IFUNC(v, chroma_intra,    depth, sse2)   \
    LF_FUNC(h,  luma,            depth, avx)    \
    LF_IFUNC(h, luma_intra,      depth, avx)    \
    LF_FUNC(v,  luma,            depth, avx)    \
    LF_IFUNC(v, luma_intra,      depth, avx)    \
    LF_FUNC(h,  chroma,          depth, avx)    \
    LF_IFUNC(h, chroma_intra,    depth, avx)    \
    LF_FUNC(h,  chroma422,       depth, avx)    \
    LF_IFUNC(h, chroma422_intra, depth, avx)    \
    LF_FUNC(v,  chroma,          depth, avx)    \
    LF_IFUNC(v, chroma_intra,    depth, avx)

LF_FUNC(h, luma_mbaff, 8, sse2)
LF_FUNC(h, luma_mbaff, 8, avx)

LF_FUNCS(uint8_t,  8)
LF_FUNCS(uint16_t, 10)

#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
                                    int beta, int8_t *tc0)
{
    if ((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
    if ((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
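
/* Note (not part of the upstream file): in the H.264 deblocker a tc0 entry
 * of -1 marks an edge where filtering is disabled. The bitwise AND of two
 * signed values is negative only when both sign bits are set, so
 * (tc0[0] & tc0[1]) >= 0 is a branch-cheap way of asking "does at least one
 * of these two 4-pixel edges need filtering?"; only then is the 8-wide asm
 * routine called for that half of the 16-pixel row. */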
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
                                          int alpha, int beta)
{
    ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */

LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)

/***********************************/
/* weighted prediction */

#define H264_WEIGHT(W, OPT)                                             \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride,   \
                                      int height, int log2_denom,       \
                                      int weight, int offset);

#define H264_BIWEIGHT(W, OPT)                                           \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
                                        ptrdiff_t stride, int height,   \
                                        int log2_denom, int weightd,    \
                                        int weights, int offset);

#define H264_BIWEIGHT_MMX(W)                    \
    H264_WEIGHT(W, mmxext)                      \
    H264_BIWEIGHT(W, mmxext)

#define H264_BIWEIGHT_MMX_SSE(W)                \
    H264_BIWEIGHT_MMX(W)                        \
    H264_WEIGHT(W, sse2)                        \
    H264_BIWEIGHT(W, sse2)                      \
    H264_BIWEIGHT(W, ssse3)

H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)
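
/* Illustration (not part of the upstream file): H264_BIWEIGHT_MMX_SSE(16)
 * expands to prototypes for ff_h264_weight_16_mmxext,
 * ff_h264_biweight_16_mmxext, ff_h264_weight_16_sse2,
 * ff_h264_biweight_16_sse2 and ff_h264_biweight_16_ssse3 -- the names that
 * the init function below stores into weight_h264_pixels_tab[] and
 * biweight_h264_pixels_tab[]. The 4-wide case only gets the mmxext pair. */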

#define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
                                                    ptrdiff_t stride,   \
                                                    int height,         \
                                                    int log2_denom,     \
                                                    int weight,         \
                                                    int offset);

#define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
                                                      uint8_t *src,     \
                                                      ptrdiff_t stride, \
                                                      int height,       \
                                                      int log2_denom,   \
                                                      int weightd,      \
                                                      int weights,      \
                                                      int offset);

#define H264_BIWEIGHT_10_SSE(W, DEPTH)          \
    H264_WEIGHT_10(W, DEPTH, sse2)              \
    H264_WEIGHT_10(W, DEPTH, sse4)              \
    H264_BIWEIGHT_10(W, DEPTH, sse2)            \
    H264_BIWEIGHT_10(W, DEPTH, sse4)

H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8,  10)
H264_BIWEIGHT_10_SSE(4,  10)

av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                                 const int chroma_format_idc)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;

    if (bit_depth == 8) {
        if (EXTERNAL_MMX(cpu_flags)) {
            c->h264_idct_dc_add  =
            c->h264_idct_add     = ff_h264_idct_add_8_mmx;
            c->h264_idct8_dc_add =
            c->h264_idct8_add    = ff_h264_idct8_add_8_mmx;

            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
            if (chroma_format_idc <= 1) {
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
            } else {
                c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
            }
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
            if (cpu_flags & AV_CPU_FLAG_CMOV)
                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
        }
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
            c->h264_idct_add16   = ff_h264_idct_add16_8_mmxext;
            c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmxext;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
            } else {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_mmxext;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
            }
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
            c->h264_v_loop_filter_luma       = deblock_v_luma_8_mmxext;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
            c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->h264_idct8_add = ff_h264_idct8_add_8_sse2;

            c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
            c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;

            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;

            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;

#if ARCH_X86_64
            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
#endif

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_sse2;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_sse2;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
            } else {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_sse2;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
            }

            c->h264_idct_add    = ff_h264_idct_add_8_sse2;
            c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#if ARCH_X86_64
            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
#endif

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_avx;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_avx;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
            } else {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
            }

            c->h264_idct_add    = ff_h264_idct_add_8_avx;
            c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
            } else {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
            }
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_mmxext;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_mmxext;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
            c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->h264_idct_add     = ff_h264_idct_add_10_sse2;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;

            c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
            if (chroma_format_idc <= 1) {
                c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
            } else {
                c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
            }
            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
            c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */

            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
            } else {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
            }
#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
        }
        if (EXTERNAL_SSE4(cpu_flags)) {
            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->h264_idct_dc_add  =
            c->h264_idct_add     = ff_h264_idct_add_10_avx;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;

            c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
            if (chroma_format_idc <= 1) {
                c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
            } else {
                c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
            }
            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
            c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
            } else {
                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
            }
#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_avx;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_avx;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
        }
    }
#endif
}
259 trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/hevcdsp.h vendored Normal file
@@ -0,0 +1,259 @@
/*
 * HEVC video decoder
 *
 * Copyright (C) 2012 - 2013 Guillaume Martres
 * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
 *
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_HEVCDSP_H
#define AVCODEC_X86_HEVCDSP_H

#include <stddef.h>
#include <stdint.h>


#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
dst[idx1][idx2][idx3]           = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
dst ## _bi[idx1][idx2][idx3]    = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
dst ## _uni[idx1][idx2][idx3]   = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
dst ## _bi_w[idx1][idx2][idx3]  = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt


#define PEL_PROTOTYPE(name, D, opt) \
void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
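
/* Illustration (not part of the upstream file): PEL_PROTOTYPE(pel_pixels8, 8, sse4)
 * declares the five variants of one motion-compensation primitive:
 *
 *     ff_hevc_put_hevc_pel_pixels8_8_sse4        (plain, to int16_t buffer)
 *     ff_hevc_put_hevc_bi_pel_pixels8_8_sse4     (bi-prediction)
 *     ff_hevc_put_hevc_uni_pel_pixels8_8_sse4    (uni-prediction)
 *     ff_hevc_put_hevc_uni_w_pel_pixels8_8_sse4  (weighted uni-prediction)
 *     ff_hevc_put_hevc_bi_w_pel_pixels8_8_sse4   (weighted bi-prediction)
 *
 * and PEL_LINK stores the same five names into the matching function tables
 * at one (idx1, idx2, idx3) slot, so the two macros are always used with the
 * same name/D/opt arguments. */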


///////////////////////////////////////////////////////////////////////////////
// MC functions
///////////////////////////////////////////////////////////////////////////////

#define EPEL_PROTOTYPES(fname, bitd, opt) \
        PEL_PROTOTYPE(fname##4,  bitd, opt); \
        PEL_PROTOTYPE(fname##6,  bitd, opt); \
        PEL_PROTOTYPE(fname##8,  bitd, opt); \
        PEL_PROTOTYPE(fname##12, bitd, opt); \
        PEL_PROTOTYPE(fname##16, bitd, opt); \
        PEL_PROTOTYPE(fname##24, bitd, opt); \
        PEL_PROTOTYPE(fname##32, bitd, opt); \
        PEL_PROTOTYPE(fname##48, bitd, opt); \
        PEL_PROTOTYPE(fname##64, bitd, opt)

#define QPEL_PROTOTYPES(fname, bitd, opt) \
        PEL_PROTOTYPE(fname##4,  bitd, opt); \
        PEL_PROTOTYPE(fname##8,  bitd, opt); \
        PEL_PROTOTYPE(fname##12, bitd, opt); \
        PEL_PROTOTYPE(fname##16, bitd, opt); \
        PEL_PROTOTYPE(fname##24, bitd, opt); \
        PEL_PROTOTYPE(fname##32, bitd, opt); \
        PEL_PROTOTYPE(fname##48, bitd, opt); \
        PEL_PROTOTYPE(fname##64, bitd, opt)

#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)

#define WEIGHTING_PROTOTYPES(bitd, opt) \
        WEIGHTING_PROTOTYPE(2,  bitd, opt); \
        WEIGHTING_PROTOTYPE(4,  bitd, opt); \
        WEIGHTING_PROTOTYPE(6,  bitd, opt); \
        WEIGHTING_PROTOTYPE(8,  bitd, opt); \
        WEIGHTING_PROTOTYPE(12, bitd, opt); \
        WEIGHTING_PROTOTYPE(16, bitd, opt); \
        WEIGHTING_PROTOTYPE(24, bitd, opt); \
        WEIGHTING_PROTOTYPE(32, bitd, opt); \
        WEIGHTING_PROTOTYPE(48, bitd, opt); \
        WEIGHTING_PROTOTYPE(64, bitd, opt)
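
/* Illustration (not part of the upstream file): WEIGHTING_PROTOTYPES(10, sse4)
 * declares a uni/bi weighting pair for every block width above; e.g. for
 * width 16 it yields
 *
 *     void ff_hevc_put_hevc_uni_w16_10_sse4(uint8_t *dst, ptrdiff_t dststride,
 *                                           int16_t *_src, int height,
 *                                           int denom, int _wx, int _ox);
 *     void ff_hevc_put_hevc_bi_w16_10_sse4(uint8_t *dst, ptrdiff_t dststride,
 *                                          int16_t *_src, int16_t *_src2,
 *                                          int height, int denom, int _wx0,
 *                                          int _wx1, int _ox0, int _ox1);
 */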


///////////////////////////////////////////////////////////////////////////////
// QPEL_PIXELS EPEL_PIXELS
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(pel_pixels,  8, sse4);
EPEL_PROTOTYPES(pel_pixels, 10, sse4);
EPEL_PROTOTYPES(pel_pixels, 12, sse4);

void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);

void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);



void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);  // used for 10bit
void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); // used for 10bit


void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);

void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);

///////////////////////////////////////////////////////////////////////////////
// EPEL
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(epel_h,  8, sse4);
EPEL_PROTOTYPES(epel_h, 10, sse4);
EPEL_PROTOTYPES(epel_h, 12, sse4);

EPEL_PROTOTYPES(epel_v,  8, sse4);
EPEL_PROTOTYPES(epel_v, 10, sse4);
EPEL_PROTOTYPES(epel_v, 12, sse4);

EPEL_PROTOTYPES(epel_hv,  8, sse4);
EPEL_PROTOTYPES(epel_hv, 10, sse4);
EPEL_PROTOTYPES(epel_hv, 12, sse4);

PEL_PROTOTYPE(epel_h16, 8, avx2);
PEL_PROTOTYPE(epel_h24, 8, avx2);
PEL_PROTOTYPE(epel_h32, 8, avx2);
PEL_PROTOTYPE(epel_h48, 8, avx2);
PEL_PROTOTYPE(epel_h64, 8, avx2);

PEL_PROTOTYPE(epel_h16, 10, avx2);
PEL_PROTOTYPE(epel_h24, 10, avx2);
PEL_PROTOTYPE(epel_h32, 10, avx2);
PEL_PROTOTYPE(epel_h48, 10, avx2);
PEL_PROTOTYPE(epel_h64, 10, avx2);

PEL_PROTOTYPE(epel_v16, 8, avx2);
PEL_PROTOTYPE(epel_v24, 8, avx2);
PEL_PROTOTYPE(epel_v32, 8, avx2);
PEL_PROTOTYPE(epel_v48, 8, avx2);
PEL_PROTOTYPE(epel_v64, 8, avx2);

PEL_PROTOTYPE(epel_v16, 10, avx2);
PEL_PROTOTYPE(epel_v24, 10, avx2);
PEL_PROTOTYPE(epel_v32, 10, avx2);
PEL_PROTOTYPE(epel_v48, 10, avx2);
PEL_PROTOTYPE(epel_v64, 10, avx2);

PEL_PROTOTYPE(epel_hv16, 8, avx2);
PEL_PROTOTYPE(epel_hv24, 8, avx2);
PEL_PROTOTYPE(epel_hv32, 8, avx2);
PEL_PROTOTYPE(epel_hv48, 8, avx2);
PEL_PROTOTYPE(epel_hv64, 8, avx2);

PEL_PROTOTYPE(epel_hv16, 10, avx2);
PEL_PROTOTYPE(epel_hv24, 10, avx2);
PEL_PROTOTYPE(epel_hv32, 10, avx2);
PEL_PROTOTYPE(epel_hv48, 10, avx2);
PEL_PROTOTYPE(epel_hv64, 10, avx2);

///////////////////////////////////////////////////////////////////////////////
// QPEL
///////////////////////////////////////////////////////////////////////////////
QPEL_PROTOTYPES(qpel_h,  8, sse4);
QPEL_PROTOTYPES(qpel_h, 10, sse4);
QPEL_PROTOTYPES(qpel_h, 12, sse4);

QPEL_PROTOTYPES(qpel_v,  8, sse4);
QPEL_PROTOTYPES(qpel_v, 10, sse4);
QPEL_PROTOTYPES(qpel_v, 12, sse4);

QPEL_PROTOTYPES(qpel_hv,  8, sse4);
QPEL_PROTOTYPES(qpel_hv, 10, sse4);
QPEL_PROTOTYPES(qpel_hv, 12, sse4);

PEL_PROTOTYPE(qpel_h16, 8, avx2);
PEL_PROTOTYPE(qpel_h24, 8, avx2);
PEL_PROTOTYPE(qpel_h32, 8, avx2);
PEL_PROTOTYPE(qpel_h48, 8, avx2);
PEL_PROTOTYPE(qpel_h64, 8, avx2);

PEL_PROTOTYPE(qpel_h16, 10, avx2);
PEL_PROTOTYPE(qpel_h24, 10, avx2);
PEL_PROTOTYPE(qpel_h32, 10, avx2);
PEL_PROTOTYPE(qpel_h48, 10, avx2);
PEL_PROTOTYPE(qpel_h64, 10, avx2);

PEL_PROTOTYPE(qpel_v16, 8, avx2);
PEL_PROTOTYPE(qpel_v24, 8, avx2);
PEL_PROTOTYPE(qpel_v32, 8, avx2);
PEL_PROTOTYPE(qpel_v48, 8, avx2);
PEL_PROTOTYPE(qpel_v64, 8, avx2);

PEL_PROTOTYPE(qpel_v16, 10, avx2);
PEL_PROTOTYPE(qpel_v24, 10, avx2);
PEL_PROTOTYPE(qpel_v32, 10, avx2);
PEL_PROTOTYPE(qpel_v48, 10, avx2);
PEL_PROTOTYPE(qpel_v64, 10, avx2);

PEL_PROTOTYPE(qpel_hv16, 8, avx2);
PEL_PROTOTYPE(qpel_hv24, 8, avx2);
PEL_PROTOTYPE(qpel_hv32, 8, avx2);
PEL_PROTOTYPE(qpel_hv48, 8, avx2);
PEL_PROTOTYPE(qpel_hv64, 8, avx2);

PEL_PROTOTYPE(qpel_hv16, 10, avx2);
PEL_PROTOTYPE(qpel_hv24, 10, avx2);
PEL_PROTOTYPE(qpel_hv32, 10, avx2);
PEL_PROTOTYPE(qpel_hv48, 10, avx2);
PEL_PROTOTYPE(qpel_hv64, 10, avx2);

WEIGHTING_PROTOTYPES(8,  sse4);
WEIGHTING_PROTOTYPES(10, sse4);
WEIGHTING_PROTOTYPES(12, sse4);

///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////

void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);

void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);

void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);

void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);

void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);

#endif // AVCODEC_X86_HEVCDSP_H
1151 trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/hevcdsp_init.c vendored Normal file
File diff suppressed because it is too large
57 trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/hpeldsp.h vendored Normal file
@@ -0,0 +1,57 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_HPELDSP_H
#define AVCODEC_X86_HPELDSP_H

#include <stddef.h>
#include <stdint.h>

#include "libavcodec/hpeldsp.h"

void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);

void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);

void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);

void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);

void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);

#endif /* AVCODEC_X86_HPELDSP_H */
313 trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/hpeldsp_init.c vendored Normal file
@@ -0,0 +1,313 @@
/*
 * SIMD-optimized halfpel functions
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#include "hpeldsp.h"

void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);

#define avg_pixels8_mmx         ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx      ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx        ff_avg_pixels16_mmx
#define avg_pixels8_xy2_mmx     ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx    ff_avg_pixels16_xy2_mmx
#define put_pixels8_mmx         ff_put_pixels8_mmx
#define put_pixels16_mmx        ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx     ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx    ff_put_pixels16_xy2_mmx
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
#define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx

#if HAVE_INLINE_ASM

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define STATIC static

#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef STATIC
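
/* Note (not part of the upstream file): with the definitions above, the two
 * template includes emit the no-rounding variants; e.g. DEF(put, pixels8_x2)
 * names the generated function put_no_rnd_pixels8_x2_mmx. MOVQ_WONE here and
 * MOVQ_WTWO below select the bias constant used by the PAVGB*_MMX emulation
 * macros, which is what distinguishes truncating from rounding averages. The
 * same templates are re-included below with the rounding definitions. */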

#if HAVE_MMX
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)

CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#endif

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "hpeldsp_rnd_template.c"

#undef DEF
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define STATIC

#include "rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

#if HAVE_MMX
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)

CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
#endif

#endif /* HAVE_INLINE_ASM */


#if HAVE_X86ASM

#define HPELDSP_AVG_PIXELS16(CPUEXT)                                            \
    CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_pixels16_y2        ## CPUEXT, ff_put_pixels8_y2        ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16           ## CPUEXT, ff_avg_pixels8           ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_x2        ## CPUEXT, ff_avg_pixels8_x2        ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_y2        ## CPUEXT, ff_avg_pixels8_y2        ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)

HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)
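
/* Note (not part of the upstream file): CALL_2X_PIXELS (from
 * libavcodec/pixels.h) synthesizes a 16-pixel-wide function from an 8-wide
 * one by calling it twice, at block/pixels and at block+8/pixels+8. So
 * HPELDSP_AVG_PIXELS16(_mmxext) builds e.g. avg_pixels16_x2_mmxext on top
 * of the assembly routine ff_avg_pixels8_x2_mmxext. */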

#endif /* HAVE_X86ASM */

#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
    if (HAVE_MMX_EXTERNAL)                                                  \
        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;

#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
#else
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
    } while (0)
#endif
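
/* Illustration (not part of the upstream file): when inline MMX is
 * available, SET_HPEL_FUNCS(put, [0], 16, mmx) expands to roughly
 *
 *     if (HAVE_MMX_EXTERNAL)
 *         c->put_pixels_tab[0][0] = put_pixels16_mmx;
 *     c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
 *     c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
 *     c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
 *
 * i.e. it fills one row of the dispatch table with the full-pel, x2, y2 and
 * xy2 halfpel cases. */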

static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
    SET_HPEL_FUNCS(put,        [0], 16, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
    SET_HPEL_FUNCS(put,        [1],  8, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
    if (HAVE_MMX_EXTERNAL) {
        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
    }
#if HAVE_MMX_INLINE
    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
#endif
}

static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
    c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;

    c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;

    if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
    c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

    c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;

    if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
    }
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}

static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
{
#if HAVE_SSE2_EXTERNAL
    c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
    c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
    c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
    c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_sse2;
    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
    c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
    c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
    c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
{
#if HAVE_SSSE3_EXTERNAL
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
#endif
}

av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags))
        hpeldsp_init_mmx(c, flags);

    if (EXTERNAL_AMD3DNOW(cpu_flags))
        hpeldsp_init_3dnow(c, flags);

    if (EXTERNAL_MMXEXT(cpu_flags))
        hpeldsp_init_mmxext(c, flags);

    if (EXTERNAL_SSE2_FAST(cpu_flags))
        hpeldsp_init_sse2_fast(c, flags);

    if (EXTERNAL_SSSE3(cpu_flags))
        hpeldsp_init_ssse3(c, flags);

    if (CONFIG_VP3_DECODER)
        ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
}
202 trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/hpeldsp_rnd_template.c vendored Normal file
@@ -0,0 +1,202 @@
/*
 * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

// put_pixels
av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea   (%3, %3), %%"FF_REG_a"   \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq  (%1), %%mm0              \n\t"
        "movq  1(%1), %%mm1             \n\t"
        "movq  (%1, %3), %%mm2          \n\t"
        "movq  1(%1, %3), %%mm3         \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq  %%mm4, (%2)              \n\t"
        "movq  %%mm5, (%2, %3)          \n\t"
        "add   %%"FF_REG_a", %1         \n\t"
        "add   %%"FF_REG_a", %2         \n\t"
        "movq  (%1), %%mm0              \n\t"
        "movq  1(%1), %%mm1             \n\t"
        "movq  (%1, %3), %%mm2          \n\t"
        "movq  1(%1, %3), %%mm3         \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq  %%mm4, (%2)              \n\t"
        "movq  %%mm5, (%2, %3)          \n\t"
        "add   %%"FF_REG_a", %1         \n\t"
        "add   %%"FF_REG_a", %2         \n\t"
        "subl  $4, %0                   \n\t"
        "jnz   1b                       \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :FF_REG_a, "memory");
}
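
/* Note (not part of the upstream file): the loop above is unrolled twice, so
 * each iteration produces four output rows. FF_REG_a holds 2 * line_size,
 * the source (%1) and destination (%2) pointers advance by two rows per
 * unrolled half, and "subl $4, %0" consumes four rows of h before looping.
 * Each PAVGBP averages two row pairs at once using the constant that
 * MOVQ_BFE loaded into mm6 for the byte-average emulation. */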
|
||||
|
||||
av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"FF_REG_a" \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm1 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
"movq 1(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"movq 8(%1), %%mm0 \n\t"
|
||||
"movq 9(%1), %%mm1 \n\t"
|
||||
"movq 8(%1, %3), %%mm2 \n\t"
|
||||
"movq 9(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, 8(%2) \n\t"
|
||||
"movq %%mm5, 8(%2, %3) \n\t"
|
||||
"add %%"FF_REG_a", %1 \n\t"
|
||||
"add %%"FF_REG_a", %2 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm1 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
"movq 1(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"movq 8(%1), %%mm0 \n\t"
|
||||
"movq 9(%1), %%mm1 \n\t"
|
||||
"movq 8(%1, %3), %%mm2 \n\t"
|
||||
"movq 9(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, 8(%2) \n\t"
|
||||
"movq %%mm5, 8(%2, %3) \n\t"
|
||||
"add %%"FF_REG_a", %1 \n\t"
|
||||
"add %%"FF_REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((x86_reg)line_size)
|
||||
:FF_REG_a, "memory");
|
||||
}
|
||||
|
||||
av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"FF_REG_a" \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"FF_REG_a"),%%mm2\n\t"
|
||||
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"add %%"FF_REG_a", %1 \n\t"
|
||||
"add %%"FF_REG_a", %2 \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"FF_REG_a"),%%mm0\n\t"
|
||||
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"add %%"FF_REG_a", %1 \n\t"
|
||||
"add %%"FF_REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((x86_reg)line_size)
|
||||
:FF_REG_a, "memory");
|
||||
}
|
||||
|
||||
av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm1 \n\t"
|
||||
"movq (%2), %%mm3 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq 8(%1), %%mm0 \n\t"
|
||||
"movq 9(%1), %%mm1 \n\t"
|
||||
"movq 8(%2), %%mm3 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||
"movq %%mm0, 8(%2) \n\t"
|
||||
"add %3, %1 \n\t"
|
||||
"add %3, %2 \n\t"
|
||||
"subl $1, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((x86_reg)line_size)
|
||||
:"memory");
|
||||
}
|
||||
|
||||
av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"FF_REG_a" \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
|
||||
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
||||
"movq (%2), %%mm3 \n\t"
|
||||
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
|
||||
"movq (%2, %3), %%mm3 \n\t"
|
||||
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq %%mm1, (%2, %3) \n\t"
|
||||
"add %%"FF_REG_a", %1 \n\t"
|
||||
"add %%"FF_REG_a", %2 \n\t"
|
||||
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
|
||||
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
||||
"movq (%2), %%mm3 \n\t"
|
||||
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
|
||||
"movq (%2, %3), %%mm3 \n\t"
|
||||
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
|
||||
"movq %%mm2, (%2) \n\t"
|
||||
"movq %%mm1, (%2, %3) \n\t"
|
||||
"add %%"FF_REG_a", %1 \n\t"
|
||||
"add %%"FF_REG_a", %2 \n\t"
|
||||
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((x86_reg)line_size)
|
||||
:FF_REG_a, "memory");
|
||||
}

56
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/hpeldsp_vp3_init.c
vendored
Normal file
@@ -0,0 +1,56 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"

#include "hpeldsp.h"

void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
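/* VP3/Theora needs bit-exact "no rounding" half-pel averages, so these
 * overrides are only installed when the caller requested
 * AV_CODEC_FLAG_BITEXACT; the MMXEXT entries are assigned after the 3DNow!
 * ones and therefore win when a CPU reports both flags. */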
av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
{
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        if (flags & AV_CODEC_FLAG_BITEXACT) {
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
        }
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        if (flags & AV_CODEC_FLAG_BITEXACT) {
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
        }
    }
}

61
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/huffyuvdsp_init.c
vendored
Normal file
@@ -0,0 +1,61 @@
/*
 * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"

void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_avx2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);

void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
                                     intptr_t w, uint8_t *left);
void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
                                      intptr_t w, uint8_t *left);
void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
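/* Dispatch note: later assignments deliberately overwrite earlier ones, so
 * the fastest ISA the CPU reports ends up in the context; the plain MMX
 * fallbacks are only ever installed on 32-bit builds. */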
av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
{
    int cpu_flags = av_get_cpu_flags();
    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
        c->add_int16 = ff_add_int16_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
        c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->add_int16 = ff_add_int16_sse2;
        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->add_int16 = ff_add_int16_avx2;
    }
}

60
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/huffyuvencdsp_init.c
vendored
Normal file
@@ -0,0 +1,60 @@
/*
 * SIMD-optimized HuffYUV encoding functions
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvencdsp.h"

void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                        unsigned mask, int w);
void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                        unsigned mask, int w);
void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                        unsigned mask, int w);
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                                          unsigned mask, int w, int *left, int *left_top);

av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
{
    av_unused int cpu_flags = av_get_cpu_flags();
    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->diff_int16 = ff_diff_int16_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->diff_int16 = ff_diff_int16_sse2;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->diff_int16 = ff_diff_int16_avx2;
    }
}

39
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/idctdsp.h
vendored
Normal file
@@ -0,0 +1,39 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_IDCTDSP_H
#define AVCODEC_X86_IDCTDSP_H

#include <stddef.h>
#include <stdint.h>

void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               ptrdiff_t line_size);
void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
                                ptrdiff_t line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               ptrdiff_t line_size);
void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
                                ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
                                       ptrdiff_t line_size);

#endif /* AVCODEC_X86_IDCTDSP_H */

162
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/idctdsp_init.c
vendored
Normal file
@@ -0,0 +1,162 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idctdsp.h"
#include "simple_idct.h"

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
                                              enum idct_permutation_type perm_type)
{
    int i;

    switch (perm_type) {
    case FF_IDCT_PERM_SIMPLE:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = simple_mmx_permutation[i];
        return 1;
    case FF_IDCT_PERM_SSE2:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
        return 1;
    }

    return 0;
}

av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

        if (!high_bit_depth &&
            avctx->lowres == 0 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
            c->idct_put  = ff_simple_idct_put_mmx;
            c->idct_add  = ff_simple_idct_add_mmx;
            c->idct      = ff_simple_idct_mmx;
            c->perm_type = FF_IDCT_PERM_SIMPLE;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;

        if (!high_bit_depth &&
            avctx->lowres == 0 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
            c->idct_put  = ff_simple_idct_put_sse2;
            c->idct_add  = ff_simple_idct_add_sse2;
            c->perm_type = FF_IDCT_PERM_SIMPLE;
        }

        if (ARCH_X86_64 &&
            !high_bit_depth &&
            avctx->lowres == 0 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
             avctx->idct_algo == FF_IDCT_SIMPLE)) {
            c->idct      = ff_simple_idct8_sse2;
            c->idct_put  = ff_simple_idct8_put_sse2;
            c->idct_add  = ff_simple_idct8_add_sse2;
            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
        }
    }

    if (ARCH_X86_64 && avctx->lowres == 0) {
        if (EXTERNAL_AVX(cpu_flags) &&
            !high_bit_depth &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
             avctx->idct_algo == FF_IDCT_SIMPLE)) {
            c->idct      = ff_simple_idct8_avx;
            c->idct_put  = ff_simple_idct8_put_avx;
            c->idct_add  = ff_simple_idct8_add_avx;
            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
        }

        if (avctx->bits_per_raw_sample == 10 &&
            avctx->codec_id != AV_CODEC_ID_MPEG4 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLE)) {
            if (EXTERNAL_SSE2(cpu_flags)) {
                c->idct_put  = ff_simple_idct10_put_sse2;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct10_sse2;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
            if (EXTERNAL_AVX(cpu_flags)) {
                c->idct_put  = ff_simple_idct10_put_avx;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct10_avx;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
        }

        if (avctx->bits_per_raw_sample == 12 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
            if (EXTERNAL_SSE2(cpu_flags)) {
                c->idct_put  = ff_simple_idct12_put_sse2;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct12_sse2;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
            if (EXTERNAL_AVX(cpu_flags)) {
                c->idct_put  = ff_simple_idct12_put_avx;
                c->idct_add  = NULL;
                c->idct      = ff_simple_idct12_avx;
                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
            }
        }
    }
}

100
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/inline_asm.h
vendored
Normal file
@@ -0,0 +1,100 @@
/*
 * inline assembly helper macros
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_INLINE_ASM_H
#define AVCODEC_X86_INLINE_ASM_H

#include "constants.h"

#define MOVQ_WONE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd ::)

#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "paddb %%"#regd", %%"#regd" \n\t" ::)

#ifndef PIC
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "psllw $1, %%"#regd" \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "paddb "#regb", "#regr" \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t"
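/* Both macros compute a bytewise average without unpacking to 16 bits,
 * using avg_down(a,b) = (a & b) + ((a ^ b) >> 1) for the truncating variant
 * and avg_up(a,b) = (a | b) - ((a ^ b) >> 1) for the rounding-up variant;
 * the 0xfe mask (loaded by MOVQ_BFE above) clears the bits that psrlq would
 * otherwise shift across byte-lane boundaries. */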

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pand "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "paddb "#regb", "#regr" \n\t" \
    "paddb "#regd", "#regp" \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "por "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t" \
    "psubb "#regd", "#regp" \n\t"

#endif /* AVCODEC_X86_INLINE_ASM_H */

60
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/jpeg2000dsp_init.c
vendored
Normal file
@@ -0,0 +1,60 @@
/*
 * SIMD optimized JPEG 2000 DSP functions
 * Copyright (c) 2015 James Almer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/jpeg2000dsp.h"

void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma3(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma4(void *src0, void *src1, void *src2, int csize);
void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
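/* FF_DWT97 is the irreversible (lossy) 9/7 path, whose inverse colour
 * transform (ICT) runs on floats, while FF_DWT53 is the reversible integer
 * 5/3 path (RCT); hence float SSE/AVX/FMA kernels for the former and
 * integer SSE2/AVX2 kernels for the latter. */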
av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();
    if (EXTERNAL_SSE(cpu_flags)) {
        c->mct_decode[FF_DWT97] = ff_ict_float_sse;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
    }

    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        c->mct_decode[FF_DWT97] = ff_ict_float_avx;
    }

    if (EXTERNAL_FMA4(cpu_flags)) {
        c->mct_decode[FF_DWT97] = ff_ict_float_fma4;
    }

    if (EXTERNAL_FMA3_FAST(cpu_flags)) {
        c->mct_decode[FF_DWT97] = ff_ict_float_fma3;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
    }
}

56
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/lossless_audiodsp_init.c
vendored
Normal file
@@ -0,0 +1,56 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lossless_audiodsp.h"

int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);

av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMXEXT(cpu_flags))
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (EXTERNAL_SSE2(cpu_flags))
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

    if (EXTERNAL_SSSE3(cpu_flags) &&
        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;

    if (EXTERNAL_SSE4(cpu_flags))
        c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
#endif
}

128
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/lossless_videodsp_init.c
vendored
Normal file
@@ -0,0 +1,128 @@
/*
 * Lossless video DSP utils
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/x86/asm.h"
#include "../lossless_videodsp.h"
#include "libavutil/x86/cpu.h"

void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_avx2(uint8_t *dst, uint8_t *src, ptrdiff_t w);

void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
                               const uint8_t *diff, ptrdiff_t w,
                               int *left, int *left_top);
void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
                             const uint8_t *diff, ptrdiff_t w,
                             int *left, int *left_top);

int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
                           ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t w, int left);

int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);

void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);

#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
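/* Branch-free median prediction: each output byte is
 * diff + mid_pred(left, top, left + top - top_left). The cmp/cmovg/cmovl
 * chain below evaluates the median without any jump inside the loop body,
 * which is why this variant is gated on AV_CPU_FLAG_CMOV. */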
static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                 const uint8_t *diff, ptrdiff_t w,
                                 int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov %7, %3 \n"
        "1: \n"
        "movzbl (%3, %4), %2 \n"
        "mov %2, %k3 \n"
        "sub %b1, %b3 \n"
        "add %b0, %b3 \n"
        "mov %2, %1 \n"
        "cmp %0, %2 \n"
        "cmovg %0, %2 \n"
        "cmovg %1, %0 \n"
        "cmp %k3, %0 \n"
        "cmovg %k3, %0 \n"
        "mov %7, %3 \n"
        "cmp %2, %0 \n"
        "cmovl %2, %0 \n"
        "add (%6, %4), %b0 \n"
        "mov %b0, (%5, %4) \n"
        "inc %4 \n"
        "jl 1b \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
    );
    *left     = l;
    *left_top = tl;
}
#endif

void ff_llviddsp_init_x86(LLVidDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
    if (cpu_flags & AV_CPU_FLAG_CMOV)
        c->add_median_pred = add_median_pred_cmov;
#endif

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->add_bytes = ff_add_bytes_mmx;
    }

    if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
        /* slower than cmov version on AMD */
        if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
            c->add_median_pred = ff_add_median_pred_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->add_bytes       = ff_add_bytes_sse2;
        c->add_median_pred = ff_add_median_pred_sse2;
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->add_left_pred       = ff_add_left_pred_ssse3;
        c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
    }

    if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
        c->add_left_pred       = ff_add_left_pred_unaligned_ssse3;
        c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->add_bytes         = ff_add_bytes_avx2;
        c->add_left_pred     = ff_add_left_pred_unaligned_avx2;
        c->add_gradient_pred = ff_add_gradient_pred_avx2;
    }
}

111
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/lossless_videoencdsp_init.c
vendored
Normal file
@@ -0,0 +1,111 @@
/*
 * SIMD-optimized lossless video encoding functions
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lossless_videoencdsp.h"
#include "libavcodec/mathops.h"

void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                       intptr_t w);
void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                        intptr_t w);
void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                        intptr_t w);

void ff_sub_left_predict_avx(uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, ptrdiff_t width, int height);

#if HAVE_INLINE_ASM
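/* Encoder-side median predictor: emits dst = src2 - mid_pred(L, T, L+T-LT)
 * eight pixels at a time, computing the clamp with pmaxub/pminub instead of
 * compares. The first output byte depends on the caller-supplied
 * left/left_top state, so it is recomputed in scalar code after the loop. */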
static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
                                   const uint8_t *src2, intptr_t w,
                                   int *left, int *left_top)
{
    x86_reg i = 0;
    uint8_t l, lt;

    __asm__ volatile (
        "movq  (%1, %0), %%mm0 \n\t" // LT
        "psllq $8, %%mm0 \n\t"
        "1: \n\t"
        "movq  (%1, %0), %%mm1 \n\t" // T
        "movq -1(%2, %0), %%mm2 \n\t" // L
        "movq  (%2, %0), %%mm3 \n\t" // X
        "movq %%mm2, %%mm4 \n\t" // L
        "psubb %%mm0, %%mm2 \n\t"
        "paddb %%mm1, %%mm2 \n\t" // L + T - LT
        "movq %%mm4, %%mm5 \n\t" // L
        "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
        "pminub %%mm5, %%mm1 \n\t" // min(T, L)
        "pminub %%mm2, %%mm4 \n\t"
        "pmaxub %%mm1, %%mm4 \n\t"
        "psubb %%mm4, %%mm3 \n\t" // dst - pred
        "movq %%mm3, (%3, %0) \n\t"
        "add $8, %0 \n\t"
        "movq -1(%1, %0), %%mm0 \n\t" // LT
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));

    l  = *left;
    lt = *left_top;

    dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);

    *left_top = src1[w - 1];
    *left     = src2[w - 1];
}

#endif /* HAVE_INLINE_ASM */

av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c)
{
    av_unused int cpu_flags = av_get_cpu_flags();

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->diff_bytes = ff_diff_bytes_mmx;
    }

#if HAVE_INLINE_ASM
    if (INLINE_MMXEXT(cpu_flags)) {
        c->sub_median_pred = sub_median_pred_mmxext;
    }
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->diff_bytes = ff_diff_bytes_sse2;
    }

    if (EXTERNAL_AVX(cpu_flags)) {
        c->sub_left_predict = ff_sub_left_predict_avx;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->diff_bytes = ff_diff_bytes_avx2;
    }
}

162
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/lpc.c
vendored
Normal file
@@ -0,0 +1,162 @@
/*
 * SIMD-optimized LPC functions
 * Copyright (c) 2007 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lpc.h"

DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 };

#if HAVE_SSE2_INLINE
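/* Applies a Welch window, w[n] = 1 - x^2 with x stepping linearly from -1
 * to 1 across the frame, while converting the int32 samples to doubles.
 * The loop walks inwards from both ends so each iteration windows one pair
 * of samples; "test $1" routes odd frame lengths to the unaligned-store
 * variant. */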
static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
                                        double *w_data)
{
    double c = 2.0 / (len-1.0);
    int n2 = len>>1;
    x86_reg i = -n2*sizeof(int32_t);
    x86_reg j =  n2*sizeof(int32_t);
    __asm__ volatile(
        "movsd %4, %%xmm7 \n\t"
        "movapd "MANGLE(pd_1)", %%xmm6 \n\t"
        "movapd "MANGLE(pd_2)", %%xmm5 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "subpd %%xmm5, %%xmm7 \n\t"
        "addsd %%xmm6, %%xmm7 \n\t"
        "test $1, %5 \n\t"
        "jz 2f \n\t"
#define WELCH(MOVPD, offset)\
        "1: \n\t"\
        "movapd %%xmm7, %%xmm1 \n\t"\
        "mulpd %%xmm1, %%xmm1 \n\t"\
        "movapd %%xmm6, %%xmm0 \n\t"\
        "subpd %%xmm1, %%xmm0 \n\t"\
        "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
        "cvtpi2pd (%3,%0), %%xmm2 \n\t"\
        "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
        "mulpd %%xmm0, %%xmm2 \n\t"\
        "mulpd %%xmm1, %%xmm3 \n\t"\
        "movapd %%xmm2, (%2,%0,2) \n\t"\
        MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
        "subpd %%xmm5, %%xmm7 \n\t"\
        "sub $8, %1 \n\t"\
        "add $8, %0 \n\t"\
        "jl 1b \n\t"\

        WELCH("movupd", -1)
        "jmp 3f \n\t"
        "2: \n\t"
        WELCH("movapd", -2)
        "3: \n\t"
        :"+&r"(i), "+&r"(j)
        :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
        NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
        XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                          "%xmm5", "%xmm6", "%xmm7")
    );
#undef WELCH
}

static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                                      double *autoc)
{
    int j;

    if((x86_reg)data & 15)
        data++;

    for(j=0; j<lag; j+=2){
        x86_reg i = -len*sizeof(double);
        if(j == lag-2) {
            __asm__ volatile(
                "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
                "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
                "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
                "1: \n\t"
                "movapd (%2,%0), %%xmm3 \n\t"
                "movupd -8(%3,%0), %%xmm4 \n\t"
                "movapd (%3,%0), %%xmm5 \n\t"
                "mulpd %%xmm3, %%xmm4 \n\t"
                "mulpd %%xmm3, %%xmm5 \n\t"
                "mulpd -16(%3,%0), %%xmm3 \n\t"
                "addpd %%xmm4, %%xmm1 \n\t"
                "addpd %%xmm5, %%xmm0 \n\t"
                "addpd %%xmm3, %%xmm2 \n\t"
                "add $16, %0 \n\t"
                "jl 1b \n\t"
                "movhlps %%xmm0, %%xmm3 \n\t"
                "movhlps %%xmm1, %%xmm4 \n\t"
                "movhlps %%xmm2, %%xmm5 \n\t"
                "addsd %%xmm3, %%xmm0 \n\t"
                "addsd %%xmm4, %%xmm1 \n\t"
                "addsd %%xmm5, %%xmm2 \n\t"
                "movsd %%xmm0, (%1) \n\t"
                "movsd %%xmm1, 8(%1) \n\t"
                "movsd %%xmm2, 16(%1) \n\t"
                :"+&r"(i)
                :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
                NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
                :"memory"
            );
        } else {
            __asm__ volatile(
                "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
                "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
                "1: \n\t"
                "movapd (%3,%0), %%xmm3 \n\t"
                "movupd -8(%4,%0), %%xmm4 \n\t"
                "mulpd %%xmm3, %%xmm4 \n\t"
                "mulpd (%4,%0), %%xmm3 \n\t"
                "addpd %%xmm4, %%xmm1 \n\t"
                "addpd %%xmm3, %%xmm0 \n\t"
                "add $16, %0 \n\t"
                "jl 1b \n\t"
                "movhlps %%xmm0, %%xmm3 \n\t"
                "movhlps %%xmm1, %%xmm4 \n\t"
                "addsd %%xmm3, %%xmm0 \n\t"
                "addsd %%xmm4, %%xmm1 \n\t"
                "movsd %%xmm0, %1 \n\t"
                "movsd %%xmm1, %2 \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                :"r"(data+len), "r"(data+len-j)
                NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
            );
        }
    }
}

#endif /* HAVE_SSE2_INLINE */

av_cold void ff_lpc_init_x86(LPCContext *c)
{
#if HAVE_SSE2_INLINE
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_SSE2(cpu_flags) || INLINE_SSE2_SLOW(cpu_flags)) {
        c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
        c->lpc_compute_autocorr   = lpc_compute_autocorr_sse2;
    }
#endif /* HAVE_SSE2_INLINE */
}

133
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mathops.h
vendored
Normal file
@@ -0,0 +1,133 @@
/*
 * simple math operations
 * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_MATHOPS_H
#define AVCODEC_X86_MATHOPS_H

#include "config.h"

#include "libavutil/common.h"
#include "libavutil/x86/asm.h"

#if HAVE_INLINE_ASM

#if ARCH_X86_32
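/* MULL computes (a * b) >> shift without losing precision: imull leaves the
 * full 64-bit product in edx:eax, and shrdl shifts eax right while pulling
 * the high bits in from edx. MULH and MUL64 below expose the high half and
 * the full product of the same widening multiply. */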
#define MULL MULL
static av_always_inline av_const int MULL(int a, int b, unsigned shift)
{
    int rt, dummy;
    __asm__ (
        "imull %3               \n\t"
        "shrdl %4, %%edx, %%eax \n\t"
        :"=a"(rt), "=d"(dummy)
        :"a"(a), "rm"(b), "ci"((uint8_t)shift)
    );
    return rt;
}

#define MULH MULH
static av_always_inline av_const int MULH(int a, int b)
{
    int rt, dummy;
    __asm__ (
        "imull %3"
        :"=d"(rt), "=a"(dummy)
        :"a"(a), "rm"(b)
    );
    return rt;
}

#define MUL64 MUL64
static av_always_inline av_const int64_t MUL64(int a, int b)
{
    int64_t rt;
    __asm__ (
        "imull %2"
        :"=A"(rt)
        :"a"(a), "rm"(b)
    );
    return rt;
}

#endif /* ARCH_X86_32 */

#if HAVE_I686
/* median of 3 */
#define mid_pred mid_pred
static inline av_const int mid_pred(int a, int b, int c)
{
    int i=b;
    __asm__ (
        "cmp    %2, %1 \n\t"
        "cmovg  %1, %0 \n\t"
        "cmovg  %2, %1 \n\t"
        "cmp    %3, %1 \n\t"
        "cmovl  %3, %1 \n\t"
        "cmp    %1, %0 \n\t"
        "cmovg  %1, %0 \n\t"
        :"+&r"(i), "+&r"(a)
        :"r"(b), "r"(c)
    );
    return i;
}

#if HAVE_6REGS
#define COPY3_IF_LT(x, y, a, b, c, d)\
__asm__ volatile(\
    "cmpl  %0, %3       \n\t"\
    "cmovl %3, %0       \n\t"\
    "cmovl %4, %1       \n\t"\
    "cmovl %5, %2       \n\t"\
    : "+&r" (x), "+&r" (a), "+r" (c)\
    : "r" (y), "r" (b), "r" (d)\
);
#endif /* HAVE_6REGS */

#endif /* HAVE_I686 */

#define MASK_ABS(mask, level)                   \
    __asm__ ("cdq                    \n\t"      \
             "xorl %1, %0            \n\t"      \
             "subl %1, %0            \n\t"      \
             : "+a"(level), "=&d"(mask))

// avoid +32 for shift optimization (gcc should do that ...)
#define NEG_SSR32 NEG_SSR32
static inline  int32_t NEG_SSR32( int32_t a, int8_t s){
    __asm__ ("sarl %1, %0\n\t"
         : "+r" (a)
         : "ic" ((uint8_t)(-s))
    );
    return a;
}

#define NEG_USR32 NEG_USR32
static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
    __asm__ ("shrl %1, %0\n\t"
         : "+r" (a)
         : "ic" ((uint8_t)(-s))
    );
    return a;
}

#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_MATHOPS_H */

221
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mdct15.asm
vendored
Normal file
@@ -0,0 +1,221 @@
;******************************************************************************
;* SIMD optimized non-power-of-two MDCT functions
;*
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
sign_adjust_r: times 4 dd 0x80000000, 0x00000000

sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000

SECTION .text

%if ARCH_X86_64

;*****************************************************************************************
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
;*****************************************************************************************
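; Sketch of the decomposition: the 15-point FFT is done as 3 x 5. FFT5 runs a
; 5-point transform over inputs strided by three complex values (0,3,6,9,12
; plus in_offset), and the BUTTERFLIES_DC/_AC macros then recombine the three
; 5-point results with twiddles from exptab, writing one DC and four AC
; outputs per invocation.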
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
    VBROADCASTSD m0, [inq + %1]         ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
    movsd  xm1, [inq + 1*16 + 8 + %1]   ; in[ 3].re, in[ 3].im, 0, 0
    movsd  xm4, [inq + 6*16 + 0 + %1]   ; in[12].re, in[12].im, 0, 0
    movhps xm1, [inq + 3*16 + 0 + %1]   ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
    movhps xm4, [inq + 4*16 + 8 + %1]   ; in[12].re, in[12].im, in[ 9].re, in[ 9].im

    subps  xm2, xm1, xm4                ; t[2].im, t[2].re, t[3].im, t[3].re
    addps  xm1, xm4                     ; t[0].re, t[0].im, t[1].re, t[1].im

    movhlps %2, xm1                     ; t[0].re, t[1].re, t[0].im, t[1].im
    addps   %2, xm1
    addps   %2, xm0                     ; DC[0].re, DC[0].im, junk...
    movlhps %2, %2                      ; DC[0].re, DC[0].im, DC[0].re, DC[0].im

    shufps xm3, xm1, xm2, q0110         ; t[0].re, t[0].im, t[2].re, t[2].im
    shufps xm1, xm2, q2332              ; t[1].re, t[1].im, t[3].re, t[3].im

    mulps  xm%3, xm1, xm5
    mulps  xm4, xm3, xm6
    mulps  xm1, xm6

    xorps  xm1, xm7
    mulps  xm3, xm5
    addsubps xm3, xm1                   ; t[0].re, t[0].im, t[2].re, t[2].im
    subps  xm%3, xm4                    ; t[4].re, t[4].im, t[5].re, t[5].im

    movhlps xm2, xm%3, xm3              ; t[2].re, t[2].im, t[5].re, t[5].im
    movlhps xm3, xm%3                   ; t[0].re, t[0].im, t[4].re, t[4].im

    xorps  xm2, xm7
    addps  xm%3, xm2, xm3
    subps  xm3, xm2

    shufps xm3, xm3, q1032
    vinsertf128 m%3, m%3, xm3, 1        ; All ACs (tmp[1] through to tmp[4])
    addps  m%3, m%3, m0                 ; Finally offset with DCs
%endmacro

%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
    mulps xm0, xm9,  [exptabq + %1 + 16*0]
    mulps xm1, xm10, [exptabq + %1 + 16*1]

    haddps  xm0, xm1
    movhlps xm1, xm0                    ; t[0].re, t[1].re, t[0].im, t[1].im

    addps xm0, xm1
    addps xm0, xm8

    movsd [outq], xm0
%endmacro

%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
    mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
    mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
    mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
    mulps m3, m13, [exptabq + 64*1 + 1*mmsize + %1]

    addps m0, m0, m2
    addps m1, m1, m3
    addps m0, m0, m11

    shufps m1, m1, m1, q2301
    addps  m0, m0, m1

    vextractf128 xm1, m0, 1

    movlps [outq + strideq*1], xm0
    movhps [outq + strideq*2], xm0
    movlps [outq + stride3q],  xm1
    movhps [outq + strideq*4], xm1
%endmacro

INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
    shl strideq, 3

    movaps xm5, [exptabq + 480 + 16*0]
    movaps xm6, [exptabq + 480 + 16*1]
    movaps xm7, [sign_adjust_5]

    FFT5  0, xm8,  11
    FFT5  8, xm9,  12
    FFT5 16, xm10, 13

%define stride3q inq
    lea stride3q, [strideq + strideq*2]
    lea stride5q, [strideq + strideq*4]

    BUTTERFLIES_DC (8*6 + 4*0)*2*4
    BUTTERFLIES_AC (8*0 + 0*0)*2*4

    add outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*1)*2*4
    BUTTERFLIES_AC (8*2 + 0*0)*2*4

    add outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*2)*2*4
    BUTTERFLIES_AC (8*4 + 0*0)*2*4

    RET

%endif ; ARCH_X86_64

;*******************************************************************************************************
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
;*******************************************************************************************************
%macro LUT_LOAD_4D 3
    mov      r4d, [lutq + %3q*4 + 0]
    movsd  xmm%1, [inq + r4q*8]
    mov      r4d, [lutq + %3q*4 + 4]
    movhps xmm%1, [inq + r4q*8]
%if cpuflag(avx2)
    mov      r4d, [lutq + %3q*4 + 8]
    movsd     %2, [inq + r4q*8]
    mov      r4d, [lutq + %3q*4 + 12]
    movhps    %2, [inq + r4q*8]
    vinsertf128 %1, %1, %2, 1
%endif
%endmacro

%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n

    xor offset_nq, offset_nq
    lea offset_pq, [len8q*2 - %1]

    movaps m7, [sign_adjust_r]

%if cpuflag(avx2)
    movaps m8, [perm_pos]
    movaps m9, [perm_neg]
%endif

.loop:
    movups m0, [expq + offset_pq*8]     ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
    movups m1, [expq + offset_nq*8]     ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im

    LUT_LOAD_4D m3, xm4, offset_p       ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
    LUT_LOAD_4D m4, xm5, offset_n       ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im

    mulps m5, m3, m0                    ; in[p].reim * exp[p].reim
    mulps m6, m4, m1                    ; in[n].reim * exp[n].reim

    xorps m5, m7                        ; in[p].re *= -1, in[p].im *= 1
    xorps m6, m7                        ; in[n].re *= -1, in[n].im *= 1

    shufps m3, m3, m3, q2301            ; in[p].imre
    shufps m4, m4, m4, q2301            ; in[n].imre

    mulps m3, m0                        ; in[p].imre * exp[p].reim
    mulps m4, m1                        ; in[n].imre * exp[n].reim

    haddps m3, m6                       ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
    haddps m5, m4                       ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im

%if cpuflag(avx2)
    vpermps m3, m9, m3                  ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
    vpermps m5, m8, m5                  ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
%else
    shufps m3, m3, m3, q0312
    shufps m5, m5, m5, q2130
%endif

    movups [outq + offset_nq*8], m3
    movups [outq + offset_pq*8], m5

    sub offset_pq, %1
    add offset_nq, %1
    cmp offset_nq, offset_pq
    jle .loop

    REP_RET
%endmacro

INIT_XMM sse3
POSTROTATE_FN 2

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif

99
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mdct15_init.c
vendored
Normal file
@@ -0,0 +1,99 @@
/*
 * SIMD optimized non-power-of-two MDCT functions
 *
 * Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/x86/cpu.h"
#include "libavcodec/mdct15.h"

void ff_mdct15_postreindex_sse3(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
void ff_mdct15_postreindex_avx2(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);

void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
static void perm_twiddles(MDCT15Context *s)
|
||||
{
|
||||
int k;
|
||||
FFTComplex tmp[30];
|
||||
|
||||
/* 5-point FFT twiddles */
|
||||
s->exptab[60].re = s->exptab[60].im = s->exptab[19].re;
|
||||
s->exptab[61].re = s->exptab[61].im = s->exptab[19].im;
|
||||
s->exptab[62].re = s->exptab[62].im = s->exptab[20].re;
|
||||
s->exptab[63].re = s->exptab[63].im = s->exptab[20].im;
|
||||
|
||||
/* 15-point FFT twiddles */
|
||||
for (k = 0; k < 5; k++) {
|
||||
tmp[6*k + 0] = s->exptab[k + 0];
|
||||
tmp[6*k + 2] = s->exptab[k + 5];
|
||||
tmp[6*k + 4] = s->exptab[k + 10];
|
||||
|
||||
tmp[6*k + 1] = s->exptab[2 * (k + 0)];
|
||||
tmp[6*k + 3] = s->exptab[2 * (k + 5)];
|
||||
tmp[6*k + 5] = s->exptab[2 * k + 5 ];
|
||||
}
|
||||
|
||||
for (k = 0; k < 6; k++) {
|
||||
FFTComplex ac_exp[] = {
|
||||
{ tmp[6*1 + k].re, tmp[6*1 + k].re },
|
||||
{ tmp[6*2 + k].re, tmp[6*2 + k].re },
|
||||
{ tmp[6*3 + k].re, tmp[6*3 + k].re },
|
||||
{ tmp[6*4 + k].re, tmp[6*4 + k].re },
|
||||
{ tmp[6*1 + k].im, -tmp[6*1 + k].im },
|
||||
{ tmp[6*2 + k].im, -tmp[6*2 + k].im },
|
||||
{ tmp[6*3 + k].im, -tmp[6*3 + k].im },
|
||||
{ tmp[6*4 + k].im, -tmp[6*4 + k].im },
|
||||
};
|
||||
memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex));
|
||||
}
|
||||
|
||||
/* Specialcase when k = 0 */
|
||||
for (k = 0; k < 3; k++) {
|
||||
FFTComplex dc_exp[] = {
|
||||
{ tmp[2*k + 0].re, -tmp[2*k + 0].im },
|
||||
{ tmp[2*k + 0].im, tmp[2*k + 0].re },
|
||||
{ tmp[2*k + 1].re, -tmp[2*k + 1].im },
|
||||
{ tmp[2*k + 1].im, tmp[2*k + 1].re },
|
||||
};
|
||||
memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex));
|
||||
}
|
||||
}
|
||||
|
||||
av_cold void ff_mdct15_init_x86(MDCT15Context *s)
|
||||
{
|
||||
int adjust_twiddles = 0;
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_SSE3(cpu_flags))
|
||||
s->postreindex = ff_mdct15_postreindex_sse3;
|
||||
|
||||
if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
|
||||
s->fft15 = ff_fft15_avx;
|
||||
adjust_twiddles = 1;
|
||||
}
|
||||
|
||||
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
|
||||
s->postreindex = ff_mdct15_postreindex_avx2;
|
||||
|
||||
if (adjust_twiddles)
|
||||
perm_twiddles(s);
|
||||
}
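The init function above shows the runtime-dispatch shape used throughout these files: set the portable default elsewhere, probe the CPU flags once, and let later (stronger) checks override earlier ones. A minimal sketch of that pattern, with hypothetical names that are not FFmpeg APIs:

/* Hypothetical sketch of the dispatch pattern; my_dsp and my_kernel_*
 * are placeholders, not FFmpeg identifiers. */
typedef struct my_dsp { void (*kernel)(float *dst, int n); } my_dsp;

static void my_kernel_c(float *dst, int n)    { /* portable fallback */ }
static void my_kernel_simd(float *dst, int n) { /* hand-tuned variant */ }

static void my_dsp_init(my_dsp *d, int have_baseline, int have_fast)
{
    d->kernel = my_kernel_c;          /* safe default first */
    if (have_baseline)
        d->kernel = my_kernel_simd;   /* each later check overrides */
    if (have_fast)
        d->kernel = my_kernel_simd;   /* fastest variant wins */
}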
651
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/me_cmp_init.c
vendored
Normal file
@@ -0,0 +1,651 @@
/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);

#define hadamard_func(cpu) \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                  uint8_t *src2, ptrdiff_t stride, int h); \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_X86ASM
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
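Stripped of the dispatch details, both wrappers above compute the same score: block SSE plus a weighted absolute difference of the two blocks' high-frequency noise measures, with 8 as the fallback weight. A plain restatement using only what is visible above:

/* Restatement of the NSSE score from the two wrappers above; the
 * hf_noise inputs are whatever ff_hf_noise*_mmx return. */
static int nsse_score(int sse, int hf_noise1, int hf_noise2, int weight)
{
    int diff = hf_noise1 - hf_noise2;
    return sse + (diff < 0 ? -diff : diff) * weight; /* FFABS(diff) */
}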

#endif /* HAVE_X86ASM */

#if HAVE_INLINE_ASM

static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2,%0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
        "add %3, %%"FF_REG_a" \n\t"
        "psubusb %%mm0, %%mm2 \n\t"
        "psubusb %%mm4, %%mm0 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm5, %%mm1 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %3, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}

static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "psrlw $1, %%mm1 \n\t"
        "psrlw $1, %%mm3 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm2, %%mm1 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %4, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}

static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
        "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "movq %5, %%mm5 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "psubusb %%mm0, %%mm4 \n\t"
        "psubusb %%mm5, %%mm0 \n\t"
        "por %%mm4, %%mm0 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm4 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "movq %%mm3, %%mm1 \n\t"
        "add %4, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}

static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}
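sum_mmx folds the four packed 16-bit partial sums accumulated in %mm6 into one value with two shift-and-add steps. The scalar equivalent of that fold:

/* Scalar equivalent of sum_mmx: add the four 16-bit lanes of a 64-bit
 * accumulator; as in the asm, the caller keeps only the low 16 bits. */
static unsigned fold_words(uint64_t mm6)
{
    unsigned sum = 0;
    for (int i = 0; i < 4; i++)
        sum += (unsigned)(mm6 >> (16 * i)) & 0xFFFF;
    return sum & 0xFFFF;
}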

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                        uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
\
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
\
    return sum_ ## suf(); \
} \
\
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
\
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
\
    return sum_ ## suf(); \
} \
\
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
\
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
\
    return sum_ ## suf(); \
} \
\
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
\
    sad8_4_ ## suf(blk1, blk2, stride, 8); \
\
    return sum_ ## suf(); \
} \
\
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                         uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
\
    sad8_1_ ## suf(blk1, blk2, stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
    return sum_ ## suf(); \
} \
\
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
\
    sad8_x2a_ ## suf(blk1, blk2, stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
    return sum_ ## suf(); \
} \
\
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
\
    sad8_y2a_ ## suf(blk1, blk2, stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
    return sum_ ## suf(); \
} \
\
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                             uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
\
    sad8_4_ ## suf(blk1, blk2, stride, h); \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
    return sum_ ## suf(); \
} \

PIX_SAD(mmx)

#endif /* HAVE_INLINE_ASM */
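All the kernels generated by PIX_SAD specialize one reference computation, the sum of absolute differences, with the x2/y2/xy2 variants averaging half-pel neighbours first. For orientation, the plain 16-wide SAD in scalar C (the standard definition, shown for comparison only, not copied from FFmpeg):

/* Scalar reference for the 16 x h SAD that sad16_mmx accelerates. */
static int sad16_ref(const uint8_t *pix1, const uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            int d = pix1[x] - pix2[x];
            sum += d < 0 ? -d : d;
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}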

av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
        c->sse[0] = ff_sse16_mmx;
        c->sse[1] = ff_sse8_mmx;
#if HAVE_X86ASM
        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0] = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4] = ff_vsad_intra16_sse2;
            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0] = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}
204
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mlpdsp_init.c
vendored
Normal file
@@ -0,0 +1,204 @@
/*
 * MLP DSP functions x86-optimized
 * Copyright (c) 2009 Ramiro Polla
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mlpdsp.h"
#include "libavcodec/mlp.h"

#define REMATRIX_CHANNEL_FUNC(opt) \
void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
                                   const int32_t *coeffs, \
                                   const uint8_t *bypassed_lsbs, \
                                   const int8_t *noise_buffer, \
                                   int index, \
                                   unsigned int dest_ch, \
                                   uint16_t blockpos, \
                                   unsigned int maxchan, \
                                   int matrix_noise_shift, \
                                   int access_unit_size_pow2, \
                                   int32_t mask);

REMATRIX_CHANNEL_FUNC(sse4)
REMATRIX_CHANNEL_FUNC(avx2_bmi2)

#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS

extern char ff_mlp_firorder_8;
extern char ff_mlp_firorder_7;
extern char ff_mlp_firorder_6;
extern char ff_mlp_firorder_5;
extern char ff_mlp_firorder_4;
extern char ff_mlp_firorder_3;
extern char ff_mlp_firorder_2;
extern char ff_mlp_firorder_1;
extern char ff_mlp_firorder_0;

extern char ff_mlp_iirorder_4;
extern char ff_mlp_iirorder_3;
extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_0;

static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
                                          &ff_mlp_firorder_2, &ff_mlp_firorder_3,
                                          &ff_mlp_firorder_4, &ff_mlp_firorder_5,
                                          &ff_mlp_firorder_6, &ff_mlp_firorder_7,
                                          &ff_mlp_firorder_8 };
static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
                                          &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
                                          &ff_mlp_iirorder_4 };

#if ARCH_X86_64

#define MLPMUL(label, offset, offs, offc) \
    LABEL_MANGLE(label)": \n\t" \
    "movslq "offset"+"offs"(%0), %%rax\n\t" \
    "movslq "offset"+"offc"(%1), %%rdx\n\t" \
    "imul %%rdx, %%rax\n\t" \
    "add %%rax, %%rsi\n\t"

#define FIRMULREG(label, offset, firc)\
    LABEL_MANGLE(label)": \n\t" \
    "movslq "#offset"(%0), %%rax\n\t" \
    "imul %"#firc", %%rax\n\t" \
    "add %%rax, %%rsi\n\t"

#define CLEAR_ACCUM \
    "xor %%rsi, %%rsi\n\t"

#define SHIFT_ACCUM \
    "shr %%cl, %%rsi\n\t"

#define ACCUM    "%%rdx"
#define RESULT   "%%rsi"
#define RESULT32 "%%esi"

#else /* if ARCH_X86_32 */

#define MLPMUL(label, offset, offs, offc) \
    LABEL_MANGLE(label)": \n\t" \
    "mov "offset"+"offs"(%0), %%eax\n\t" \
    "imull "offset"+"offc"(%1) \n\t" \
    "add %%eax , %%esi\n\t" \
    "adc %%edx , %%ecx\n\t"

#define FIRMULREG(label, offset, firc) \
    MLPMUL(label, #offset, "0", "0")

#define CLEAR_ACCUM \
    "xor %%esi, %%esi\n\t" \
    "xor %%ecx, %%ecx\n\t"

#define SHIFT_ACCUM \
    "mov %%ecx, %%edx\n\t" \
    "mov %%esi, %%eax\n\t" \
    "movzbl %7 , %%ecx\n\t" \
    "shrd %%cl, %%edx, %%eax\n\t" \

#define ACCUM    "%%edx"
#define RESULT   "%%eax"
#define RESULT32 "%%eax"

#endif /* !ARCH_X86_64 */

#define BINC  AV_STRINGIFY(4* MAX_CHANNELS)
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)

#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)

static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
                                   int firorder, int iirorder,
                                   unsigned int filter_shift, int32_t mask,
                                   int blocksize, int32_t *sample_buffer)
{
    const void *firjump = firtable[firorder];
    const void *iirjump = iirtable[iirorder];

    blocksize = -blocksize;

    __asm__ volatile(
        "1: \n\t"
        CLEAR_ACCUM
        "jmp *%5 \n\t"
        FIRMUL   (ff_mlp_firorder_8, 0x1c)
        FIRMUL   (ff_mlp_firorder_7, 0x18)
        FIRMUL   (ff_mlp_firorder_6, 0x14)
        FIRMUL   (ff_mlp_firorder_5, 0x10)
        FIRMUL   (ff_mlp_firorder_4, 0x0c)
        FIRMUL   (ff_mlp_firorder_3, 0x08)
        FIRMUL   (ff_mlp_firorder_2, 0x04)
        FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
        LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
        "jmp *%6 \n\t"
        IIRMUL   (ff_mlp_iirorder_4, 0x0c)
        IIRMUL   (ff_mlp_iirorder_3, 0x08)
        IIRMUL   (ff_mlp_iirorder_2, 0x04)
        IIRMUL   (ff_mlp_iirorder_1, 0x00)
        LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
        SHIFT_ACCUM
        "mov "RESULT" ,"ACCUM" \n\t"
        "add (%2) ,"RESULT" \n\t"
        "and %4 ,"RESULT" \n\t"
        "sub $4 , %0 \n\t"
        "mov "RESULT32", (%0) \n\t"
        "mov "RESULT32", (%2) \n\t"
        "add $"BINC" , %2 \n\t"
        "sub "ACCUM" ,"RESULT" \n\t"
        "mov "RESULT32","IOFFS"(%0) \n\t"
        "incl %3 \n\t"
        "js 1b \n\t"
        : /* 0*/"+r"(state),
          /* 1*/"+r"(coeff),
          /* 2*/"+r"(sample_buffer),
#if ARCH_X86_64
          /* 3*/"+r"(blocksize)
        : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
          /* 6*/"r"(iirjump)      , /* 7*/"c"(filter_shift)
        , /* 8*/"r"((int64_t)coeff[0])
        : "rax", "rdx", "rsi"
#else /* ARCH_X86_32 */
          /* 3*/"+m"(blocksize)
        : /* 4*/"m"( mask), /* 5*/"m"(firjump),
          /* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
        : "eax", "edx", "esi", "ecx"
#endif /* !ARCH_X86_64 */
    );
}

#endif /* HAVE_7REGS && HAVE_INLINE_ASM */
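The jump-table trick above enters an unrolled multiply-accumulate ladder at the right tap count, then updates the filter state in place. A hedged scalar sketch of the same loop, reconstructed from the asm and the IOFFS/IOFFC/BINC byte offsets; the state layout details here are assumptions inferred from that asm, not FFmpeg's reference C code:

/* Hedged scalar sketch of mlp_filter_channel_x86: state grows downwards
 * ("sub $4, %0"), the IIR history lives IOFFS bytes past the FIR history,
 * and samples step by MAX_CHANNELS (BINC). Treat layout as assumed. */
static void mlp_filter_channel_sketch(int32_t *state, const int32_t *coeff,
                                      int firorder, int iirorder,
                                      unsigned filter_shift, int32_t mask,
                                      int blocksize, int32_t *sample_buffer)
{
    for (int i = 0; i < blocksize; i++) {
        int64_t accum = 0;
        for (int j = 0; j < firorder; j++)
            accum += (int64_t)state[j] * coeff[j];
        for (int j = 0; j < iirorder; j++)
            accum += (int64_t)state[MAX_FIR_ORDER + MAX_BLOCKSIZE + j] *
                     coeff[MAX_FIR_ORDER + j];
        accum >>= filter_shift;                       /* SHIFT_ACCUM */
        int32_t result = (int32_t)((accum + *sample_buffer) & mask);
        state--;                                      /* "sub $4, %0" */
        state[0] = result;
        state[MAX_FIR_ORDER + MAX_BLOCKSIZE] = result - (int32_t)accum;
        *sample_buffer = result;
        sample_buffer += MAX_CHANNELS;                /* "add $BINC, %2" */
    }
}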

av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();
#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
    if (INLINE_MMX(cpu_flags))
        c->mlp_filter_channel = mlp_filter_channel_x86;
#endif
    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
    if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
}
289
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mpegaudiodsp.c
vendored
Normal file
@@ -0,0 +1,289 @@
/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if HAVE_X86ASM
#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */

void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_6REGS && HAVE_SSE_INLINE

#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

#define SUM8(op, sum, w, p) \
{ \
    op(sum, (w)[0 * 64], (p)[0 * 64]); \
    op(sum, (w)[1 * 64], (p)[1 * 64]); \
    op(sum, (w)[2 * 64], (p)[2 * 64]); \
    op(sum, (w)[3 * 64], (p)[3 * 64]); \
    op(sum, (w)[4 * 64], (p)[4 * 64]); \
    op(sum, (w)[5 * 64], (p)[5 * 64]); \
    op(sum, (w)[6 * 64], (p)[6 * 64]); \
    op(sum, (w)[7 * 64], (p)[7 * 64]); \
}
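SUM8 unrolls an 8-tap dot product whose taps are spaced 64 floats apart, matching the layout of the polyphase synthesis buffers. Equivalently, for the MACS case:

/* Equivalent of SUM8(MACS, sum, w, p): an 8-tap dot product with a
 * stride of 64 floats between taps. */
static float sum8(const float *w, const float *p)
{
    float sum = 0.f;
    for (int k = 0; k < 8; k++)
        sum += w[k * 64] * p[k * 64];
    return sum;
}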

static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa  = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;

#define MULT(a, b) \
    "movaps " #a "(%1,%0), %%xmm1 \n\t" \
    "movaps " #a "(%3,%0), %%xmm2 \n\t" \
    "mulps %%xmm2, %%xmm1 \n\t" \
    "subps %%xmm1, %%xmm0 \n\t" \
    "mulps " #b "(%2,%0), %%xmm2 \n\t" \
    "subps %%xmm2, %%xmm4 \n\t" \

    __asm__ volatile(
        "1: \n\t"
        "xorps %%xmm0, %%xmm0 \n\t"
        "xorps %%xmm4, %%xmm4 \n\t"

        MULT(   0,   0)
        MULT( 256,  64)
        MULT( 512, 128)
        MULT( 768, 192)
        MULT(1024, 256)
        MULT(1280, 320)
        MULT(1536, 384)
        MULT(1792, 448)

        "movaps %%xmm0, (%4,%0) \n\t"
        "movaps %%xmm4, (%5,%0) \n\t"
        "add $16, %0 \n\t"
        "jl 1b \n\t"
        :"+&r"(count)
        :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
    );

#undef MULT
}

static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             ptrdiff_t incr)
{
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap */
    __asm__ volatile(
        "movaps 0(%0), %%xmm0 \n\t" \
        "movaps 16(%0), %%xmm1 \n\t" \
        "movaps 32(%0), %%xmm2 \n\t" \
        "movaps 48(%0), %%xmm3 \n\t" \
        "movaps %%xmm0, 0(%1) \n\t" \
        "movaps %%xmm1, 16(%1) \n\t" \
        "movaps %%xmm2, 32(%1) \n\t" \
        "movaps %%xmm3, 48(%1) \n\t" \
        "movaps 64(%0), %%xmm0 \n\t" \
        "movaps 80(%0), %%xmm1 \n\t" \
        "movaps 96(%0), %%xmm2 \n\t" \
        "movaps 112(%0), %%xmm3 \n\t" \
        "movaps %%xmm0, 64(%1) \n\t" \
        "movaps %%xmm1, 80(%1) \n\t" \
        "movaps %%xmm2, 96(%1) \n\t" \
        "movaps %%xmm3, 112(%1) \n\t"
        ::"r"(in), "r"(in+512)
        :"memory"
    );

    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    SUM8(MACS, suma[0], win + 32, in + 48);

    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
    "movups " #sumd "(%4), %%xmm0 \n\t" \
    "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
    "subps " #suma "(%1), %%xmm0 \n\t" \
    "movaps %%xmm0," #out1 "(%0) \n\t" \
\
    "movups " #sumc "(%3), %%xmm0 \n\t" \
    "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
    "addps " #sumb "(%2), %%xmm0 \n\t" \
    "movaps %%xmm0," #out2 "(%0) \n\t"

    if (incr == 1) {
        __asm__ volatile(
            SUMS( 0, 48,  4, 52,  0, 112)
            SUMS(16, 32, 20, 36, 16,  96)
            SUMS(32, 16, 36, 20, 32,  80)
            SUMS(48,  0, 52,  4, 48,  64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
        );
        out += 16*incr;
    } else {
        int j;
        float *out2 = out + 32 * incr;
        out[0 ]  = -suma[ 0];
        out += incr;
        out2 -= incr;
        for(j=1;j<16;j++) {
            *out  = -suma[ j] + sumd[16-j];
            *out2 =  sumb[16-j] + sumc[ j];
            out  += incr;
            out2 -= incr;
        }
    }

    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_X86ASM
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
                                    int count, int switch_point, int block_type) \
{ \
    int align_end = count - (count & 3); \
    int j; \
    for (j = 0; j < align_end; j+= 4) { \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
        float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
        /* apply window & overlap with previous buffer */ \
\
        /* select window */ \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
        in  += 4*18; \
        buf += 4*18; \
        out += 4; \
    } \
    for (; j < count; j++) { \
        /* apply window & overlap with previous buffer */ \
\
        /* select window */ \
        int win_idx = (switch_point && j < 2) ? 0 : block_type; \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
\
        ff_imdct36_float_ ## CPU1(out, buf, in, win); \
\
        in  += 18; \
        buf++; \
        out++; \
    } \
}

#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */

av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    av_unused int cpu_flags = av_get_cpu_flags();

    int i, j;
    for (j = 0; j < 4; j++) {
        for (i = 0; i < 40; i ++) {
            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
        }
    }

#if HAVE_6REGS && HAVE_SSE_INLINE
    if (INLINE_SSE(cpu_flags)) {
        s->apply_window_float = apply_window_mp3;
    }
#endif /* HAVE_SSE_INLINE */

#if HAVE_X86ASM
#if HAVE_SSE
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse;
    }
#endif
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    }
#endif
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_avx;
    }
#endif
#endif /* HAVE_X86ASM */
}
469
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mpegvideo.c
vendored
Normal file
@@ -0,0 +1,469 @@
/*
 * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
 * H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mpegvideodata.h"

#if HAVE_MMX_INLINE

static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
                                          int16_t *block, int n, int qscale)
{
    x86_reg level, qmul, qadd, nCoeffs;

    qmul = qscale << 1;

    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);

    if (!s->h263_aic) {
        if (n < 4)
            level = block[0] * s->y_dc_scale;
        else
            level = block[0] * s->c_dc_scale;
        qadd = (qscale - 1) | 1;
    }else{
        qadd = 0;
        level= block[0];
    }
    if(s->ac_pred)
        nCoeffs=63;
    else
        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];

    __asm__ volatile(
        "movd %1, %%mm6 \n\t" //qmul
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "movd %2, %%mm5 \n\t" //qadd
        "pxor %%mm7, %%mm7 \n\t"
        "packssdw %%mm5, %%mm5 \n\t"
        "packssdw %%mm5, %%mm5 \n\t"
        "psubw %%mm5, %%mm7 \n\t"
        "pxor %%mm4, %%mm4 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %3), %%mm0 \n\t"
        "movq 8(%0, %3), %%mm1 \n\t"

        "pmullw %%mm6, %%mm0 \n\t"
        "pmullw %%mm6, %%mm1 \n\t"

        "movq (%0, %3), %%mm2 \n\t"
        "movq 8(%0, %3), %%mm3 \n\t"

        "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0

        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"

        "paddw %%mm7, %%mm0 \n\t"
        "paddw %%mm7, %%mm1 \n\t"

        "pxor %%mm0, %%mm2 \n\t"
        "pxor %%mm1, %%mm3 \n\t"

        "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0

        "pandn %%mm2, %%mm0 \n\t"
        "pandn %%mm3, %%mm1 \n\t"

        "movq %%mm0, (%0, %3) \n\t"
        "movq %%mm1, 8(%0, %3) \n\t"

        "add $16, %3 \n\t"
        "jng 1b \n\t"
        ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
        : "memory"
    );
    block[0]= level;
}
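Per the inline comments, the loop applies the H.263 dequantization rule to every coefficient while leaving zeros untouched (the pcmpeqw/pandn pair); the DC term is patched back in afterwards. In scalar form:

/* Scalar form of the MMX loop above: sign-symmetric H.263 dequant with
 * zeros preserved; block[0] is restored separately, as in the function. */
static void h263_dequant_ref(int16_t *block, int nCoeffs, int qmul, int qadd)
{
    for (int i = 0; i <= nCoeffs; i++) {
        int v = block[i];
        if (v)
            block[i] = v > 0 ? v * qmul + qadd : v * qmul - qadd;
    }
}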

static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
                                          int16_t *block, int n, int qscale)
{
    x86_reg qmul, qadd, nCoeffs;

    qmul = qscale << 1;
    qadd = (qscale - 1) | 1;

    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);

    nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];

    __asm__ volatile(
        "movd %1, %%mm6 \n\t" //qmul
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "movd %2, %%mm5 \n\t" //qadd
        "pxor %%mm7, %%mm7 \n\t"
        "packssdw %%mm5, %%mm5 \n\t"
        "packssdw %%mm5, %%mm5 \n\t"
        "psubw %%mm5, %%mm7 \n\t"
        "pxor %%mm4, %%mm4 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %3), %%mm0 \n\t"
        "movq 8(%0, %3), %%mm1 \n\t"

        "pmullw %%mm6, %%mm0 \n\t"
        "pmullw %%mm6, %%mm1 \n\t"

        "movq (%0, %3), %%mm2 \n\t"
        "movq 8(%0, %3), %%mm3 \n\t"

        "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0

        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"

        "paddw %%mm7, %%mm0 \n\t"
        "paddw %%mm7, %%mm1 \n\t"

        "pxor %%mm0, %%mm2 \n\t"
        "pxor %%mm1, %%mm3 \n\t"

        "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0

        "pandn %%mm2, %%mm0 \n\t"
        "pandn %%mm3, %%mm1 \n\t"

        "movq %%mm0, (%0, %3) \n\t"
        "movq %%mm1, 8(%0, %3) \n\t"

        "add $16, %3 \n\t"
        "jng 1b \n\t"
        ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
        : "memory"
    );
}

static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n]>=0);

    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    /* XXX: only MPEG-1 */
    quant_matrix = s->intra_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $15, %%mm7 \n\t"
        "movd %2, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "mov %3, %%"FF_REG_a" \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
        "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
        "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
        "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
        "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2 \n\t"
        "pxor %%mm3, %%mm3 \n\t"
        "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
        "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
        "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t" // FIXME slow
        "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $3, %%mm0 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psubw %%mm7, %%mm0 \n\t"
        "psubw %%mm7, %%mm1 \n\t"
        "por %%mm7, %%mm0 \n\t"
        "por %%mm7, %%mm1 \n\t"
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "pandn %%mm0, %%mm4 \n\t"
        "pandn %%mm1, %%mm5 \n\t"
        "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
        "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"

        "add $16, %%"FF_REG_a" \n\t"
        "js 1b \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
        : "%"FF_REG_a, "memory"
    );
    block[0]= block0;
}
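The MPEG-1 intra path additionally scales by the quantization matrix and forces each nonzero result odd (mm7 holds words of 1, so the psubw/por pair computes (x - 1) | 1). The per-coefficient rule, restated in C:

/* Per-coefficient rule of the loop above: scale by qscale and the intra
 * matrix, shift by 3, then force the magnitude odd; zeros stay zero. */
static int mpeg1_intra_dequant_one(int v, int qscale, int qm)
{
    int a = v < 0 ? -v : v;
    int level;
    if (!v)
        return 0;
    level = (a * qscale * qm) >> 3;
    level = (level - 1) | 1;            /* "psubw %%mm7" then "por %%mm7" */
    return v < 0 ? -level : level;
}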

static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;

    av_assert2(s->block_last_index[n]>=0);

    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;

    quant_matrix = s->inter_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $15, %%mm7 \n\t"
        "movd %2, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "mov %3, %%"FF_REG_a" \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
        "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
        "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
        "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
        "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2 \n\t"
        "pxor %%mm3, %%mm3 \n\t"
        "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
        "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
        "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
        "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
        "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
        "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
        "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t" // FIXME slow
        "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $4, %%mm0 \n\t"
        "psraw $4, %%mm1 \n\t"
        "psubw %%mm7, %%mm0 \n\t"
        "psubw %%mm7, %%mm1 \n\t"
        "por %%mm7, %%mm0 \n\t"
        "por %%mm7, %%mm1 \n\t"
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "pandn %%mm0, %%mm4 \n\t"
        "pandn %%mm1, %%mm5 \n\t"
        "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
        "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"

        "add $16, %%"FF_REG_a" \n\t"
        "js 1b \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
        : "%"FF_REG_a, "memory"
    );
}

static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n]>=0);

    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
    else                 qscale <<= 1;

    if(s->alternate_scan) nCoeffs= 63; //FIXME
    else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    quant_matrix = s->intra_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $15, %%mm7 \n\t"
        "movd %2, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "mov %3, %%"FF_REG_a" \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
        "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
        "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
        "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
        "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2 \n\t"
        "pxor %%mm3, %%mm3 \n\t"
        "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
        "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
        "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t" // FIXME slow
        "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $4, %%mm0 \n\t"
        "psraw $4, %%mm1 \n\t"
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "pandn %%mm0, %%mm4 \n\t"
        "pandn %%mm1, %%mm5 \n\t"
        "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
        "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"

        "add $16, %%"FF_REG_a" \n\t"
        "jng 1b \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
        : "%"FF_REG_a, "memory"
    );
    block[0]= block0;
    //Note, we do not do mismatch control for intra as errors cannot accumulate
}

static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;

    av_assert2(s->block_last_index[n]>=0);

    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
    else                 qscale <<= 1;

    if(s->alternate_scan) nCoeffs= 63; //FIXME
    else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];

    quant_matrix = s->inter_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlq $48, %%mm7 \n\t"
        "movd %2, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "mov %3, %%"FF_REG_a" \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
        "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
        "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
        "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
        "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2 \n\t"
        "pxor %%mm3, %%mm3 \n\t"
        "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
        "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
        "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
        "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
        "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
        "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
        "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t" // FIXME slow
        "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psrlw $5, %%mm0 \n\t"
        "psrlw $5, %%mm1 \n\t"
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "pandn %%mm0, %%mm4 \n\t"
        "pandn %%mm1, %%mm5 \n\t"
        "pxor %%mm4, %%mm7 \n\t"
        "pxor %%mm5, %%mm7 \n\t"
        "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
        "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"

        "add $16, %%"FF_REG_a" \n\t"
        "jng 1b \n\t"
        "movd 124(%0, %3), %%mm0 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "psrlq $32, %%mm7 \n\t"
        "pxor %%mm6, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "psrlq $16, %%mm7 \n\t"
        "pxor %%mm6, %%mm7 \n\t"
        "pslld $31, %%mm7 \n\t"
        "psrlq $15, %%mm7 \n\t"
        "pxor %%mm7, %%mm0 \n\t"
        "movd %%mm0, 124(%0, %3) \n\t"

        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
        : "%"FF_REG_a, "memory"
    );
}
|
||||
|
||||
#endif /* HAVE_MMX_INLINE */
|
||||
|
||||
av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
|
||||
{
|
||||
#if HAVE_MMX_INLINE
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (INLINE_MMX(cpu_flags)) {
|
||||
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
|
||||
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
|
||||
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
|
||||
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
|
||||
if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT))
|
||||
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
|
||||
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
|
||||
}
|
||||
#endif /* HAVE_MMX_INLINE */
|
||||
}
|
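For readers untangling the inline assembly above: the inter path computes ((2*|level| + 1) * qscale * matrix[i]) >> 5 per coefficient, then the XOR chain on %%mm7 applies IEEE-1180-style mismatch control to the last coefficient. A minimal scalar sketch of the same idea (hypothetical reference code, not part of the vendored file; saturation details omitted):

#include <stdint.h>
#include <stdlib.h>

static void dequant_mpeg2_inter_ref(int16_t block[64],
                                    const uint16_t matrix[64], int qscale)
{
    int sum = 0;
    for (int i = 0; i < 64; i++) {
        int level = block[i];
        if (level) {
            int neg = level < 0;
            level = ((2 * abs(level) + 1) * qscale * matrix[i]) >> 5;
            if (neg)
                level = -level;
        }
        block[i] = level;
        sum += level;
    }
    if (!(sum & 1))       /* mismatch control: force the coefficient sum odd */
        block[63] ^= 1;
}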
161
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mpegvideodsp.c
vendored
Normal file
@@ -0,0 +1,161 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideodsp.h"
#include "libavcodec/videodsp.h"

#if HAVE_INLINE_ASM

static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    const int w    = 8;
    const int ix   = ox >> (16 + shift);
    const int iy   = oy >> (16 + shift);
    const int oxs  = ox >> 4;
    const int oys  = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2 = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H 8U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    int need_emu = (unsigned) ix >= width  - w || width  < w ||
                   (unsigned) iy >= height - h || height < h;

    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
        // uses more than 16 bits of subpel mv (only at huge resolution)
        (dxx | dxy | dyx | dyy) & 15 ||
        (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if (need_emu) {
        ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r" (1 << shift));

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m" (*dx4), "+m" (*dy4)
                : "m" (*dxy4), "m" (*dyy4));

            __asm__ volatile (
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m" (dst[x + y * stride])
                : "m" (src[0]), "m" (src[1]),
                  "m" (src[stride]), "m" (src[stride + 1]),
                  "m" (*r4), "m" (shift2));
            src += stride;
        }
        src += 4 - h * stride;
    }
}

#endif /* HAVE_INLINE_ASM */

av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
{
#if HAVE_INLINE_ASM
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags))
        c->gmc = gmc_mmx;
#endif /* HAVE_INLINE_ASM */
}
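The gmc_mmx kernel above is a fixed-point bilinear blend of four neighbouring source pixels. A scalar model of one output pixel, assuming s = 1 << shift and fractions dx, dy already reduced to [0, s) as the psrlw $12 does (illustrative sketch only, not part of the vendored file):

#include <stdint.h>

static uint8_t gmc_pixel_ref(const uint8_t *src, int stride,
                             int dx, int dy, int s, int r, int shift)
{
    int v = src[0]          * (s - dx) * (s - dy) +
            src[1]          * dx       * (s - dy) +
            src[stride]     * (s - dx) * dy       +
            src[stride + 1] * dx       * dy       + r;

    v >>= 2 * shift;                      /* mirrors "psrlw %6" with shift2 */
    return v < 0 ? 0 : v > 255 ? 255 : v; /* packuswb saturates to a byte */
}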
244
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mpegvideoenc.c
vendored
Normal file
@@ -0,0 +1,244 @@
/*
 * The simplest mpeg encoder (well, it was the simplest!)
 * Copyright (c) 2000,2001 Fabrice Bellard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dct.h"
#include "libavcodec/mpegvideo.h"

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
     1,  2,  6,  7, 15, 16, 28, 29,
     3,  5,  8, 14, 17, 27, 30, 43,
     4,  9, 13, 18, 26, 31, 42, 44,
    10, 12, 19, 25, 32, 41, 45, 54,
    11, 20, 24, 33, 40, 46, 53, 55,
    21, 23, 34, 39, 47, 52, 56, 61,
    22, 35, 38, 48, 51, 57, 60, 62,
    36, 37, 49, 50, 58, 59, 63, 64,
};

#if HAVE_6REGS

#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2   0
#define COMPILE_TEMPLATE_SSSE3  0
#define RENAME(a)      a ## _mmx
#define RENAME_FDCT(a) a ## _mmx
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMX_INLINE */

#if HAVE_MMXEXT_INLINE
#undef COMPILE_TEMPLATE_SSSE3
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define COMPILE_TEMPLATE_SSE2   0
#define COMPILE_TEMPLATE_SSSE3  0
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a)      a ## _mmxext
#define RENAME_FDCT(a) a ## _mmxext
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMXEXT_INLINE */

#if HAVE_SSE2_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2   1
#define COMPILE_TEMPLATE_SSSE3  0
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a)      a ## _sse2
#define RENAME_FDCT(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSSE3_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2   1
#define COMPILE_TEMPLATE_SSSE3  1
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a)      a ## _ssse3
#define RENAME_FDCT(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_6REGS */

#if HAVE_INLINE_ASM
#if HAVE_MMX_INLINE
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
    const int intra= s->mb_intra;
    int *sum= s->dct_error_sum[intra];
    uint16_t *offset= s->dct_offset[intra];

    s->dct_count[intra]++;

    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        "pxor %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "movq (%0), %%mm2 \n\t"
        "movq 8(%0), %%mm3 \n\t"
        "pcmpgtw %%mm2, %%mm0 \n\t"
        "pcmpgtw %%mm3, %%mm1 \n\t"
        "pxor %%mm0, %%mm2 \n\t"
        "pxor %%mm1, %%mm3 \n\t"
        "psubw %%mm0, %%mm2 \n\t"
        "psubw %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "psubusw (%2), %%mm2 \n\t"
        "psubusw 8(%2), %%mm3 \n\t"
        "pxor %%mm0, %%mm2 \n\t"
        "pxor %%mm1, %%mm3 \n\t"
        "psubw %%mm0, %%mm2 \n\t"
        "psubw %%mm1, %%mm3 \n\t"
        "movq %%mm2, (%0) \n\t"
        "movq %%mm3, 8(%0) \n\t"
        "movq %%mm4, %%mm2 \n\t"
        "movq %%mm5, %%mm3 \n\t"
        "punpcklwd %%mm7, %%mm4 \n\t"
        "punpckhwd %%mm7, %%mm2 \n\t"
        "punpcklwd %%mm7, %%mm5 \n\t"
        "punpckhwd %%mm7, %%mm3 \n\t"
        "paddd (%1), %%mm4 \n\t"
        "paddd 8(%1), %%mm2 \n\t"
        "paddd 16(%1), %%mm5 \n\t"
        "paddd 24(%1), %%mm3 \n\t"
        "movq %%mm4, (%1) \n\t"
        "movq %%mm2, 8(%1) \n\t"
        "movq %%mm5, 16(%1) \n\t"
        "movq %%mm3, 24(%1) \n\t"
        "add $16, %0 \n\t"
        "add $32, %1 \n\t"
        "add $16, %2 \n\t"
        "cmp %3, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
    );
}
#endif /* HAVE_MMX_INLINE */

#if HAVE_SSE2_INLINE
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
    const int intra= s->mb_intra;
    int *sum= s->dct_error_sum[intra];
    uint16_t *offset= s->dct_offset[intra];

    s->dct_count[intra]++;

    __asm__ volatile(
        "pxor %%xmm7, %%xmm7 \n\t"
        "1: \n\t"
        "pxor %%xmm0, %%xmm0 \n\t"
        "pxor %%xmm1, %%xmm1 \n\t"
        "movdqa (%0), %%xmm2 \n\t"
        "movdqa 16(%0), %%xmm3 \n\t"
        "pcmpgtw %%xmm2, %%xmm0 \n\t"
        "pcmpgtw %%xmm3, %%xmm1 \n\t"
        "pxor %%xmm0, %%xmm2 \n\t"
        "pxor %%xmm1, %%xmm3 \n\t"
        "psubw %%xmm0, %%xmm2 \n\t"
        "psubw %%xmm1, %%xmm3 \n\t"
        "movdqa %%xmm2, %%xmm4 \n\t"
        "movdqa %%xmm3, %%xmm5 \n\t"
        "psubusw (%2), %%xmm2 \n\t"
        "psubusw 16(%2), %%xmm3 \n\t"
        "pxor %%xmm0, %%xmm2 \n\t"
        "pxor %%xmm1, %%xmm3 \n\t"
        "psubw %%xmm0, %%xmm2 \n\t"
        "psubw %%xmm1, %%xmm3 \n\t"
        "movdqa %%xmm2, (%0) \n\t"
        "movdqa %%xmm3, 16(%0) \n\t"
        "movdqa %%xmm4, %%xmm6 \n\t"
        "movdqa %%xmm5, %%xmm0 \n\t"
        "punpcklwd %%xmm7, %%xmm4 \n\t"
        "punpckhwd %%xmm7, %%xmm6 \n\t"
        "punpcklwd %%xmm7, %%xmm5 \n\t"
        "punpckhwd %%xmm7, %%xmm0 \n\t"
        "paddd (%1), %%xmm4 \n\t"
        "paddd 16(%1), %%xmm6 \n\t"
        "paddd 32(%1), %%xmm5 \n\t"
        "paddd 48(%1), %%xmm0 \n\t"
        "movdqa %%xmm4, (%1) \n\t"
        "movdqa %%xmm6, 16(%1) \n\t"
        "movdqa %%xmm5, 32(%1) \n\t"
        "movdqa %%xmm0, 48(%1) \n\t"
        "add $32, %0 \n\t"
        "add $64, %1 \n\t"
        "add $32, %2 \n\t"
        "cmp %3, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
    );
}
#endif /* HAVE_SSE2_INLINE */
#endif /* HAVE_INLINE_ASM */

av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
{
    const int dct_algo = s->avctx->dct_algo;

    if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
#if HAVE_MMX_INLINE
        int cpu_flags = av_get_cpu_flags();
        if (INLINE_MMX(cpu_flags)) {
#if HAVE_6REGS
            s->dct_quantize = dct_quantize_mmx;
#endif
            s->denoise_dct = denoise_dct_mmx;
        }
#endif
#if HAVE_6REGS && HAVE_MMXEXT_INLINE
        if (INLINE_MMXEXT(cpu_flags))
            s->dct_quantize = dct_quantize_mmxext;
#endif
#if HAVE_SSE2_INLINE
        if (INLINE_SSE2(cpu_flags)) {
#if HAVE_6REGS
            s->dct_quantize = dct_quantize_sse2;
#endif
            s->denoise_dct = denoise_dct_sse2;
        }
#endif
#if HAVE_6REGS && HAVE_SSSE3_INLINE
        if (INLINE_SSSE3(cpu_flags))
            s->dct_quantize = dct_quantize_ssse3;
#endif
    }
}
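Both denoise_dct kernels above implement the same per-coefficient update; only the register width differs. A scalar equivalent, offered as a sketch (the offset table is presumably derived from the accumulated sums elsewhere in the encoder; names here are illustrative):

#include <stdint.h>
#include <stdlib.h>

static void denoise_dct_ref(int16_t block[64], int sum[64],
                            const uint16_t offset[64])
{
    for (int i = 0; i < 64; i++) {
        int level = abs(block[i]);
        sum[i] += level;              /* accumulate per-position magnitude */
        level  -= offset[i];
        if (level < 0)
            level = 0;                /* psubusw saturates at zero */
        block[i] = block[i] < 0 ? -level : level;
    }
}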
109
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mpegvideoenc_qns_template.c
vendored
Normal file
@@ -0,0 +1,109 @@
/*
 * QNS functions are compiled 3 times for MMX/3DNOW/SSSE3
 * Copyright (c) 2004 Michael Niedermayer
 *
 * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
 * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"

#include "inline_asm.h"

#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))

static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
{
    x86_reg i=0;

    av_assert2(FFABS(scale) < MAX_ABS);
    scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;

    SET_RND(mm6);
    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "movd %4, %%mm5 \n\t"
        "punpcklwd %%mm5, %%mm5 \n\t"
        "punpcklwd %%mm5, %%mm5 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
        "paddw (%2, %0), %%mm0 \n\t"
        "paddw 8(%2, %0), %%mm1 \n\t"
        "psraw $6, %%mm0 \n\t"
        "psraw $6, %%mm1 \n\t"
        "pmullw (%3, %0), %%mm0 \n\t"
        "pmullw 8(%3, %0), %%mm1 \n\t"
        "pmaddwd %%mm0, %%mm0 \n\t"
        "pmaddwd %%mm1, %%mm1 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "psrld $4, %%mm0 \n\t"
        "paddd %%mm0, %%mm7 \n\t"
        "add $16, %0 \n\t"
        "cmp $128, %0 \n\t" //FIXME optimize & bench
        " jb 1b \n\t"
        PHADDD(%%mm7, %%mm6)
        "psrld $2, %%mm7 \n\t"
        "movd %%mm7, %0 \n\t"

        : "+r" (i)
        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
    );
    return i;
}

static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
{
    x86_reg i=0;

    if(FFABS(scale) < MAX_ABS){
        scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
        SET_RND(mm6);
        __asm__ volatile(
            "movd %3, %%mm5 \n\t"
            "punpcklwd %%mm5, %%mm5 \n\t"
            "punpcklwd %%mm5, %%mm5 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movq (%1, %0), %%mm0 \n\t"
            "movq 8(%1, %0), %%mm1 \n\t"
            PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
            "paddw (%2, %0), %%mm0 \n\t"
            "paddw 8(%2, %0), %%mm1 \n\t"
            "movq %%mm0, (%2, %0) \n\t"
            "movq %%mm1, 8(%2, %0) \n\t"
            "add $16, %0 \n\t"
            "cmp $128, %0 \n\t" // FIXME optimize & bench
            " jb 1b \n\t"

            : "+r" (i)
            : "r"(basis), "r"(rem), "g"(scale)
        );
    }else{
        for(i=0; i<8*8; i++){
            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
        }
    }
}
423
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mpegvideoenc_template.c
vendored
Normal file
@@ -0,0 +1,423 @@
/*
 * MPEG video MMX templates
 *
 * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/mpegutils.h"
#include "libavcodec/mpegvideo.h"
#include "fdct.h"

#undef MMREG_WIDTH
#undef MM
#undef MOVQ
#undef SPREADW
#undef PMAXW
#undef PMAX
#undef SAVE_SIGN
#undef RESTORE_SIGN

#if COMPILE_TEMPLATE_SSE2
#define MMREG_WIDTH "16"
#define MM "%%xmm"
#define MOVQ "movdqa"
#define SPREADW(a) \
    "pshuflw $0, "a", "a" \n\t"\
    "punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
    "movhlps "a", "b" \n\t"\
    PMAXW(b, a)\
    "pshuflw $0x0E, "a", "b" \n\t"\
    PMAXW(b, a)\
    "pshuflw $0x01, "a", "b" \n\t"\
    PMAXW(b, a)
#else
#define MMREG_WIDTH "8"
#define MM "%%mm"
#define MOVQ "movq"
#if COMPILE_TEMPLATE_MMXEXT
#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
    "pshufw $0x0E, "a", "b" \n\t"\
    PMAXW(b, a)\
    "pshufw $0x01, "a", "b" \n\t"\
    PMAXW(b, a)
#else
#define SPREADW(a) \
    "punpcklwd "a", "a" \n\t"\
    "punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) \
    "psubusw "a", "b" \n\t"\
    "paddw "a", "b" \n\t"
#define PMAX(a,b) \
    "movq "a", "b" \n\t"\
    "psrlq $32, "a" \n\t"\
    PMAXW(b, a)\
    "movq "a", "b" \n\t"\
    "psrlq $16, "a" \n\t"\
    PMAXW(b, a)

#endif
#endif

#if COMPILE_TEMPLATE_SSSE3
#define SAVE_SIGN(a,b) \
    "movdqa "b", "a" \n\t"\
    "pabsw "b", "b" \n\t"
#define RESTORE_SIGN(a,b) \
    "psignw "a", "b" \n\t"
#else
#define SAVE_SIGN(a,b) \
    "pxor "a", "a" \n\t"\
    "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
    "pxor "a", "b" \n\t"\
    "psubw "a", "b" \n\t" /* ABS(block[i]) */
#define RESTORE_SIGN(a,b) \
    "pxor "a", "b" \n\t"\
    "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
#endif

static int RENAME(dct_quantize)(MpegEncContext *s,
                                int16_t *block, int n,
                                int qscale, int *overflow)
{
    x86_reg last_non_zero_p1;
    int level=0, q; //=0 is because gcc says uninitialized ...
    const uint16_t *qmat, *bias;
    LOCAL_ALIGNED_16(int16_t, temp_block, [64]);

    av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?

    //s->fdct (block);
    RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...

    if(s->dct_error_sum)
        s->denoise_dct(s, block);

    if (s->mb_intra) {
        int dummy;
        if (n < 4){
            q = s->y_dc_scale;
            bias = s->q_intra_matrix16[qscale][1];
            qmat = s->q_intra_matrix16[qscale][0];
        }else{
            q = s->c_dc_scale;
            bias = s->q_chroma_intra_matrix16[qscale][1];
            qmat = s->q_chroma_intra_matrix16[qscale][0];
        }
        /* note: block[0] is assumed to be positive */
        if (!s->h263_aic) {
            __asm__ volatile (
                "mul %%ecx \n\t"
                : "=d" (level), "=a"(dummy)
                : "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
            );
        } else
            /* For AIC we skip quant/dequant of INTRADC */
            level = (block[0] + 4)>>3;

        block[0]=0; //avoid fake overflow
//        temp_block[0] = (block[0] + (q >> 1)) / q;
        last_non_zero_p1 = 1;
    } else {
        last_non_zero_p1 = 0;
        bias = s->q_inter_matrix16[qscale][1];
        qmat = s->q_inter_matrix16[qscale][0];
    }

    if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){

        __asm__ volatile(
            "movd %%"FF_REG_a", "MM"3 \n\t" // last_non_zero_p1
            SPREADW(MM"3")
            "pxor "MM"7, "MM"7 \n\t" // 0
            "pxor "MM"4, "MM"4 \n\t" // 0
            MOVQ" (%2), "MM"5 \n\t" // qmat[0]
            "pxor "MM"6, "MM"6 \n\t"
            "psubw (%3), "MM"6 \n\t" // -bias[0]
            "mov $-128, %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            MOVQ" (%1, %%"FF_REG_a"), "MM"0 \n\t" // block[i]
            SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
            "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
            "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
            "por "MM"0, "MM"4 \n\t"
            RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
            MOVQ" "MM"0, (%5, %%"FF_REG_a") \n\t"
            "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
            MOVQ" (%4, %%"FF_REG_a"), "MM"1 \n\t"
            MOVQ" "MM"7, (%1, %%"FF_REG_a") \n\t" // 0
            "pandn "MM"1, "MM"0 \n\t"
            PMAXW(MM"0", MM"3")
            "add $"MMREG_WIDTH", %%"FF_REG_a" \n\t"
            " js 1b \n\t"
            PMAX(MM"3", MM"0")
            "movd "MM"3, %%"FF_REG_a" \n\t"
            "movzbl %%al, %%eax \n\t" // last_non_zero_p1
            : "+a" (last_non_zero_p1)
            : "r" (block+64), "r" (qmat), "r" (bias),
              "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
              XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                "%xmm4", "%xmm5", "%xmm6", "%xmm7")
        );
    }else{ // FMT_H263
        __asm__ volatile(
            "movd %%"FF_REG_a", "MM"3 \n\t" // last_non_zero_p1
            SPREADW(MM"3")
            "pxor "MM"7, "MM"7 \n\t" // 0
            "pxor "MM"4, "MM"4 \n\t" // 0
            "mov $-128, %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            MOVQ" (%1, %%"FF_REG_a"), "MM"0 \n\t" // block[i]
            SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
            MOVQ" (%3, %%"FF_REG_a"), "MM"6 \n\t" // bias[0]
            "paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
            MOVQ" (%2, %%"FF_REG_a"), "MM"5 \n\t" // qmat[i]
            "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
            "por "MM"0, "MM"4 \n\t"
            RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
            MOVQ" "MM"0, (%5, %%"FF_REG_a") \n\t"
            "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
            MOVQ" (%4, %%"FF_REG_a"), "MM"1 \n\t"
            MOVQ" "MM"7, (%1, %%"FF_REG_a") \n\t" // 0
            "pandn "MM"1, "MM"0 \n\t"
            PMAXW(MM"0", MM"3")
            "add $"MMREG_WIDTH", %%"FF_REG_a" \n\t"
            " js 1b \n\t"
            PMAX(MM"3", MM"0")
            "movd "MM"3, %%"FF_REG_a" \n\t"
            "movzbl %%al, %%eax \n\t" // last_non_zero_p1
            : "+a" (last_non_zero_p1)
            : "r" (block+64), "r" (qmat+64), "r" (bias+64),
              "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
              XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                "%xmm4", "%xmm5", "%xmm6", "%xmm7")
        );
    }
    __asm__ volatile(
        "movd %1, "MM"1 \n\t" // max_qcoeff
        SPREADW(MM"1")
        "psubusw "MM"1, "MM"4 \n\t"
        "packuswb "MM"4, "MM"4 \n\t"
#if COMPILE_TEMPLATE_SSE2
        "packsswb "MM"4, "MM"4 \n\t"
#endif
        "movd "MM"4, %0 \n\t" // *overflow
        : "=g" (*overflow)
        : "g" (s->max_qcoeff)
    );

    if(s->mb_intra) block[0]= level;
    else            block[0]= temp_block[0];

    if (s->idsp.perm_type == FF_IDCT_PERM_SIMPLE) {
        if(last_non_zero_p1 <= 1) goto end;
        block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
        block[0x20] = temp_block[0x10];
        if(last_non_zero_p1 <= 4) goto end;
        block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
        block[0x09] = temp_block[0x03];
        if(last_non_zero_p1 <= 7) goto end;
        block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
        block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
        if(last_non_zero_p1 <= 11) goto end;
        block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
        block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
        block[0x0C] = temp_block[0x05];
        if(last_non_zero_p1 <= 16) goto end;
        block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
        block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
        block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
        block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
        if(last_non_zero_p1 <= 24) goto end;
        block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
        block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
        block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
        block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
        if(last_non_zero_p1 <= 32) goto end;
        block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
        block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
        block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
        block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
        if(last_non_zero_p1 <= 40) goto end;
        block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
        block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
        block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
        block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
        if(last_non_zero_p1 <= 48) goto end;
        block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
        block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
        block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
        block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
        if(last_non_zero_p1 <= 56) goto end;
        block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
        block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
        block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
        block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
    }else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
        if(last_non_zero_p1 <= 1) goto end;
        block[0x04] = temp_block[0x01];
        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
        if(last_non_zero_p1 <= 4) goto end;
        block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
        block[0x05] = temp_block[0x03];
        if(last_non_zero_p1 <= 7) goto end;
        block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
        if(last_non_zero_p1 <= 11) goto end;
        block[0x1C] = temp_block[0x19];
        block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
        block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
        if(last_non_zero_p1 <= 16) goto end;
        block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
        block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
        block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
        if(last_non_zero_p1 <= 24) goto end;
        block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
        block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
        block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
        block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
        if(last_non_zero_p1 <= 32) goto end;
        block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
        block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
        block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
        block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
        if(last_non_zero_p1 <= 40) goto end;
        block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
        block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
        block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
        if(last_non_zero_p1 <= 48) goto end;
        block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
        block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
        block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
        block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
        if(last_non_zero_p1 <= 56) goto end;
        block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
        block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
        block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
    } else if (s->idsp.perm_type == FF_IDCT_PERM_NONE) {
        if(last_non_zero_p1 <= 1) goto end;
        block[0x01] = temp_block[0x01];
        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
        if(last_non_zero_p1 <= 4) goto end;
        block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
        block[0x03] = temp_block[0x03];
        if(last_non_zero_p1 <= 7) goto end;
        block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
        if(last_non_zero_p1 <= 11) goto end;
        block[0x19] = temp_block[0x19];
        block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
        block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
        if(last_non_zero_p1 <= 16) goto end;
        block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
        block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
        block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
        if(last_non_zero_p1 <= 24) goto end;
        block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
        block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
        block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
        block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
        if(last_non_zero_p1 <= 32) goto end;
        block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
        block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
        block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
        block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
        if(last_non_zero_p1 <= 40) goto end;
        block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
        block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
        block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
        if(last_non_zero_p1 <= 48) goto end;
        block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
        block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
        block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
        block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
        if(last_non_zero_p1 <= 56) goto end;
        block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
        block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
        block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
    } else if (s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE) {
        if(last_non_zero_p1 <= 1) goto end;
        block[0x08] = temp_block[0x01];
        block[0x01] = temp_block[0x08]; block[0x02] = temp_block[0x10];
        if(last_non_zero_p1 <= 4) goto end;
        block[0x09] = temp_block[0x09]; block[0x10] = temp_block[0x02];
        block[0x18] = temp_block[0x03];
        if(last_non_zero_p1 <= 7) goto end;
        block[0x11] = temp_block[0x0A]; block[0x0A] = temp_block[0x11];
        block[0x03] = temp_block[0x18]; block[0x04] = temp_block[0x20];
        if(last_non_zero_p1 <= 11) goto end;
        block[0x0B] = temp_block[0x19];
        block[0x12] = temp_block[0x12]; block[0x19] = temp_block[0x0B];
        block[0x20] = temp_block[0x04]; block[0x28] = temp_block[0x05];
        if(last_non_zero_p1 <= 16) goto end;
        block[0x21] = temp_block[0x0C]; block[0x1A] = temp_block[0x13];
        block[0x13] = temp_block[0x1A]; block[0x0C] = temp_block[0x21];
        block[0x05] = temp_block[0x28]; block[0x06] = temp_block[0x30];
        block[0x0D] = temp_block[0x29]; block[0x14] = temp_block[0x22];
        if(last_non_zero_p1 <= 24) goto end;
        block[0x1B] = temp_block[0x1B]; block[0x22] = temp_block[0x14];
        block[0x29] = temp_block[0x0D]; block[0x30] = temp_block[0x06];
        block[0x38] = temp_block[0x07]; block[0x31] = temp_block[0x0E];
        block[0x2A] = temp_block[0x15]; block[0x23] = temp_block[0x1C];
        if(last_non_zero_p1 <= 32) goto end;
        block[0x1C] = temp_block[0x23]; block[0x15] = temp_block[0x2A];
        block[0x0E] = temp_block[0x31]; block[0x07] = temp_block[0x38];
        block[0x0F] = temp_block[0x39]; block[0x16] = temp_block[0x32];
        block[0x1D] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
        if(last_non_zero_p1 <= 40) goto end;
        block[0x2B] = temp_block[0x1D]; block[0x32] = temp_block[0x16];
        block[0x39] = temp_block[0x0F]; block[0x3A] = temp_block[0x17];
        block[0x33] = temp_block[0x1E]; block[0x2C] = temp_block[0x25];
        block[0x25] = temp_block[0x2C]; block[0x1E] = temp_block[0x33];
        if(last_non_zero_p1 <= 48) goto end;
        block[0x17] = temp_block[0x3A]; block[0x1F] = temp_block[0x3B];
        block[0x26] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
        block[0x34] = temp_block[0x26]; block[0x3B] = temp_block[0x1F];
        block[0x3C] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
        if(last_non_zero_p1 <= 56) goto end;
        block[0x2E] = temp_block[0x35]; block[0x27] = temp_block[0x3C];
        block[0x2F] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
        block[0x3D] = temp_block[0x2F]; block[0x3E] = temp_block[0x37];
        block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
    } else {
        av_log(s, AV_LOG_DEBUG, "s->idsp.perm_type: %d\n",
               (int)s->idsp.perm_type);
        av_assert0(s->idsp.perm_type == FF_IDCT_PERM_NONE ||
                   s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2 ||
                   s->idsp.perm_type == FF_IDCT_PERM_SIMPLE ||
                   s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE);
    }
end:
    return last_non_zero_p1 - 1;
}
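The non-SSSE3 SAVE_SIGN/RESTORE_SIGN macros above lean on a branchless identity: with m = (x < 0 ? -1 : 0), (x ^ m) - m equals |x|, and running the same transform again restores the sign. A scalar demonstration with hypothetical helper names (sketch only):

#include <stdint.h>

static int16_t save_sign_ref(int16_t x, int16_t *mask)
{
    *mask = x < 0 ? -1 : 0;        /* what pcmpgtw computes per lane */
    return (x ^ *mask) - *mask;    /* |x|, except for INT16_MIN */
}

static int16_t restore_sign_ref(int16_t v, int16_t mask)
{
    return (v ^ mask) - mask;      /* negates again iff mask == -1 */
}

The SSSE3 path folds the same two steps into single pabsw/psignw instructions.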
272
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/mpegvideoencdsp_init.c
vendored
Normal file
@@ -0,0 +1,272 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"

int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size);

#if HAVE_INLINE_ASM

#define PHADDD(a, t) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddd " #t ", " #a " \n\t"

/*
 * pmulhw:   dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
 * pmulhrw:  dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
 * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
 */
#define PMULHRW(x, y, s, o) \
    "pmulhw " #s ", " #x " \n\t" \
    "pmulhw " #s ", " #y " \n\t" \
    "paddw " #o ", " #x " \n\t" \
    "paddw " #o ", " #y " \n\t" \
    "psraw $1, " #x " \n\t" \
    "psraw $1, " #y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o) \
    "pmulhrw " #s ", " #x " \n\t" \
    "pmulhrw " #s ", " #y " \n\t"

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1

#define PHADDD(a, t) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    /* faster than phaddd on core2 */ \
    "paddd " #t ", " #a " \n\t"

#define PMULHRW(x, y, s, o) \
    "pmulhrsw " #s ", " #x " \n\t" \
    "pmulhrsw " #s ", " #y " \n\t"

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */

/* Draw the edges of width 'w' of an image of size width, height
 * this MMX version can only handle w == 8 || w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
              "r" (ptr + wrap * height));
    } else if (w == 16) {
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        av_assert1(w == 4);
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "movd %%mm0, -4(%0) \n\t"
            "movd -4(%0, %2), %%mm1 \n\t"
            "punpcklbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
              "r" (ptr + wrap * height));
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r" (ptr)
                : "r" ((x86_reg) buf - (x86_reg) ptr - w),
                  "r" ((x86_reg) - wrap), "r" ((x86_reg) - wrap * 3),
                  "r" (ptr + width + 2 * w));
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r" (ptr)
                : "r" ((x86_reg) last_line - (x86_reg) ptr - w),
                  "r" ((x86_reg) wrap), "r" ((x86_reg) wrap * 3),
                  "r" (ptr + width + 2 * w));
        }
    }
}

#endif /* HAVE_INLINE_ASM */

av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                         AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_MMX(cpu_flags)) {
        c->pix_sum   = ff_pix_sum16_mmx;
        c->pix_norm1 = ff_pix_norm1_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->pix_sum = ff_pix_sum16_mmxext;
    }
#endif

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->pix_sum   = ff_pix_sum16_sse2;
        c->pix_norm1 = ff_pix_norm1_sse2;
    }

    if (EXTERNAL_XOP(cpu_flags)) {
        c->pix_sum = ff_pix_sum16_xop;
    }

#if HAVE_INLINE_ASM

    if (INLINE_MMX(cpu_flags)) {
        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_mmx;
        }
        c->add_8x8basis = add_8x8basis_mmx;

        if (avctx->bits_per_raw_sample <= 8) {
            c->draw_edges = draw_edges_mmx;
        }
    }

    if (INLINE_AMD3DNOW(cpu_flags)) {
        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_3dnow;
        }
        c->add_8x8basis = add_8x8basis_3dnow;
    }

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_ssse3;
        }
        c->add_8x8basis = add_8x8basis_ssse3;
    }
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_INLINE_ASM */
}
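The comment block inside the file above documents three multiply-high variants; in scalar terms they differ only in rounding. A sketch of those semantics (illustration, not vendored code):

#include <stdint.h>

static int16_t pmulhw_ref(int16_t a, int16_t b)   { return ((int32_t)a * b) >> 16; }
static int16_t pmulhrw_ref(int16_t a, int16_t b)  { return ((int32_t)a * b + 0x8000) >> 16; }
static int16_t pmulhrsw_ref(int16_t a, int16_t b) { return ((int32_t)a * b + 0x4000) >> 15; }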
35
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/opusdsp_init.c
vendored
Normal file
@@ -0,0 +1,35 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/x86/cpu.h"
#include "libavcodec/opusdsp.h"

void ff_opus_postfilter_fma3(float *data, int period, float *gains, int len);
float ff_opus_deemphasis_fma3(float *out, float *in, float coeff, int len);

av_cold void ff_opus_dsp_init_x86(OpusDSP *ctx)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_FMA3_FAST(cpu_flags)) {
        ctx->postfilter = ff_opus_postfilter_fma3;
        ctx->deemphasis = ff_opus_deemphasis_fma3;
    }
}
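ff_opus_deemphasis_fma3 above vectorizes a one-pole IIR filter. A plausible scalar model of what the portable fallback computes (a sketch: the emphasis constant and the convention that the coeff argument carries the running filter state are assumptions, not taken from this file):

#define CELT_EMPH_COEFF 0.85f              /* assumed CELT emphasis constant */

static float deemphasis_ref(float *y, const float *x, float state, int len)
{
    for (int i = 0; i < len; i++)
        state = y[i] = x[i] + state * CELT_EMPH_COEFF;
    return state;                          /* carried into the next block */
}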
52
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/pixblockdsp_init.c
vendored
Normal file
@@ -0,0 +1,52 @@
/*
 * SIMD-optimized pixel operations
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pixblockdsp.h"

void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        ptrdiff_t stride);
void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         ptrdiff_t stride);

av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
                                     AVCodecContext *avctx,
                                     unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels_unaligned =
        c->diff_pixels = ff_diff_pixels_mmx;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_sse2;
        c->diff_pixels_unaligned =
        c->diff_pixels = ff_diff_pixels_sse2;
    }
}
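The assembly entry points declared above have simple scalar semantics: widen an 8x8 block of bytes to int16, or take the signed difference of two blocks. A reference sketch with illustrative names (not the vendored implementations):

#include <stddef.h>
#include <stdint.h>

static void get_pixels_ref(int16_t *block, const uint8_t *pixels,
                           ptrdiff_t stride)
{
    for (int i = 0; i < 8; i++, pixels += stride, block += 8)
        for (int j = 0; j < 8; j++)
            block[j] = pixels[j];          /* zero-extend u8 -> s16 */
}

static void diff_pixels_ref(int16_t *block, const uint8_t *s1,
                            const uint8_t *s2, ptrdiff_t stride)
{
    for (int i = 0; i < 8; i++, s1 += stride, s2 += stride, block += 8)
        for (int j = 0; j < 8; j++)
            block[j] = s1[j] - s2[j];      /* signed residual */
}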
50
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/pngdsp_init.c
vendored
Normal file
@@ -0,0 +1,50 @@
/*
 * x86 PNG optimizations.
 * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pngdsp.h"

void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
                                        uint8_t *top, int w, int bpp);
void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
                                       uint8_t *top, int w, int bpp);
void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
                          uint8_t *src2, int w);
void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1,
                          uint8_t *src2, int w);

av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_MMX(cpu_flags))
        dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
#endif
    if (EXTERNAL_MMXEXT(cpu_flags))
        dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
    if (EXTERNAL_SSE2(cpu_flags))
        dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
    if (EXTERNAL_SSSE3(cpu_flags))
        dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
}
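The Paeth prediction the declared routines accelerate is the standard PNG filter: predict left + up - upleft, then choose whichever neighbour is closest to that prediction. Scalar form per the PNG specification (sketch only):

#include <stdlib.h>

static int paeth_predict_ref(int left, int up, int upleft)
{
    int p  = left + up - upleft;
    int pa = abs(p - left);
    int pb = abs(p - up);
    int pc = abs(p - upleft);
    if (pa <= pb && pa <= pc)
        return left;
    return pb <= pc ? up : upleft;
}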
50
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/proresdsp_init.c
vendored
Normal file
@@ -0,0 +1,50 @@
/*
 * Apple ProRes compatible decoder
 *
 * Copyright (c) 2010-2011 Maxim Poliakovski
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/idctdsp.h"
#include "libavcodec/proresdsp.h"

void ff_prores_idct_put_10_sse2(uint16_t *dst, ptrdiff_t linesize,
                                int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_avx (uint16_t *dst, ptrdiff_t linesize,
                                int16_t *block, const int16_t *qmat);

av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
{
#if ARCH_X86_64
    int cpu_flags = av_get_cpu_flags();

    if (avctx->bits_per_raw_sample == 10){
        if (EXTERNAL_SSE2(cpu_flags)) {
            dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
            dsp->idct_put = ff_prores_idct_put_10_sse2;
        }

        if (EXTERNAL_AVX(cpu_flags)) {
            dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
            dsp->idct_put = ff_prores_idct_put_10_avx;
        }
    }
#endif /* ARCH_X86_64 */
}
544
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/qpeldsp_init.c
vendored
Normal file
|
@ -0,0 +1,544 @@
|
|||
/*
 * quarterpel DSP functions
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pixels.h"
#include "libavcodec/qpeldsp.h"
#include "fpel.h"

void ff_put_pixels8_l2_mmxext(uint8_t *dst,
                              const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst,
                                     const uint8_t *src1, const uint8_t *src2,
                                     int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
                              const uint8_t *src1, const uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst,
                               const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
                               const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst,
                                      const uint8_t *src1, const uint8_t *src2,
                                      int dstStride, int src1Stride, int h);
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride,
                                                 int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,
                                                const uint8_t *src,
                                                int dstStride, int srcStride,
                                                int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                          int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                          int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                         int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
                                         int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
                                                const uint8_t *src,
                                                int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_no_rnd_pixels8_mmxext  ff_put_pixels8_mmx

#if HAVE_X86ASM

#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext  ff_put_pixels8_mmx

#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t temp[8]; \
    uint8_t *const half = (uint8_t *) temp; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
                                                   stride, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
                                        stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
                                                   stride, 8); \
} \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t temp[8]; \
    uint8_t *const half = (uint8_t *) temp; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
                                                   stride, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
                                        stride, 8); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t temp[8]; \
    uint8_t *const half = (uint8_t *) temp; \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
                                                   8, stride); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
                                        stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
                                                   stride, stride); \
} \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t temp[8]; \
    uint8_t *const half = (uint8_t *) temp; \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
                                                   8, stride); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
                                        stride, 8); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t *const halfH  = (uint8_t *) half + 64; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
                                        stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
                                        stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t *const halfH  = (uint8_t *) half + 64; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
                                        stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
                                        stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t *const halfH  = (uint8_t *) half + 64; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
                                        stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
                                        stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t *const halfH  = (uint8_t *) half + 64; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
                                        stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
                                        stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t *const halfH  = (uint8_t *) half + 64; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
                                        stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t *const halfH  = (uint8_t *) half + 64; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
                                        stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t *const halfH = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
                                        8, stride, 9); \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
                                                   stride, 8); \
} \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t *const halfH = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
                                        stride, 9); \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
                                                   stride, 8); \
} \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, \
                                         const uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[9]; \
    uint8_t *const halfH = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
                                                   stride, 8); \
} \
\
static void OPNAME ## qpel16_mc00_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t temp[32]; \
    uint8_t *const half = (uint8_t *) temp; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
                                                    stride, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
                                         stride, 16); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
                                                    stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t temp[32]; \
    uint8_t *const half = (uint8_t *) temp; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
                                                    stride, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
                                         stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t temp[32]; \
    uint8_t *const half = (uint8_t *) temp; \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
                                                    stride); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
                                         stride, 16); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
                                                    stride, stride); \
} \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t temp[32]; \
    uint8_t *const half = (uint8_t *) temp; \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
                                                    stride); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half, \
                                         stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t *const halfH  = (uint8_t *) half + 256; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
                                         stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
                                         stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t *const halfH  = (uint8_t *) half + 256; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
                                         stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
                                         stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t *const halfH  = (uint8_t *) half + 256; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
                                         stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
                                         stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t *const halfH  = (uint8_t *) half + 256; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
                                         stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
                                         stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t *const halfH  = (uint8_t *) half + 256; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
                                         stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t *const halfH  = (uint8_t *) half + 256; \
    uint8_t *const halfHV = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
                                         stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[17 * 2]; \
    uint8_t *const halfH = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
                                         stride, 17); \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
                                                    stride, 16); \
} \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[17 * 2]; \
    uint8_t *const halfH = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
                                         stride, 17); \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
                                                    stride, 16); \
} \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, \
                                          const uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[17 * 2]; \
    uint8_t *const halfH = (uint8_t *) half; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
                                                    stride, 16); \
}

QPEL_OP(put_,        _,        mmxext)
QPEL_OP(avg_,        _,        mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)

#endif /* HAVE_X86ASM */

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
    do { \
        c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (X86_MMXEXT(cpu_flags)) {
#if HAVE_MMXEXT_EXTERNAL
        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

        SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
#endif /* HAVE_MMXEXT_EXTERNAL */
    }
}
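The `QPEL_OP`/`SET_QPEL_FUNCS` pair above works purely by token pasting: `QPEL_OP(put_, _, mmxext)` stamps out one static wrapper per quarter-pel position (`put_qpel8_mc00_mmxext`, `put_qpel8_mc10_mmxext`, and so on), and `SET_QPEL_FUNCS` pastes the same name fragments back together to fill the 16-entry dispatch table. A compilable toy of the same trick, with illustrative names that are not taken from the file:

    #include <stdio.h>

    /* Toy token-pasting pattern: OP(put_, mmxext) defines
     * put_qpel8_mc00_mmxext(), and SET() rebuilds that name to store it. */
    #define OP(OPNAME, MMX)                         \
    static void OPNAME ## qpel8_mc00_ ## MMX(void)  \
    {                                               \
        printf("%s\n", #OPNAME "qpel8_mc00_" #MMX); \
    }

    #define SET(tab, OPNAME, MMX) tab[0] = OPNAME ## qpel8_mc00_ ## MMX

    OP(put_, mmxext)

    int main(void)
    {
        void (*tab[1])(void);
        SET(tab, put_, mmxext);
        tab[0](); /* prints put_qpel8_mc00_mmxext */
        return 0;
    }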
175
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/rnd_template.c
vendored
Normal file
@@ -0,0 +1,175 @@
/*
 * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "inline_asm.h"

// put_pixels
av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
                                            ptrdiff_t line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "movq      (%1), %%mm0              \n\t"
        "movq     1(%1), %%mm4              \n\t"
        "movq     %%mm0, %%mm1              \n\t"
        "movq     %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm0             \n\t"
        "punpcklbw %%mm7, %%mm4             \n\t"
        "punpckhbw %%mm7, %%mm1             \n\t"
        "punpckhbw %%mm7, %%mm5             \n\t"
        "paddusw  %%mm0, %%mm4              \n\t"
        "paddusw  %%mm1, %%mm5              \n\t"
        "xor      %%"FF_REG_a", %%"FF_REG_a" \n\t"
        "add      %3, %1                    \n\t"
        ".p2align 3                         \n\t"
        "1:                                 \n\t"
        "movq     (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq    1(%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq     %%mm0, %%mm1              \n\t"
        "movq     %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0             \n\t"
        "punpcklbw %%mm7, %%mm2             \n\t"
        "punpckhbw %%mm7, %%mm1             \n\t"
        "punpckhbw %%mm7, %%mm3             \n\t"
        "paddusw  %%mm2, %%mm0              \n\t"
        "paddusw  %%mm3, %%mm1              \n\t"
        "paddusw  %%mm6, %%mm4              \n\t"
        "paddusw  %%mm6, %%mm5              \n\t"
        "paddusw  %%mm0, %%mm4              \n\t"
        "paddusw  %%mm1, %%mm5              \n\t"
        "psrlw    $2, %%mm4                 \n\t"
        "psrlw    $2, %%mm5                 \n\t"
        "packuswb %%mm5, %%mm4              \n\t"
        "movq     %%mm4, (%2, %%"FF_REG_a") \n\t"
        "add      %3, %%"FF_REG_a"          \n\t"

        "movq     (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2   1 <-> 3
        "movq    1(%1, %%"FF_REG_a"), %%mm4 \n\t"
        "movq     %%mm2, %%mm3              \n\t"
        "movq     %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2             \n\t"
        "punpcklbw %%mm7, %%mm4             \n\t"
        "punpckhbw %%mm7, %%mm3             \n\t"
        "punpckhbw %%mm7, %%mm5             \n\t"
        "paddusw  %%mm2, %%mm4              \n\t"
        "paddusw  %%mm3, %%mm5              \n\t"
        "paddusw  %%mm6, %%mm0              \n\t"
        "paddusw  %%mm6, %%mm1              \n\t"
        "paddusw  %%mm4, %%mm0              \n\t"
        "paddusw  %%mm5, %%mm1              \n\t"
        "psrlw    $2, %%mm0                 \n\t"
        "psrlw    $2, %%mm1                 \n\t"
        "packuswb %%mm1, %%mm0              \n\t"
        "movq     %%mm0, (%2, %%"FF_REG_a") \n\t"
        "add      %3, %%"FF_REG_a"          \n\t"

        "subl     $2, %0                    \n\t"
        "jnz      1b                        \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :FF_REG_a, "memory");
}

// avg_pixels
// this routine is 'slightly' suboptimal but mostly unused
av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
                                            ptrdiff_t line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "movq      (%1), %%mm0              \n\t"
        "movq     1(%1), %%mm4              \n\t"
        "movq     %%mm0, %%mm1              \n\t"
        "movq     %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm0             \n\t"
        "punpcklbw %%mm7, %%mm4             \n\t"
        "punpckhbw %%mm7, %%mm1             \n\t"
        "punpckhbw %%mm7, %%mm5             \n\t"
        "paddusw  %%mm0, %%mm4              \n\t"
        "paddusw  %%mm1, %%mm5              \n\t"
        "xor      %%"FF_REG_a", %%"FF_REG_a" \n\t"
        "add      %3, %1                    \n\t"
        ".p2align 3                         \n\t"
        "1:                                 \n\t"
        "movq     (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq    1(%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq     %%mm0, %%mm1              \n\t"
        "movq     %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0             \n\t"
        "punpcklbw %%mm7, %%mm2             \n\t"
        "punpckhbw %%mm7, %%mm1             \n\t"
        "punpckhbw %%mm7, %%mm3             \n\t"
        "paddusw  %%mm2, %%mm0              \n\t"
        "paddusw  %%mm3, %%mm1              \n\t"
        "paddusw  %%mm6, %%mm4              \n\t"
        "paddusw  %%mm6, %%mm5              \n\t"
        "paddusw  %%mm0, %%mm4              \n\t"
        "paddusw  %%mm1, %%mm5              \n\t"
        "psrlw    $2, %%mm4                 \n\t"
        "psrlw    $2, %%mm5                 \n\t"
        "movq     (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "packuswb %%mm5, %%mm4              \n\t"
        "pcmpeqd  %%mm2, %%mm2              \n\t"
        "paddb    %%mm2, %%mm2              \n\t"
        PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq     %%mm5, (%2, %%"FF_REG_a") \n\t"
        "add      %3, %%"FF_REG_a"          \n\t"

        "movq     (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2   1 <-> 3
        "movq    1(%1, %%"FF_REG_a"), %%mm4 \n\t"
        "movq     %%mm2, %%mm3              \n\t"
        "movq     %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2             \n\t"
        "punpcklbw %%mm7, %%mm4             \n\t"
        "punpckhbw %%mm7, %%mm3             \n\t"
        "punpckhbw %%mm7, %%mm5             \n\t"
        "paddusw  %%mm2, %%mm4              \n\t"
        "paddusw  %%mm3, %%mm5              \n\t"
        "paddusw  %%mm6, %%mm0              \n\t"
        "paddusw  %%mm6, %%mm1              \n\t"
        "paddusw  %%mm4, %%mm0              \n\t"
        "paddusw  %%mm5, %%mm1              \n\t"
        "psrlw    $2, %%mm0                 \n\t"
        "psrlw    $2, %%mm1                 \n\t"
        "movq     (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "packuswb %%mm1, %%mm0              \n\t"
        "pcmpeqd  %%mm2, %%mm2              \n\t"
        "paddb    %%mm2, %%mm2              \n\t"
        PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq     %%mm1, (%2, %%"FF_REG_a") \n\t"
        "add      %3, %%"FF_REG_a"          \n\t"

        "subl     $2, %0                    \n\t"
        "jnz      1b                        \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :FF_REG_a, "memory");
}
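As the header comment says, this template is compiled twice, once for the rounding and once for the non-rounding variants: the including file defines `STATIC`, `DEF`, `SET_RND`, and `PAVGB_MMX` before each inclusion, so the same body yields two differently named function sets. A hedged sketch of that include-twice pattern; the macro spellings below are illustrative, not copied from FFmpeg:

    /* --- avg_template.c: the shared body (sketch) --------------------- */
    static int DEF(avg2)(int a, int b)
    {
        return (a + b + ROUND_BIAS) >> 1;  /* bias differs per variant */
    }

    /* --- consumer.c: compile the template twice with different macros - */
    #define DEF(x)     x ## _rnd
    #define ROUND_BIAS 1
    #include "avg_template.c"  /* defines avg2_rnd(): rounds up on ties */
    #undef  DEF
    #undef  ROUND_BIAS

    #define DEF(x)     x ## _no_rnd
    #define ROUND_BIAS 0
    #include "avg_template.c"  /* defines avg2_no_rnd(): truncates */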
48
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/rv34dsp_init.c
vendored
Normal file
@@ -0,0 +1,48 @@
/*
 * RV30/40 MMX/SSE2 optimizations
 * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/rv34dsp.h"

void ff_rv34_idct_dc_mmxext(int16_t *block);
void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse2(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);

av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
{
    int cpu_flags = av_get_cpu_flags();

    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags))
        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
        c->rv34_idct_add         = ff_rv34_idct_add_mmxext;
    }
    if (EXTERNAL_SSE2(cpu_flags))
        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse2;
    if (EXTERNAL_SSE4(cpu_flags))
        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
}
278
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/rv40dsp_init.c
vendored
Normal file
@@ -0,0 +1,278 @@
/*
 * RV40 decoder motion compensation functions x86-optimised
 * Copyright (c) 2008 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * RV40 decoder motion compensation functions x86-optimised
 * 2,0 and 0,2 have h264 equivalents.
 * 3,3 is bugged in the rv40 format and maps to _xy2 version
 */

#include "libavcodec/rv34dsp.h"
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "hpeldsp.h"

#define DEFINE_FN(op, size, insn) \
static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
                                               ptrdiff_t stride) \
{ \
    ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
}

#if HAVE_X86ASM
void ff_put_rv40_chroma_mc8_mmx  (uint8_t *dst, uint8_t *src,
                                  ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
                                  ptrdiff_t stride, int h, int x, int y);

void ff_put_rv40_chroma_mc4_mmx  (uint8_t *dst, uint8_t *src,
                                  ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                  ptrdiff_t stride, int h, int x, int y);

#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                      int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                      int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                        int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                        int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmxext)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)

/** @{ */
/**
 * Define one qpel function.
 * LOOPSIZE must be already set to the number of pixels processed per
 * iteration in the inner loop of the called functions.
 * COFF(x) must be already defined so as to provide the offset into any
 * array of coeffs used by the called function for the qpel position x.
 */
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
                                                         const uint8_t *src, \
                                                         ptrdiff_t stride) \
{ \
    int i; \
    if (PH && PV) { \
        LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]); \
        uint8_t *tmpptr = tmp + SIZE * 2; \
        src -= stride * 2; \
\
        for (i = 0; i < SIZE; i += LOOPSIZE) \
            ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
                                     SIZE + 5, HCOFF(PH)); \
        for (i = 0; i < SIZE; i += LOOPSIZE) \
            ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
                                         SIZE, SIZE, VCOFF(PV)); \
    } else if (PV) { \
        for (i = 0; i < SIZE; i += LOOPSIZE) \
            ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
                                          stride, SIZE, VCOFF(PV)); \
    } else { \
        for (i = 0; i < SIZE; i += LOOPSIZE) \
            ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
                                          stride, SIZE, HCOFF(PH)); \
    } \
}

/** Declare functions for sizes 8 and 16 and given operations
 * and qpel position. */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
    QPEL_FUNC_DECL(OP,  8, PH, PV, OPT) \
    QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)

/** Declare all functions for all sizes and qpel positions */
#define QPEL_MC_DECL(OP, OPT) \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
                                  const uint8_t *src, \
                                  ptrdiff_t srcStride, \
                                  int len, int m); \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
                                  const uint8_t *src, \
                                  ptrdiff_t srcStride, \
                                  int len, int m); \
QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
/** @} */

#define LOOPSIZE 8
#define HCOFF(x) (32 * ((x) - 1))
#define VCOFF(x) (32 * ((x) - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)

#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 8
#define HCOFF(x) (64 * ((x) - 1))
#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)

#if ARCH_X86_32
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 4
#define HCOFF(x) (64 * ((x) - 1))
#define VCOFF(x) (64 * ((x) - 1))

QPEL_MC_DECL(put_, _mmx)

#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _mmxext)

#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _3dnow)
#endif

/** @{ */
/** Set one function */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
    c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;

/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
    QPEL_FUNC_SET(OP,  8, PH, PV, OPT) \
    QPEL_FUNC_SET(OP, 16, PH, PV, OPT)

/** Set all functions for all sizes and qpel positions */
#define QPEL_MC_SET(OP, OPT) \
QPEL_FUNCS_SET (OP, 0, 1, OPT) \
QPEL_FUNCS_SET (OP, 0, 3, OPT) \
QPEL_FUNCS_SET (OP, 1, 0, OPT) \
QPEL_FUNCS_SET (OP, 1, 1, OPT) \
QPEL_FUNCS_SET (OP, 1, 2, OPT) \
QPEL_FUNCS_SET (OP, 1, 3, OPT) \
QPEL_FUNCS_SET (OP, 2, 1, OPT) \
QPEL_FUNCS_SET (OP, 2, 2, OPT) \
QPEL_FUNCS_SET (OP, 2, 3, OPT) \
QPEL_FUNCS_SET (OP, 3, 0, OPT) \
QPEL_FUNCS_SET (OP, 3, 1, OPT) \
QPEL_FUNCS_SET (OP, 3, 2, OPT)
/** @} */

DEFINE_FN(put, 8, ssse3)

DEFINE_FN(put, 16, sse2)
DEFINE_FN(put, 16, ssse3)

DEFINE_FN(avg, 8, mmxext)
DEFINE_FN(avg, 8, ssse3)

DEFINE_FN(avg, 16, sse2)
DEFINE_FN(avg, 16, ssse3)
#endif /* HAVE_X86ASM */

#if HAVE_MMX_INLINE
DEFINE_FN(put, 8, mmx)
DEFINE_FN(avg, 8, mmx)
DEFINE_FN(put, 16, mmx)
DEFINE_FN(avg, 16, mmx)
#endif

av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
    av_unused int cpu_flags = av_get_cpu_flags();

#if HAVE_MMX_INLINE
    if (INLINE_MMX(cpu_flags)) {
        c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx;
        c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
        c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx;
        c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx;
    }
#endif /* HAVE_MMX_INLINE */

#if HAVE_X86ASM
    if (EXTERNAL_MMX(cpu_flags)) {
        c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
        c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
#if ARCH_X86_32
        QPEL_MC_SET(put_, _mmx)
#endif
    }
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
#if ARCH_X86_32
        QPEL_MC_SET(avg_, _3dnow)
#endif
    }
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_mmxext;
        c->avg_chroma_pixels_tab[0]     = ff_avg_rv40_chroma_mc8_mmxext;
        c->avg_chroma_pixels_tab[1]     = ff_avg_rv40_chroma_mc4_mmxext;
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
#if ARCH_X86_32
        QPEL_MC_SET(avg_, _mmxext)
#endif
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_sse2;
        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_sse2;
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
        QPEL_MC_SET(put_, _sse2)
        QPEL_MC_SET(avg_, _sse2)
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_ssse3;
        c->put_pixels_tab[1][15]        = put_rv40_qpel8_mc33_ssse3;
        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_ssse3;
        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_ssse3;
        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
        QPEL_MC_SET(put_, _ssse3)
        QPEL_MC_SET(avg_, _ssse3)
    }
#endif /* HAVE_X86ASM */
}
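`QPEL_FUNC_SET` above computes the dispatch slot as `[2 - SIZE / 8][4 * PV + PH]`: size 16 maps to row 0 and size 8 to row 1, while the quarter-pel offsets pack into a 4x4 grid, so (PH=3, PV=3) lands in column 15, matching the explicit `pixels_tab[0][15] = ..._mc33_...` assignments in the init function. A tiny standalone check of that arithmetic (helper names are illustrative):

    #include <assert.h>

    /* Verify the slot arithmetic used by QPEL_FUNC_SET above. */
    static int row(int size)       { return 2 - size / 8; } /* 16 -> 0, 8 -> 1 */
    static int col(int ph, int pv) { return 4 * pv + ph; }  /* 4x4 qpel grid   */

    int main(void)
    {
        assert(row(16) == 0 && row(8) == 1);
        assert(col(0, 0) == 0);  /* mc00 */
        assert(col(3, 3) == 15); /* mc33: the slot set explicitly above */
        return 0;
    }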
51
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/sbcdsp_init.c
vendored
Normal file
@@ -0,0 +1,51 @@
/*
 * Bluetooth low-complexity, subband codec (SBC)
 *
 * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
 * Copyright (C) 2008-2010  Nokia Corporation
 * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
 * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * SBC MMX optimization for some basic "building bricks"
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbcdsp.h"

void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8],
                                  uint32_t scale_factor[2][8],
                                  int blocks, int channels, int subbands);

av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
        s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
        s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
    }
}
548
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/sbrdsp.asm
vendored
Normal file
@@ -0,0 +1,548 @@
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask    times 2 dd 1<<31, 0
ps_mask2   times 2 dd 0, 1<<31
ps_mask3   dd  0, 0, 0, 1<<31
ps_noise0  times 2 dd  1.0,  0.0,
ps_noise2  times 2 dd -1.0,  0.0
ps_noise13 dd  0.0,  1.0, 0.0, -1.0
           dd  0.0, -1.0, 0.0,  1.0
           dd  0.0,  1.0, 0.0, -1.0
cextern    sbr_noise_table
cextern    ps_neg

SECTION .text

INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov         r2d, r1d
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m,  m0
    fld         dword r0m
%endif
    RET

%define STEP  40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss      bw, BWm
%endif
    movlps     m2, [alpha1q]
    movlps     m1, [alpha0q]
    shufps     bw, bw, 0
    mulps      m2, bw     ; (a1[0] a1[1])*bw
    mulps      m1, bw     ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps      m2, bw     ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova       m3, m1
    mova       m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov        r2d, Sm
    mov        r3d, Em
    DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
    DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
    movsxd  startq, startd
    movsxd    endq, endd
%endif
    sub     startq, endq            ; neg num of loops
    lea    X_highq, [X_highq + endq*2*4]
    lea     X_lowq, [X_lowq  + endq*2*4 - 2*2*4]
    shl     startq, 3               ; offset from num loops

    mova        m0, [X_lowq + startq]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + startq + 8]       ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301                   ; aAbB
    shufps      m7, m7, q2301                   ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + startq + 16]      ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova  [X_highq + startq], m7
    add     startq, 16
    jnz         .loop2
    RET

cglobal sbr_sum64x5, 1,2,4,z
    lea        r1q, [zq+ 256]
.loop:
    mova        m0, [zq+   0]
    mova        m2, [zq+  16]
    mova        m1, [zq+ 256]
    mova        m3, [zq+ 272]
    addps       m0, [zq+ 512]
    addps       m2, [zq+ 528]
    addps       m1, [zq+ 768]
    addps       m3, [zq+ 784]
    addps       m0, [zq+1024]
    addps       m2, [zq+1040]
    addps       m0, m1
    addps       m2, m3
    mova      [zq], m0
    mova   [zq+16], m2
    add         zq, 32
    cmp         zq, r1q
    jne      .loop
    REP_RET

INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea        r2q, [zq + (64-4)*4]
    mova        m3, [ps_neg]
.loop:
    mova        m1, [zq]
    xorps       m0, m3, [r2q]
    shufps      m0, m0, m0, q0123
    unpcklps    m2, m0, m1
    unpckhps    m0, m0, m1
    mova  [Wq +  0], m2
    mova  [Wq + 16], m0
    add         Wq, 32
    sub        r2q, 16
    add         zq, 16
    cmp         zq, r2q
    jl       .loop
    REP_RET

INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea        r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova   [zq+ 0], m0
    mova   [zq+16], m1
    mova   [zq+32], m2
    mova   [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne      .loop
    REP_RET

; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov         cq, 64*4-2*mmsize
    lea      vrevq, [vq + 64*4]
.loop:
    mova        m0, [src0q+cq]
    mova        m1, [src1q]
    mova        m4, [src0q+cq+mmsize]
    mova        m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd      m2, m0, q0123
    pshufd      m3, m1, q0123
    pshufd      m6, m4, q0123
    pshufd      m7, m5, q0123
%else
    shufps      m2, m0, m0, q0123
    shufps      m3, m1, m1, q0123
    shufps      m6, m4, m4, q0123
    shufps      m7, m5, m5, q0123
%endif
    addps       m5, m2
    subps       m0, m7
    addps       m1, m6
    subps       m4, m3
    mova   [vrevq], m1
    mova   [vrevq+mmsize], m5
    mova   [vq+cq], m0
    mova   [vq+cq+mmsize], m4
    add      src1q, 2*mmsize
    add      vrevq, 2*mmsize
    sub         cq, 2*mmsize
    jge      .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET  (32*4-2*mmsize)
    mov        r3q, OFFSET
    lea        r1q, [zq + (32+1)*4]
    lea        r2q, [zq + 64*4]
    mova        m5, [ps_neg]
.loop:
    movu        m0, [r1q]
    movu        m2, [r1q + mmsize]
    movu        m1, [zq + r3q + 4 + mmsize]
    movu        m3, [zq + r3q + 4]

    pxor        m2, m5
    pxor        m0, m5
    pshufd      m2, m2, q0123
    pshufd      m0, m0, q0123
    SBUTTERFLY  dq, 2, 3, 4
    SBUTTERFLY  dq, 0, 1, 4
    mova  [r2q + 2*r3q + 0*mmsize], m2
    mova  [r2q + 2*r3q + 1*mmsize], m3
    mova  [r2q + 2*r3q + 2*mmsize], m0
    mova  [r2q + 2*r3q + 3*mmsize], m1
    add        r1q, 2*mmsize
    sub        r3q, 2*mmsize
    jge      .loop
    movq        m2, [zq]
    movq     [r2q], m2
    REP_RET

%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

%macro LOAD_NST 1
%ifdef PIC
    lea  NOISE_TABLE, [%1]
    mova          m0, [kxq + NOISE_TABLE]
%else
    mova          m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise0]
    jmp apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and        kxq, 1
    shl        kxq, 4
    LOAD_NST   ps_noise13
    jmp apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise2]
    jmp apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and        kxq, 1
    shl        kxq, 4
    LOAD_NST   ps_noise13+16

apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov        kxd, m_maxm
    DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
    movsxdifnidn noiseq, noised
    dec     noiseq
    shl     countd, 2
%ifdef PIC
    lea NOISE_TABLE, [sbr_noise_table]
%endif
    lea         Yq, [Yq + 2*countq]
    add       s_mq, countq
    add    q_filtq, countq
    shl     noiseq, 3
    pxor        m5, m5
    neg     countq
.loop:
    mova        m1, [q_filtq + countq]
    movu        m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu        m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add     noiseq, 2*mmsize
    and     noiseq, 0x1ff<<3
    punpckhdq   m2, m1, m1
    punpckldq   m1, m1
    mulps       m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps       m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova        m3, [s_mq + countq]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq   m4, m3, m3
    punpckldq   m3, m3
    pcmpeqd     m6, m3, m5 ; m6 == 0
    pcmpeqd     m7, m4, m5 ; m7 == 0
    mulps       m3, m0 ; s_m[m] * phi_sign
    mulps       m4, m0 ; s_m[m] * phi_sign
    pand        m1, m6
    pand        m2, m7
    movu        m6, [Yq + 2*countq]
    movu        m7, [Yq + 2*countq + mmsize]
    addps       m3, m1
    addps       m4, m2
    addps       m6, m3
    addps       m7, m4
    movu  [Yq + 2*countq], m6
    movu  [Yq + 2*countq + mmsize], m7
    add     countq, mmsize
    jl .loop
    RET

INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mov         cq, -COUNT
    lea      vrevq, [vq + OFFSET + COUNT]
    add         vq, OFFSET-mmsize
    add       srcq, 2*COUNT
    mova        m3, [ps_neg]
.loop:
    mova        m0, [srcq + 2*cq + 0*mmsize]
    mova        m1, [srcq + 2*cq + 1*mmsize]
    shufps      m2, m0, m1, q2020
    shufps      m1, m0, q1313
    xorps       m2, m3
    mova      [vq], m1
    mova  [vrevq + cq], m2
    sub         vq, mmsize
    add         cq, mmsize
    jl       .loop
    REP_RET

%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov       cntq, 37*8
    add         xq, cntq
    neg       cntq

%if cpuflag(sse3)
%define   MOVH  movsd
    movddup     m5, [xq+cntq]
%else
%define   MOVH  movlps
    movlps      m5, [xq+cntq]
    movlhps     m5, m5
%endif
    MOVH        m7, [xq+cntq+8 ]
    MOVH        m1, [xq+cntq+16]
    shufps      m7, m7, q0110
    shufps      m1, m1, q0110
    mulps       m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps       m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
    mulps       m5, m1     ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps [rsp   ], m3
    movaps [rsp+16], m4
    add       cntq, 8

    MOVH        m2, [xq+cntq+16]
    movlhps     m7, m7
    shufps      m2, m2, q0110
    mulps       m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps       m4, m7, m2
    mulps       m7, m7     ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
    addps       m5, m4     ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]

align 16
.loop:
    add       cntq, 8
    MOVH        m0, [xq+cntq+16]
    movlhps     m1, m1
    shufps      m0, m0, q0110
    mulps       m3, m1, m2
    mulps       m4, m1, m0
    mulps       m1, m1
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps       m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    add       cntq, 8
    MOVH        m1, [xq+cntq+16]
    movlhps     m2, m2
    shufps      m1, m1, q0110
    mulps       m3, m2, m0
    mulps       m4, m2, m1
    mulps       m2, m2
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps       m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    add       cntq, 8
    MOVH        m2, [xq+cntq+16]
    movlhps     m0, m0
    shufps      m2, m2, q0110
    mulps       m3, m0, m1
    mulps       m4, m0, m2
    mulps       m0, m0
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps       m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    jl .loop

    movlhps     m1, m1
    mulps       m2, m1
    mulps       m1, m1
    addps       m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
    addps       m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
    addps       m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
    addps       m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];

    xorps       m2, [ps_mask3]
    xorps       m5, [ps_mask3]
    xorps       m6, [ps_mask3]
    HADDPS      m2, m5, m3
    HADDPS      m7, m6, m4
%if cpuflag(sse3)
    movshdup    m0, m1
%else
    movss       m0, m1
    shufps      m1, m1, q0001
%endif
    addss       m1, m0
    movaps  [phiq     ], m2
    movhps  [phiq+0x18], m7
    movss   [phiq+0x28], m7
    movss   [phiq+0x10], m1
    RET
%endmacro

INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE
87
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/sbrdsp_init.c
vendored
Normal file
@@ -0,0 +1,87 @@
/*
 * AAC Spectral Band Replication decoding functions
 * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbrdsp.h"

float ff_sbr_sum_square_sse(float (*x)[2], int n);
void ff_sbr_sum64x5_sse(float *z);
void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
                          const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
                       const float alpha0[2], const float alpha1[2],
                       float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_pre_shuffle_sse2(float *z);

void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);
void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
                                  const float *q_filt, int noise,
                                  int kx, int m_max);

void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);

void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);

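/* Function pointers are assigned in ascending ISA order: when the CPU also
 * supports a later (wider) instruction set, that block overrides the earlier
 * assignment. */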
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags)) {
        s->neg_odd_64 = ff_sbr_neg_odd_64_sse;
        s->sum_square = ff_sbr_sum_square_sse;
        s->sum64x5 = ff_sbr_sum64x5_sse;
        s->hf_g_filt = ff_sbr_hf_g_filt_sse;
        s->hf_gen = ff_sbr_hf_gen_sse;
        s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
        s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
        s->qmf_deint_neg = ff_sbr_qmf_deint_neg_sse;
        s->autocorrelate = ff_sbr_autocorrelate_sse;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
        s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
    }

    if (EXTERNAL_SSE3(cpu_flags)) {
        s->autocorrelate = ff_sbr_autocorrelate_sse3;
    }
}
53
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/simple_idct.h
vendored
Normal file
@@ -0,0 +1,53 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_SIMPLE_IDCT_H
#define AVCODEC_X86_SIMPLE_IDCT_H

#include <stddef.h>
#include <stdint.h>

void ff_simple_idct_mmx(int16_t *block);
void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);

void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);

void ff_simple_idct8_sse2(int16_t *block);
void ff_simple_idct8_avx(int16_t *block);

void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);

void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);

void ff_simple_idct10_sse2(int16_t *block);
void ff_simple_idct10_avx(int16_t *block);

void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);

void ff_simple_idct12_sse2(int16_t *block);
void ff_simple_idct12_avx(int16_t *block);

void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);

#endif /* AVCODEC_X86_SIMPLE_IDCT_H */
908
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/snowdsp.c
vendored
Normal file
@@ -0,0 +1,908 @@
/*
 * MMX and SSE2 optimized snow DSP utils
 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/snow.h"
#include "libavcodec/snow_dwt.h"

#if HAVE_INLINE_ASM

static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;
        IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
        // The savings in code and time are well worth having to store this value and
        // calculate b[0] correctly afterwards.

        i = 0;
        __asm__ volatile(
            "pcmpeqd %%xmm7, %%xmm7 \n\t"
            "pcmpeqd %%xmm3, %%xmm3 \n\t"
            "psllw $1, %%xmm3 \n\t"
            "paddw %%xmm7, %%xmm3 \n\t"
            "psllw $13, %%xmm3 \n\t"
            ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw %%xmm1, %%xmm2 \n\t"
                "paddw %%xmm5, %%xmm6 \n\t"
                "paddw %%xmm7, %%xmm2 \n\t"
                "paddw %%xmm7, %%xmm6 \n\t"
                "pmulhw %%xmm3, %%xmm2 \n\t"
                "pmulhw %%xmm3, %%xmm6 \n\t"
                "paddw (%0), %%xmm2 \n\t"
                "paddw 16(%0), %%xmm6 \n\t"
                "movdqa %%xmm2, (%0) \n\t"
                "movdqa %%xmm6, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw %%xmm1, %%xmm2 \n\t"
                "paddw %%xmm5, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubw %%xmm2, %%xmm0 \n\t"
                "psubw %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;
        IDWTELEM b_0 = b[0];

        i = 0;
        __asm__ volatile(
            "psllw $15, %%xmm7 \n\t"
            "pcmpeqw %%xmm6, %%xmm6 \n\t"
            "psrlw $13, %%xmm6 \n\t"
            "paddw %%xmm7, %%xmm6 \n\t"
            ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm0 \n\t"
                "movdqu 16(%1), %%xmm4 \n\t"
                "movdqu 2(%1), %%xmm1 \n\t"
                "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm4 \n\t"
                "paddw %%xmm7, %%xmm1 \n\t"
                "paddw %%xmm7, %%xmm5 \n\t"
                "pavgw %%xmm1, %%xmm0 \n\t"
                "pavgw %%xmm5, %%xmm4 \n\t"
                "psubw %%xmm7, %%xmm0 \n\t"
                "psubw %%xmm7, %%xmm4 \n\t"
                "psraw $1, %%xmm0 \n\t"
                "psraw $1, %%xmm4 \n\t"
                "movdqa (%0), %%xmm1 \n\t"
                "movdqa 16(%0), %%xmm5 \n\t"
                "paddw %%xmm1, %%xmm0 \n\t"
                "paddw %%xmm5, %%xmm4 \n\t"
                "psraw $2, %%xmm0 \n\t"
                "psraw $2, %%xmm4 \n\t"
                "paddw %%xmm1, %%xmm0 \n\t"
                "paddw %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;

        i = 0;
        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw (%1), %%xmm2 \n\t"
                "paddw 16(%1), %%xmm6 \n\t"
                "movdqu (%0), %%xmm0 \n\t"
                "movdqu 16(%0), %%xmm4 \n\t"
                "paddw %%xmm2, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm4 \n\t"
                "psraw $1, %%xmm2 \n\t"
                "psraw $1, %%xmm6 \n\t"
                "paddw %%xmm0, %%xmm2 \n\t"
                "paddw %%xmm4, %%xmm6 \n\t"
                "movdqa %%xmm2, (%2) \n\t"
                "movdqa %%xmm6, 16(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x3E) != 0x3E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=62; i>=0; i-=64){
            __asm__ volatile(
                "movdqa (%1), %%xmm0 \n\t"
                "movdqa 16(%1), %%xmm2 \n\t"
                "movdqa 32(%1), %%xmm4 \n\t"
                "movdqa 48(%1), %%xmm6 \n\t"
                "movdqa (%1), %%xmm1 \n\t"
                "movdqa 16(%1), %%xmm3 \n\t"
                "movdqa 32(%1), %%xmm5 \n\t"
                "movdqa 48(%1), %%xmm7 \n\t"
                "punpcklwd (%2), %%xmm0 \n\t"
                "punpcklwd 16(%2), %%xmm2 \n\t"
                "punpcklwd 32(%2), %%xmm4 \n\t"
                "punpcklwd 48(%2), %%xmm6 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm2, 32(%0) \n\t"
                "movdqa %%xmm4, 64(%0) \n\t"
                "movdqa %%xmm6, 96(%0) \n\t"
                "punpckhwd (%2), %%xmm1 \n\t"
                "punpckhwd 16(%2), %%xmm3 \n\t"
                "punpckhwd 32(%2), %%xmm5 \n\t"
                "punpckhwd 48(%2), %%xmm7 \n\t"
                "movdqa %%xmm1, 16(%0) \n\t"
                "movdqa %%xmm3, 48(%0) \n\t"
                "movdqa %%xmm5, 80(%0) \n\t"
                "movdqa %%xmm7, 112(%0) \n\t"
                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
                : "memory"
            );
        }
    }
}

static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;

        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "pcmpeqw %%mm3, %%mm3 \n\t"
            "psllw $1, %%mm3 \n\t"
            "paddw %%mm7, %%mm3 \n\t"
            "psllw $13, %%mm3 \n\t"
            ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddw 2(%1), %%mm2 \n\t"
                "paddw 10(%1), %%mm6 \n\t"
                "paddw %%mm7, %%mm2 \n\t"
                "paddw %%mm7, %%mm6 \n\t"
                "pmulhw %%mm3, %%mm2 \n\t"
                "pmulhw %%mm3, %%mm6 \n\t"
                "paddw (%0), %%mm2 \n\t"
                "paddw 8(%0), %%mm6 \n\t"
                "movq %%mm2, (%0) \n\t"
                "movq %%mm6, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddw 2(%1), %%mm2 \n\t"
                "paddw 10(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubw %%mm2, %%mm0 \n\t"
                "psubw %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;

        i = 1;
        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
        __asm__ volatile(
            "psllw $15, %%mm7 \n\t"
            "pcmpeqw %%mm6, %%mm6 \n\t"
            "psrlw $13, %%mm6 \n\t"
            "paddw %%mm7, %%mm6 \n\t"
            ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm4 \n\t"
                "movq 2(%1), %%mm1 \n\t"
                "movq 10(%1), %%mm5 \n\t"
                "paddw %%mm6, %%mm0 \n\t"
                "paddw %%mm6, %%mm4 \n\t"
                "paddw %%mm7, %%mm1 \n\t"
                "paddw %%mm7, %%mm5 \n\t"
                "pavgw %%mm1, %%mm0 \n\t"
                "pavgw %%mm5, %%mm4 \n\t"
                "psubw %%mm7, %%mm0 \n\t"
                "psubw %%mm7, %%mm4 \n\t"
                "psraw $1, %%mm0 \n\t"
                "psraw $1, %%mm4 \n\t"
                "movq (%0), %%mm1 \n\t"
                "movq 8(%0), %%mm5 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm5, %%mm4 \n\t"
                "psraw $2, %%mm0 \n\t"
                "psraw $2, %%mm4 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm5, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;
        i = 0;

        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq 2(%1), %%mm2 \n\t"
                "movq 10(%1), %%mm6 \n\t"
                "paddw (%1), %%mm2 \n\t"
                "paddw 8(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "paddw %%mm2, %%mm0 \n\t"
                "paddw %%mm6, %%mm4 \n\t"
                "psraw $1, %%mm2 \n\t"
                "psraw $1, %%mm6 \n\t"
                "paddw %%mm0, %%mm2 \n\t"
                "paddw %%mm4, %%mm6 \n\t"
                "movq %%mm2, (%2) \n\t"
                "movq %%mm6, 8(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x1E) != 0x1E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=30; i>=0; i-=32){
            __asm__ volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm2 \n\t"
                "movq 16(%1), %%mm4 \n\t"
                "movq 24(%1), %%mm6 \n\t"
                "movq (%1), %%mm1 \n\t"
                "movq 8(%1), %%mm3 \n\t"
                "movq 16(%1), %%mm5 \n\t"
                "movq 24(%1), %%mm7 \n\t"
                "punpcklwd (%2), %%mm0 \n\t"
                "punpcklwd 8(%2), %%mm2 \n\t"
                "punpcklwd 16(%2), %%mm4 \n\t"
                "punpcklwd 24(%2), %%mm6 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm2, 16(%0) \n\t"
                "movq %%mm4, 32(%0) \n\t"
                "movq %%mm6, 48(%0) \n\t"
                "punpckhwd (%2), %%mm1 \n\t"
                "punpckhwd 8(%2), %%mm3 \n\t"
                "punpckhwd 16(%2), %%mm5 \n\t"
                "punpckhwd 24(%2), %%mm7 \n\t"
                "movq %%mm1, 8(%0) \n\t"
                "movq %%mm3, 24(%0) \n\t"
                "movq %%mm5, 40(%0) \n\t"
                "movq %%mm7, 56(%0) \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                : "memory"
            );
        }
    }
}

#if HAVE_7REGS
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
    ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
    ""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
    ""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
    ""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
    "psubw %%"s0", %%"t0" \n\t"\
    "psubw %%"s1", %%"t1" \n\t"\
    "psubw %%"s2", %%"t2" \n\t"\
    "psubw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
    "movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\
    "movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
    "movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
    "movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"

#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
    "psraw $"n", %%"t0" \n\t"\
    "psraw $"n", %%"t1" \n\t"\
    "psraw $"n", %%"t2" \n\t"\
    "psraw $"n", %%"t3" \n\t"

#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
    "paddw %%"s0", %%"t0" \n\t"\
    "paddw %%"s1", %%"t1" \n\t"\
    "paddw %%"s2", %%"t2" \n\t"\
    "paddw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
    "pmulhw %%"s0", %%"t0" \n\t"\
    "pmulhw %%"s1", %%"t1" \n\t"\
    "pmulhw %%"s2", %%"t2" \n\t"\
    "pmulhw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movdqa %%"s0", %%"t0" \n\t"\
    "movdqa %%"s1", %%"t1" \n\t"\
    "movdqa %%"s2", %%"t2" \n\t"\
    "movdqa %%"s3", %%"t3" \n\t"

static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;

    while(i & 0x1F)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    i+=i;

    __asm__ volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqw %%xmm0, %%xmm0 \n\t"
        "pcmpeqw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "psllw $13, %%xmm2 \n\t"
        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqw %%xmm7, %%xmm7 \n\t"
        "pcmpeqw %%xmm5, %%xmm5 \n\t"
        "psllw $15, %%xmm7 \n\t"
        "psrlw $13, %%xmm5 \n\t"
        "paddw %%xmm7, %%xmm5 \n\t"
        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
        "movq (%2,%%"FF_REG_d"), %%xmm1 \n\t"
        "movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm0 \n\t"
        "pavgw %%xmm3, %%xmm2 \n\t"
        "movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
        "movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm4 \n\t"
        "pavgw %%xmm3, %%xmm6 \n\t"
        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")

        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")

        "2: \n\t"
        "sub $64, %%"FF_REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}

#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
    ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
    ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
    ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
    ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
    "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
    "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
    "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
    "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movq %%"s0", %%"t0" \n\t"\
    "movq %%"s1", %%"t1" \n\t"\
    "movq %%"s2", %%"t2" \n\t"\
    "movq %%"s3", %%"t3" \n\t"


static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;
    while(i & 15)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    i+=i;
    __asm__ volatile(
        "jmp 2f \n\t"
        "1: \n\t"

        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
        "pcmpeqw %%mm0, %%mm0 \n\t"
        "pcmpeqw %%mm2, %%mm2 \n\t"
        "paddw %%mm2, %%mm2 \n\t"
        "paddw %%mm0, %%mm2 \n\t"
        "psllw $13, %%mm2 \n\t"
        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "pcmpeqw %%mm5, %%mm5 \n\t"
        "psllw $15, %%mm7 \n\t"
        "psrlw $13, %%mm5 \n\t"
        "paddw %%mm7, %%mm5 \n\t"
        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
        "movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
        "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm0 \n\t"
        "pavgw %%mm3, %%mm2 \n\t"
        "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
        "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm4 \n\t"
        "pavgw %%mm3, %%mm6 \n\t"
        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")

        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")

        "2: \n\t"
        "sub $32, %%"FF_REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#endif //HAVE_7REGS

#if HAVE_6REGS
#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    x86_reg tmp;\
    __asm__ volatile(\
        "mov %7, %%"FF_REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"FF_REG_S" \n\t"\
        "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
        "pcmpeqd %%xmm3, %%xmm3 \n\t"\
        "psllw $15, %%xmm3 \n\t"\
        "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"FF_REG_D" \n\t"\
        "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
        "add %3, %%"FF_REG_D" \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
    "add $32, %%"FF_REG_S" \n\t"\
    "add %%"FF_REG_c", %0 \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :\
    "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
    XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
    "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
    "sal $1, %%"FF_REG_c" \n\t"\
    "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "sar $1, %%"FF_REG_c" \n\t"\
    "sub $2, %2 \n\t"\
    snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
    "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "dec %2 \n\t"\
    snow_inner_add_yblock_sse2_end_common2

static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                                       int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_8("2", "8")
    snow_inner_add_yblock_sse2_accum_8("1", "128")
    snow_inner_add_yblock_sse2_accum_8("0", "136")

    "mov %0, %%"FF_REG_d" \n\t"
    "movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
    "movdqa %%xmm1, %%xmm2 \n\t"

    "punpckhwd %%xmm7, %%xmm1 \n\t"
    "punpcklwd %%xmm7, %%xmm2 \n\t"
    "paddd %%xmm2, %%xmm0 \n\t"
    "movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
    "paddd %%xmm1, %%xmm2 \n\t"
    "paddd %%xmm3, %%xmm0 \n\t"
    "paddd %%xmm3, %%xmm2 \n\t"

    "mov %1, %%"FF_REG_D" \n\t"
    "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
    "add %3, %%"FF_REG_D" \n\t"

    "movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
    "movdqa %%xmm5, %%xmm6 \n\t"
    "punpckhwd %%xmm7, %%xmm5 \n\t"
    "punpcklwd %%xmm7, %%xmm6 \n\t"
    "paddd %%xmm6, %%xmm4 \n\t"
    "movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
    "paddd %%xmm5, %%xmm6 \n\t"
    "paddd %%xmm3, %%xmm4 \n\t"
    "paddd %%xmm3, %%xmm6 \n\t"

    "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm2, %%xmm0 \n\t"
    "packuswb %%xmm7, %%xmm0 \n\t"
    "movq %%xmm0, (%%"FF_REG_d") \n\t"

    "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm6, %%xmm4 \n\t"
    "packuswb %%xmm7, %%xmm4 \n\t"
    "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
    snow_inner_add_yblock_sse2_end_8
}

static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                                int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_16("2", "16")
    snow_inner_add_yblock_sse2_accum_16("1", "512")
    snow_inner_add_yblock_sse2_accum_16("0", "528")

    "mov %0, %%"FF_REG_d" \n\t"
    "psrlw $4, %%xmm1 \n\t"
    "psrlw $4, %%xmm5 \n\t"
    "paddw (%%"FF_REG_D"), %%xmm1 \n\t"
    "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
    "paddw %%xmm3, %%xmm1 \n\t"
    "paddw %%xmm3, %%xmm5 \n\t"
    "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
    "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
    "packuswb %%xmm5, %%xmm1 \n\t"

    "movdqu %%xmm1, (%%"FF_REG_d") \n\t"

    snow_inner_add_yblock_sse2_end_16
}

#define snow_inner_add_yblock_mmx_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    x86_reg tmp;\
    __asm__ volatile(\
        "mov %7, %%"FF_REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"FF_REG_S" \n\t"\
        "pxor %%mm7, %%mm7 \n\t" /* 0 */\
        "pcmpeqd %%mm3, %%mm3 \n\t"\
        "psllw $15, %%mm3 \n\t"\
        "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"FF_REG_D" \n\t"\
        "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
        "add %3, %%"FF_REG_D" \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%mm7, %%"out_reg1" \n\t"\
    "punpcklbw %%mm7, %%"out_reg2" \n\t"\
    "movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
    "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "pmullw %%mm0, %%"out_reg1" \n\t"\
    "pmullw %%mm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
    snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
    "paddusw %%mm2, %%mm1 \n\t"\
    "paddusw %%mm6, %%mm5 \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
    "mov %0, %%"FF_REG_d" \n\t"\
    "psrlw $4, %%mm1 \n\t"\
    "psrlw $4, %%mm5 \n\t"\
    "paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
    "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
    "paddw %%mm3, %%mm1 \n\t"\
    "paddw %%mm3, %%mm5 \n\t"\
    "psraw $4, %%mm1 \n\t"\
    "psraw $4, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm1 \n\t"\
    "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
    "add $"s_step", %%"FF_REG_S" \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
    "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
    "add %%"FF_REG_c", %0 \n\t"\
    "dec %2 \n\t"\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :\
    "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
    "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");

static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                              int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "8", "0")
    snow_inner_add_yblock_mmx_accum("1", "128", "0")
    snow_inner_add_yblock_mmx_accum("0", "136", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")
    snow_inner_add_yblock_mmx_end("16")
}

static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                               int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "16", "0")
    snow_inner_add_yblock_mmx_accum("1", "512", "0")
    snow_inner_add_yblock_mmx_accum("0", "528", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")

    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
    snow_inner_add_yblock_mmx_accum("2", "24", "8")
    snow_inner_add_yblock_mmx_accum("1", "520", "8")
    snow_inner_add_yblock_mmx_accum("0", "536", "8")
    snow_inner_add_yblock_mmx_mix("16", "8")
    snow_inner_add_yblock_mmx_end("32")
}

static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){

    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                         int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
#endif /* HAVE_6REGS */

#endif /* HAVE_INLINE_ASM */

av_cold void ff_dwt_init_x86(SnowDWTContext *c)
{
#if HAVE_INLINE_ASM
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
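        /* The "& 0" below makes this condition always false, so the inline-asm
         * SSE2 compose path is effectively disabled; only the MMX variants can
         * be selected at run time. */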
        if (mm_flags & AV_CPU_FLAG_SSE2 & 0) {
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#if HAVE_7REGS
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
#if HAVE_6REGS
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
#endif
        } else {
            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#if HAVE_7REGS
                c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
            }
#if HAVE_6REGS
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
#endif
        }
    }
#endif /* HAVE_INLINE_ASM */
}
42
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/svq1enc_init.c
vendored
Normal file
@@ -0,0 +1,42 @@
/*
 * Copyright (c) 2007 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/svq1enc.h"

int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
                             intptr_t size);
int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
                              intptr_t size);

av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
    }
}
74
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/synth_filter_init.c
vendored
Normal file
@@ -0,0 +1,74 @@
/*
 * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/synth_filter.h"

#define SYNTH_FILTER_FUNC(opt) \
void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
                                 const float window[512], \
                                 float out[32], intptr_t offset, float scale); \
static void synth_filter_##opt(FFTContext *imdct, \
                               float *synth_buf_ptr, int *synth_buf_offset, \
                               float synth_buf2[32], const float window[512], \
                               float out[32], const float in[32], float scale) \
{ \
    float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
\
    imdct->imdct_half(imdct, synth_buf, in); \
\
    ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
                                out, *synth_buf_offset, scale); \
\
    *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
} \

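/* Each generated wrapper runs the shared half-IMDCT, hands the hot loop to the
 * ISA-specific ff_synth_filter_inner_* kernel, then steps the 512-sample ring
 * buffer offset back by 32. */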
#if HAVE_X86ASM
#if ARCH_X86_32
SYNTH_FILTER_FUNC(sse)
#endif
SYNTH_FILTER_FUNC(sse2)
SYNTH_FILTER_FUNC(avx)
SYNTH_FILTER_FUNC(fma3)
#endif /* HAVE_X86ASM */

av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags)) {
        s->synth_filter_float = synth_filter_sse;
    }
#endif
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->synth_filter_float = synth_filter_sse2;
    }
    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        s->synth_filter_float = synth_filter_avx;
    }
    if (EXTERNAL_FMA3_FAST(cpu_flags)) {
        s->synth_filter_float = synth_filter_fma3;
    }
#endif /* HAVE_X86ASM */
}
45
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/takdsp_init.c
vendored
Normal file
@@ -0,0 +1,45 @@
/*
 * Copyright (c) 2015 Paul B Mahol
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/takdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"

void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);

av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->decorrelate_ls = ff_tak_decorrelate_ls_sse2;
        c->decorrelate_sr = ff_tak_decorrelate_sr_sse2;
        c->decorrelate_sm = ff_tak_decorrelate_sm_sse2;
    }

    if (EXTERNAL_SSE4(cpu_flags)) {
        c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
    }
#endif
}
42
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/ttadsp_init.c
vendored
Normal file
@@ -0,0 +1,42 @@
/*
 * Copyright (c) 2014 James Almer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/ttadsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"

void ff_tta_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
                                 int32_t *error, int32_t *in, int32_t shift,
                                 int32_t round);
void ff_tta_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
                                int32_t *error, int32_t *in, int32_t shift,
                                int32_t round);

av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSSE3(cpu_flags))
        c->filter_process = ff_tta_filter_process_ssse3;
    if (EXTERNAL_SSE4(cpu_flags))
        c->filter_process = ff_tta_filter_process_sse4;
#endif
}
42
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/ttaencdsp_init.c
vendored
Normal file
@@ -0,0 +1,42 @@
/*
 * Copyright (c) 2014-2016 James Almer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/ttaencdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"

void ff_ttaenc_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
                                    int32_t *error, int32_t *in, int32_t shift,
                                    int32_t round);
void ff_ttaenc_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
                                   int32_t *error, int32_t *in, int32_t shift,
                                   int32_t round);

av_cold void ff_ttaencdsp_init_x86(TTAEncDSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSSE3(cpu_flags))
        c->filter_process = ff_ttaenc_filter_process_ssse3;
    if (EXTERNAL_SSE4(cpu_flags))
        c->filter_process = ff_ttaenc_filter_process_sse4;
#endif
}
54
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/utvideodsp_init.c
vendored
Normal file
@@ -0,0 +1,54 @@
/*
 * Copyright (c) 2017 Paul B Mahol
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/utvideodsp.h"

void ff_restore_rgb_planes_sse2(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
                                ptrdiff_t linesize_r, ptrdiff_t linesize_g,
                                ptrdiff_t linesize_b, int width, int height);
void ff_restore_rgb_planes_avx2(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
                                ptrdiff_t linesize_r, ptrdiff_t linesize_g,
                                ptrdiff_t linesize_b, int width, int height);

void ff_restore_rgb_planes10_sse2(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
                                  ptrdiff_t linesize_r, ptrdiff_t linesize_g,
                                  ptrdiff_t linesize_b, int width, int height);
void ff_restore_rgb_planes10_avx2(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
                                  ptrdiff_t linesize_r, ptrdiff_t linesize_g,
                                  ptrdiff_t linesize_b, int width, int height);

av_cold void ff_utvideodsp_init_x86(UTVideoDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->restore_rgb_planes = ff_restore_rgb_planes_sse2;
        c->restore_rgb_planes10 = ff_restore_rgb_planes10_sse2;
    }
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        c->restore_rgb_planes = ff_restore_rgb_planes_avx2;
        c->restore_rgb_planes10 = ff_restore_rgb_planes10_avx2;
    }
}
56
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/v210-init.c
vendored
Normal file
@@ -0,0 +1,56 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavcodec/v210dec.h"

extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);

extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);

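/*
 * Dispatch note: s->aligned_input is set by the decoder from the alignment
 * of the input lines; it selects unpack kernels that can use aligned loads.
 * Within each branch the SSSE3/AVX/AVX2 assignments overwrite one another,
 * so the most capable variant the CPU reports ends up installed.
 */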
av_cold void ff_v210_x86_init(V210DecContext *s)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (s->aligned_input) {
        if (cpu_flags & AV_CPU_FLAG_SSSE3)
            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;

        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;

        if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
            s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
    }
    else {
        if (cpu_flags & AV_CPU_FLAG_SSSE3)
            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;

        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;

        if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
    }
#endif
}
54
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/v210enc_init.c
vendored
Normal file
@@ -0,0 +1,54 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/cpu.h"
#include "libavcodec/v210enc.h"

void ff_v210_planar_pack_8_ssse3(const uint8_t *y, const uint8_t *u,
                                 const uint8_t *v, uint8_t *dst,
                                 ptrdiff_t width);
void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u,
                               const uint8_t *v, uint8_t *dst, ptrdiff_t width);
void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
                                const uint8_t *v, uint8_t *dst, ptrdiff_t width);
void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
                                  const uint16_t *v, uint8_t *dst,
                                  ptrdiff_t width);
void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
                                 const uint16_t *v, uint8_t *dst,
                                 ptrdiff_t width);

av_cold void ff_v210enc_init_x86(V210EncContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->pack_line_8  = ff_v210_planar_pack_8_ssse3;
        s->pack_line_10 = ff_v210_planar_pack_10_ssse3;
    }

    if (EXTERNAL_AVX(cpu_flags))
        s->pack_line_8 = ff_v210_planar_pack_8_avx;

    if (EXTERNAL_AVX2(cpu_flags)) {
        s->sample_factor_8  = 2;
        s->pack_line_8      = ff_v210_planar_pack_8_avx2;
        s->sample_factor_10 = 2;
        s->pack_line_10     = ff_v210_planar_pack_10_avx2;
    }
}
29
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vc1dsp.h
vendored
Normal file
@@ -0,0 +1,29 @@
/*
 * VC-1 and WMV3 decoder - X86 DSP init functions
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_VC1DSP_H
#define AVCODEC_X86_VC1DSP_H

#include "libavcodec/vc1dsp.h"

void ff_vc1dsp_init_mmx(VC1DSPContext *dsp);
void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp);

#endif /* AVCODEC_X86_VC1DSP_H */
168
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vc1dsp_init.c
vendored
Normal file
@@ -0,0 +1,168 @@
/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/vc1dsp.h"
#include "fpel.h"
#include "vc1dsp.h"
#include "config.h"

#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_v_loop_filter8_ ## EXT(src,   stride, pq); \
    ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_h_loop_filter8_ ## EXT(src,          stride, pq); \
    ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}

#if HAVE_X86ASM
LOOP_FILTER(mmxext)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)

void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);

static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
    ff_vc1_h_loop_filter8_sse4(src,          stride, pq);
    ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}

#define DECLARE_FUNCTION(OP, DEPTH, INSN) \
static void OP##vc1_mspel_mc00_##DEPTH##INSN(uint8_t *dst, \
        const uint8_t *src, ptrdiff_t stride, int rnd) \
{ \
    ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH); \
}

DECLARE_FUNCTION(put_,  8, _mmx)
DECLARE_FUNCTION(put_, 16, _mmx)
DECLARE_FUNCTION(avg_,  8, _mmx)
DECLARE_FUNCTION(avg_, 16, _mmx)
DECLARE_FUNCTION(avg_,  8, _mmxext)
DECLARE_FUNCTION(avg_, 16, _mmxext)
DECLARE_FUNCTION(put_, 16, _sse2)
DECLARE_FUNCTION(avg_, 16, _sse2)

#endif /* HAVE_X86ASM */

void ff_put_vc1_chroma_mc8_nornd_mmx   (uint8_t *dst, uint8_t *src,
                                        ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
                                        ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                    int16_t *block);
void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                    int16_t *block);
void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                    int16_t *block);
void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                    int16_t *block);

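/*
 * Runtime dispatch: each block below overwrites the pointers set by the
 * previous, less capable one, so the best variant the CPU flags allow is
 * the one left installed. INLINE_* guards paths built from inline asm,
 * while EXTERNAL_* guards standalone-assembled (x86asm) symbols.
 */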
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (HAVE_6REGS && INLINE_MMX(cpu_flags))
        if (EXTERNAL_MMX(cpu_flags))
            ff_vc1dsp_init_mmx(dsp);

    if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags))
        if (EXTERNAL_MMXEXT(cpu_flags))
            ff_vc1dsp_init_mmxext(dsp);

#define ASSIGN_LF(EXT) \
        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_ ## EXT; \
        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_ ## EXT; \
        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_ ## EXT; \
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_ ## EXT; \
        dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT

#if HAVE_X86ASM
    if (EXTERNAL_MMX(cpu_flags)) {
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;

        dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_mmx;
        dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_mmx;
        dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmx;
        dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmx;
    }
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
    }
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        ASSIGN_LF(mmxext);
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;

        dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext;
        dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext;

        dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext;
        dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext;
        dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext;
        dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_sse2;
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse2;
        dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;

        dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_sse2;
        dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        ASSIGN_LF(ssse3);
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse4;
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
    }
#endif /* HAVE_X86ASM */
}
486
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vc1dsp_mmx.c
vendored
Normal file
@@ -0,0 +1,486 @@
/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "fpel.h"
#include "vc1dsp.h"

#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL

void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                   const uint8_t *src, x86_reg stride,
                                   int rnd, int64_t shift);
void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
                                   const int16_t *src, int rnd);
void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
                                      const int16_t *src, int rnd);

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

/** Add rounder from mm7 to mm3 and pack result at destination */
#define NORMALIZE_MMX(SHIFT) \
    "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
    "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
    "psraw "SHIFT", %%mm3 \n\t" \
    "psraw "SHIFT", %%mm4 \n\t"

#define TRANSFER_DO_PACK(OP) \
    "packuswb %%mm4, %%mm3 \n\t" \
    OP((%2), %%mm3) \
    "movq %%mm3, (%2) \n\t"

#define TRANSFER_DONT_PACK(OP) \
    OP(0(%2), %%mm3) \
    OP(8(%2), %%mm4) \
    "movq %%mm3, 0(%2) \n\t" \
    "movq %%mm4, 8(%2) \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg)  "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Compute the rounder 32-r or 8-r and unpacks it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
    "movd "ROUND", %%mm7 \n\t" \
    "punpcklwd %%mm7, %%mm7 \n\t" \
    "punpckldq %%mm7, %%mm7 \n\t"

/**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice mm6 for *9 factor.
 */
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov $8, %%"FF_REG_c" \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1: \n\t"\
        "movd 0(%0), %%mm3 \n\t"\
        "movd 4(%0), %%mm4 \n\t"\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm0, %%mm4 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "paddw %%mm1, %%mm3 \n\t"\
        "paddw %%mm2, %%mm4 \n\t"\
        "movd 0(%0,%3), %%mm1 \n\t"\
        "movd 4(%0,%3), %%mm2 \n\t"\
        "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
        "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb %%mm4, %%mm3 \n\t"\
        OP((%1), %%mm3)\
        "movq %%mm3, (%1) \n\t"\
        "add %6, %0 \n\t"\
        "add %4, %1 \n\t"\
        "dec %%"FF_REG_c" \n\t"\
        "jnz 1b \n\t"\
        : "+r"(src), "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
        : "%"FF_REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
 * @param MOVQ    "movd 1" or "movq 2", if data read is already unpacked.
 * @param A1      Address of 1st tap (beware of unpacked/packed).
 * @param A2      Address of 2nd tap
 * @param A3      Address of 3rd tap
 * @param A4      Address of 4th tap
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
    MOVQ "*0+"A1", %%mm1 \n\t" \
    MOVQ "*4+"A1", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
    MOVQ "*0+"A2", %%mm3 \n\t" \
    MOVQ "*4+"A2", %%mm4 \n\t" \
    UNPACK("%%mm3") \
    UNPACK("%%mm4") \
    "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
    "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
    "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
    MOVQ "*0+"A4", %%mm1 \n\t" \
    MOVQ "*4+"A4", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "psllw $2, %%mm1 \n\t" /* 4* */ \
    "psllw $2, %%mm2 \n\t" /* 4* */ \
    "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
    MOVQ "*0+"A3", %%mm1 \n\t" \
    MOVQ "*4+"A3", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
    "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
    "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
    "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */

/**
 * Macro to build the vertical 16 bits version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).
 *
 * @param  NAME   Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride, \
                                 int rnd, int64_t shift) \
{ \
    int h = 8; \
    src -= src_stride; \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%5") \
        "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6") \
        TRANSFER_DONT_PACK(OP_PUT) \
        /* Last 3 (in fact 4) bytes on the line */ \
        "movd 8+"A1", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "movq %%mm1, %%mm3 \n\t" \
        "paddw %%mm1, %%mm1 \n\t" \
        "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
        "movd 8+"A2", %%mm3 \n\t" \
        DO_UNPACK("%%mm3") \
        "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
        "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
        "movd 8+"A3", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
        "paddw %%mm1, %%mm3 \n\t" /* 53,18,-3 */ \
        "movd 8+"A4", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "psllw $2, %%mm1 \n\t" /* 4* */ \
        "psubw %%mm1, %%mm3 \n\t" \
        "paddw %%mm7, %%mm3 \n\t" \
        "psraw %6, %%mm3 \n\t" \
        "movq %%mm3, 16(%2) \n\t" \
        "add %3, %1 \n\t" \
        "add $24, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(src_stride), "r"(3*src_stride), \
          "m"(rnd), "m"(shift) \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
        : "memory" \
    ); \
}

/**
 * Macro to build the horizontal 16 bits version of vc1_put_shift[13].
 * Here, offset=16 bits, so parameters passed A1 to A4 should be simple.
 *
 * @param  NAME   Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{ \
    int h = 8; \
    src -= 1; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%4") \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7") \
        /* Remove bias */ \
        "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
        "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
        TRANSFER_DO_PACK(OP) \
        "add $24, %1 \n\t" \
        "add %3, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(stride), "m"(rnd) \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
        : "memory" \
    ); \
}

/**
 * Macro to build the 8 bits, any direction, version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (offset) and %4 (3*offset).
 *
 * @param  NAME   Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_ ## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                               x86_reg stride, int rnd, x86_reg offset) \
{ \
    int h = 8; \
    src -= offset; \
    rnd = 32-rnd; \
    __asm__ volatile ( \
        LOAD_ROUNDER_MMX("%6") \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6") \
        TRANSFER_DO_PACK(OP) \
        "add %5, %1 \n\t" \
        "add %5, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
        : "memory" \
    ); \
}

/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4)", "0(%1,%3,2)", "0(%1,%3)", "0(%1)", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4)", "0(%1,%3,2)", "0(%1,%3)", "0(%1)", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4)", "0(%1,%3,2)", "0(%1,%3)", "0(%1)")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift3, "0(%1)", "0(%1,%3)", "0(%1,%3,2)", "0(%1,%4)", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1)", "0(%1,%3)", "0(%1,%3,2)", "0(%1,%4)", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1)", "0(%1,%3)", "0(%1,%3,2)", "0(%1,%4)")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

/**
 * Interpolate fractional pel values by applying proper vertical then
 * horizontal filter.
 *
 * @param  dst     Destination buffer for interpolated pels.
 * @param  src     Source buffer.
 * @param  stride  Stride for both src and dst buffers.
 * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
 * @param  vmode   Vertical filter.
 * @param  rnd     Rounding bias.
 */
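/*
 * How the dispatch below works: hmode/vmode each index a 1/4-, 1/2- or
 * 3/4-pel filter (1..3; 0 means no filtering in that direction). With both
 * set, a vertical pass writes 16-bit intermediates to a temporary buffer
 * and a horizontal pass then filters and packs them back to 8 bits;
 * otherwise a single 8-bit pass is applied directly.
 */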
#define VC1_MSPEL_MC(OP, INSTR)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
        { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
        { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
        { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0 \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int r;\
            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
} \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                  int stride, int hmode, int vmode, int rnd)\
{ \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
    dst += 8*stride; src += 8*stride; \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}

VC1_MSPEL_MC(put_, mmx)
VC1_MSPEL_MC(avg_, mmxext)

/** Macro to ease bicubic filter interpolation functions declarations */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
                                               const uint8_t *src, \
                                               ptrdiff_t stride, \
                                               int rnd) \
{ \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
                                                     const uint8_t *src,\
                                                     ptrdiff_t stride, \
                                                     int rnd) \
{ \
    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

#define FN_ASSIGN(OP, X, Y, INSN) \
    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN

av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
    FN_ASSIGN(put_, 0, 1, _mmx);
    FN_ASSIGN(put_, 0, 2, _mmx);
    FN_ASSIGN(put_, 0, 3, _mmx);

    FN_ASSIGN(put_, 1, 0, _mmx);
    FN_ASSIGN(put_, 1, 1, _mmx);
    FN_ASSIGN(put_, 1, 2, _mmx);
    FN_ASSIGN(put_, 1, 3, _mmx);

    FN_ASSIGN(put_, 2, 0, _mmx);
    FN_ASSIGN(put_, 2, 1, _mmx);
    FN_ASSIGN(put_, 2, 2, _mmx);
    FN_ASSIGN(put_, 2, 3, _mmx);

    FN_ASSIGN(put_, 3, 0, _mmx);
    FN_ASSIGN(put_, 3, 1, _mmx);
    FN_ASSIGN(put_, 3, 2, _mmx);
    FN_ASSIGN(put_, 3, 3, _mmx);
}

av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    FN_ASSIGN(avg_, 0, 1, _mmxext);
    FN_ASSIGN(avg_, 0, 2, _mmxext);
    FN_ASSIGN(avg_, 0, 3, _mmxext);

    FN_ASSIGN(avg_, 1, 0, _mmxext);
    FN_ASSIGN(avg_, 1, 1, _mmxext);
    FN_ASSIGN(avg_, 1, 2, _mmxext);
    FN_ASSIGN(avg_, 1, 3, _mmxext);

    FN_ASSIGN(avg_, 2, 0, _mmxext);
    FN_ASSIGN(avg_, 2, 1, _mmxext);
    FN_ASSIGN(avg_, 2, 2, _mmxext);
    FN_ASSIGN(avg_, 2, 3, _mmxext);

    FN_ASSIGN(avg_, 3, 0, _mmxext);
    FN_ASSIGN(avg_, 3, 1, _mmxext);
    FN_ASSIGN(avg_, 3, 2, _mmxext);
    FN_ASSIGN(avg_, 3, 3, _mmxext);
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
309
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/videodsp_init.c
vendored
Normal file
@@ -0,0 +1,309 @@
/*
 * Copyright (C) 2002-2012 Michael Niedermayer
 * Copyright (C) 2012 Ronald S. Bultje
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/videodsp.h"

#if HAVE_X86ASM
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
                                const uint8_t *src, x86_reg src_stride,
                                x86_reg start_y, x86_reg end_y, x86_reg bh);
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
                                const uint8_t *src, x86_reg src_stride,
                                x86_reg start_y, x86_reg end_y, x86_reg bh,
                                x86_reg w);

extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
    &ff_emu_edge_vfix1_mmx,  &ff_emu_edge_vfix2_mmx,  &ff_emu_edge_vfix3_mmx,
    &ff_emu_edge_vfix4_mmx,  &ff_emu_edge_vfix5_mmx,  &ff_emu_edge_vfix6_mmx,
    &ff_emu_edge_vfix7_mmx,  &ff_emu_edge_vfix8_mmx,  &ff_emu_edge_vfix9_mmx,
    &ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
    &ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
    &ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
    &ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
    &ff_emu_edge_vfix22_mmx
};
#endif
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
static emu_edge_vfix_func * const vfixtbl_sse[22] = {
    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
    ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
    ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
    ff_emu_edge_vfix22_sse
};
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;

typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
                                x86_reg start_x, x86_reg bh);
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
                                x86_reg start_x, x86_reg n_words, x86_reg bh);

extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
    ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
};
#endif
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
    ff_emu_edge_hfix2_mmx,   ff_emu_edge_hfix4_mmx,   ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_mmx,   ff_emu_edge_hfix10_mmx,  ff_emu_edge_hfix12_mmx,
    ff_emu_edge_hfix14_mmx,  ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
    ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
#if HAVE_AVX2_EXTERNAL
extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
    ff_emu_edge_hfix2_mmx,   ff_emu_edge_hfix4_mmx,   ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
    ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
    ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
#endif

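/*
 * Core of the x86 emulated_edge_mc: copy the part of the block that lies
 * inside the frame, then replicate border pixels into the region a motion
 * vector reaches outside it. Widths up to 22 bytes go through the
 * width-specialized asm in vfix_tbl/hfix_tbl; anything wider falls back
 * to the variable-width *_var routines.
 */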
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                              ptrdiff_t dst_stride,
                                              ptrdiff_t src_stride,
                                              x86_reg block_w, x86_reg block_h,
                                              x86_reg src_x, x86_reg src_y,
                                              x86_reg w, x86_reg h,
                                              emu_edge_vfix_func * const *vfix_tbl,
                                              emu_edge_vvar_func *v_extend_var,
                                              emu_edge_hfix_func * const *hfix_tbl,
                                              emu_edge_hvar_func *h_extend_var)
{
    x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;

    if (!w || !h)
        return;

    av_assert2(block_w <= FFABS(dst_stride));

    if (src_y >= h) {
        src -= src_y*src_stride;
        src_y_add = h - 1;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src -= src_y*src_stride;
        src_y_add = 1 - block_h;
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h-src_y);
    end_x   = FFMIN(block_w, w-src_x);
    av_assert2(start_x < end_x && block_w > 0);
    av_assert2(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * src_stride + start_x;
    w = end_x - start_x;
    if (w <= 22) {
        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                        start_y, end_y, block_h);
    } else {
        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                     start_y, end_y, block_h, w);
    }

    // fill left
    if (start_x) {
        if (start_x <= 22) {
            hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
        } else {
            h_extend_var(dst, dst_stride,
                         start_x, (start_x + 1) >> 1, block_h);
        }
    }

    // fill right
    p = block_w - end_x;
    if (p) {
        if (p <= 22) {
            hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
                                   -!(p & 1), block_h);
        } else {
            h_extend_var(dst + end_x - (p & 1), dst_stride,
                         -!(p & 1), (p + 1) >> 1, block_h);
        }
    }
}

#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             ptrdiff_t buf_stride,
                                             ptrdiff_t src_stride,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
                     hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}

static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             ptrdiff_t buf_stride,
                                             ptrdiff_t src_stride,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                     hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
#endif

static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
                                              int block_w, int block_h,
                                              int src_x, int src_y, int w,
                                              int h)
{
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                     hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
}

#if HAVE_AVX2_EXTERNAL
static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
                                              int block_w, int block_h,
                                              int src_x, int src_y, int w,
                                              int h)
{
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                     hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
}
#endif /* HAVE_AVX2_EXTERNAL */
#endif /* HAVE_X86ASM */

void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);

av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_mmx;
    }
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        ctx->prefetch = ff_prefetch_3dnow;
    }
#endif /* ARCH_X86_32 */
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        ctx->prefetch = ff_prefetch_mmxext;
    }
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_sse;
    }
#endif /* ARCH_X86_32 */
    if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_sse2;
    }
#if HAVE_AVX2_EXTERNAL
    if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_avx2;
    }
#endif
#endif /* HAVE_X86ASM */
}
42
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vorbisdsp_init.c
vendored
Normal file
@@ -0,0 +1,42 @@
/*
 * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vorbisdsp.h"

void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
                                      intptr_t blocksize);
void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
                                    intptr_t blocksize);

av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_AMD3DNOW(cpu_flags))
        dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
#endif /* ARCH_X86_32 */
    if (EXTERNAL_SSE(cpu_flags))
        dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
}
71
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vp3dsp_init.c
vendored
Normal file
@@ -0,0 +1,71 @@
/*
 * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"

void ff_vp3_idct_put_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);

void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);

void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);

void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
                                 int *bounding_values);
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
                                 int *bounding_values);

void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
                                     const uint8_t *b, ptrdiff_t stride,
                                     int h);

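/*
 * Note on the guards below: the MMX IDCT is only installed on 32-bit
 * builds (on x86-64 SSE2 is always available and its IDCT supersedes the
 * MMX one), and the MMXEXT loop filters are skipped in bit-exact mode
 * because they do not match the C reference bit for bit.
 */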
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
#if ARCH_X86_32
        c->idct_put = ff_vp3_idct_put_mmx;
        c->idct_add = ff_vp3_idct_add_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;

        if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
            c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
            c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->idct_put = ff_vp3_idct_put_sse2;
        c->idct_add = ff_vp3_idct_add_sse2;
    }
}
51
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vp56_arith.h
vendored
Normal file
@@ -0,0 +1,51 @@
/**
 * VP5 and VP6 compatible video decoder (arith decoder)
 *
 * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
 * Copyright (C) 2010 Eli Friedman
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_VP56_ARITH_H
#define AVCODEC_X86_VP56_ARITH_H

#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
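/*
 * Branchless bit decode: SUBL computes the bit==1 state (range and code
 * word reduced by the split), SETAE turns the borrow flag into the decoded
 * bit, and the CMOVBs switch back to the bit==0 state (range = split, code
 * word unchanged) when the code word was below the split. No conditional
 * branch per bit, hence the HAVE_FAST_CMOV requirement.
 */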
#define vp56_rac_get_prob vp56_rac_get_prob
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
    unsigned int code_word = vp56_rac_renorm(c);
    unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
    unsigned int low_shift = low << 16;
    int bit = 0;
    c->code_word = code_word;

    __asm__(
        "subl  %4, %1      \n\t"
        "subl  %3, %2      \n\t"
        "setae %b0         \n\t"
        "cmovb %4, %1      \n\t"
        "cmovb %5, %2      \n\t"
        : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
        : "r"(low_shift), "r"(low), "r"(code_word)
    );

    return bit;
}
#endif

#endif /* AVCODEC_X86_VP56_ARITH_H */
45
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vp6dsp_init.c
vendored
Normal file
@@ -0,0 +1,45 @@
/*
 * VP6 MMX/SSE2 optimizations
 * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
 * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp56dsp.h"

void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                             const int16_t *h_weights, const int16_t *v_weights);
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                              const int16_t *h_weights, const int16_t *v_weights);

av_cold void ff_vp6dsp_init_x86(VP56DSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_MMX(cpu_flags)) {
        c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
    }
#endif
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
    }
}
467
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vp8dsp_init.c
vendored
Normal file
@@ -0,0 +1,467 @@
/*
 * VP8 DSP functions x86-optimized
 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp8dsp.h"

#if HAVE_X86ASM

/*
 * MC functions
 */
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);

void ff_put_vp8_epel8_h4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_h6_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_v4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_v6_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);

void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);

void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);

void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);


void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
                             uint8_t *src, ptrdiff_t srcstride,
                             int height, int mx, int my);
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
                             uint8_t *src, ptrdiff_t srcstride,
                             int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
                             uint8_t *src, ptrdiff_t srcstride,
                             int height, int mx, int my);

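/* TAP_W16/TAP_W8 synthesize a wider block from two calls to the
 * half-width assembly function, offset by half the width. */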
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst, dststride, src, srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst, dststride, src, srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}

#if ARCH_X86_32
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
#endif

TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v)

TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)

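/* HVTAP builds the 2D (horizontal, then vertical) filter: the h pass
 * writes height + TAPNUMY - 1 rows into a temp buffer, which the
 * v pass then reads back. */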
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
    uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
    src -= srcstride * (TAPNUMY / 2 - 1); \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmpptr, SIZE, height, mx, my); \
}

#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16)

HVTAP(mmxext, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8)
#endif

HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)

#define HVTAPSSE2(x, y, w) \
HVTAP(sse2, 16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)

HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)

HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)

#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
        tmp, SIZE, src, srcstride, height + 1, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
        dst, dststride, tmp, SIZE, height, mx, my); \
}

HVBILIN(mmxext, 8, 4, 8)
#if ARCH_X86_32
HVBILIN(mmxext, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16)
#endif
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16)

void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
                            ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
                             ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
                             ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
                              ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
                               ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
                               ptrdiff_t stride);
void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);

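/* Loop-filter prototypes; e, i and hvt are the VP8 edge limit,
 * interior limit and high-edge-variance threshold. */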
#define DECLARE_LOOP_FILTER(NAME) \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
                                          ptrdiff_t stride, \
                                          int flim); \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
                                          ptrdiff_t stride, \
                                          int flim); \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
                                             ptrdiff_t stride, \
                                             int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
                                             ptrdiff_t stride, \
                                             int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
                                             uint8_t *dstV, \
                                             ptrdiff_t s, \
                                             int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
                                             uint8_t *dstV, \
                                             ptrdiff_t s, \
                                             int e, int i, int hvt); \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
                                             ptrdiff_t stride, \
                                             int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
                                             ptrdiff_t stride, \
                                             int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
                                             uint8_t *dstV, \
                                             ptrdiff_t s, \
                                             int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
                                             uint8_t *dstV, \
                                             ptrdiff_t s, \
                                             int e, int i, int hvt);

DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)

#endif /* HAVE_X86ASM */

#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT

#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
    VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)

#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT


av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
#if ARCH_X86_32
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
        c->put_vp8_epel_pixels_tab[1][0][0]     =
        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
    }

    /* note that 4-tap width=16 functions are missing because w=16
     * is only used for luma, and luma is always a copy or sixtap. */
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        VP8_MC_FUNC(2, 4, mmxext);
        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
#if ARCH_X86_32
        VP8_LUMA_MC_FUNC(0, 16, mmxext);
        VP8_MC_FUNC(1, 8, mmxext);
        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
        VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
#endif
    }

    if (EXTERNAL_SSE(cpu_flags)) {
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
    }

    if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
        VP8_LUMA_MC_FUNC(0, 16, sse2);
        VP8_MC_FUNC(1, 8, sse2);
        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
        VP8_BILINEAR_MC_FUNC(1, 8, sse2);
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        VP8_LUMA_MC_FUNC(0, 16, ssse3);
        VP8_MC_FUNC(1, 8, ssse3);
        VP8_MC_FUNC(2, 4, ssse3);
        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
    }
#endif /* HAVE_X86ASM */
}

av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
        c->vp8_idct_dc_add   = ff_vp8_idct_dc_add_mmx;
        c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
        c->vp8_idct_add      = ff_vp8_idct_add_mmx;
        c->vp8_luma_dc_wht   = ff_vp8_luma_dc_wht_mmx;

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
#endif
    }

    /* note that 4-tap width=16 functions are missing because w=16
     * is only used for luma, and luma is always a copy or sixtap. */
    if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
#endif
    }

    if (EXTERNAL_SSE(cpu_flags)) {
        c->vp8_idct_add    = ff_vp8_idct_add_sse;
        c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
    }

    if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->vp8_idct_dc_add   = ff_vp8_idct_dc_add_sse2;
        c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;

        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;

        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
    }

    if (EXTERNAL_SSE4(cpu_flags)) {
        c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
    }
#endif /* HAVE_X86ASM */
}
416
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vp9dsp_init.c
vendored
Normal file
@ -0,0 +1,416 @@
/*
 * VP9 SIMD optimizations
 *
 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp9dsp.h"
#include "libavcodec/x86/vp9dsp_init.h"

#if HAVE_X86ASM

decl_fpel_func(put, 4, , mmx);
decl_fpel_func(put, 8, , mmx);
decl_fpel_func(put, 16, , sse);
decl_fpel_func(put, 32, , sse);
decl_fpel_func(put, 64, , sse);
decl_fpel_func(avg, 4, _8, mmxext);
decl_fpel_func(avg, 8, _8, mmxext);
decl_fpel_func(avg, 16, _8, sse2);
decl_fpel_func(avg, 32, _8, sse2);
decl_fpel_func(avg, 64, _8, sse2);
decl_fpel_func(put, 32, , avx);
decl_fpel_func(put, 64, , avx);
decl_fpel_func(avg, 32, _8, avx2);
decl_fpel_func(avg, 64, _8, avx2);

decl_mc_funcs(4, mmxext, int16_t, 8, 8);
decl_mc_funcs(8, sse2, int16_t, 8, 8);
decl_mc_funcs(4, ssse3, int8_t, 32, 8);
decl_mc_funcs(8, ssse3, int8_t, 32, 8);
#if ARCH_X86_64
decl_mc_funcs(16, ssse3, int8_t, 32, 8);
decl_mc_funcs(32, avx2, int8_t, 32, 8);
#endif

mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8)
#if ARCH_X86_32
mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8)
#endif
mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8)
mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8)
mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8)
mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8)
#endif

extern const int8_t ff_filters_ssse3[3][15][4][32];
extern const int16_t ff_filters_sse2[3][15][8][8];

filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
#endif

filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
#endif

#define itxfm_func(typea, typeb, size, opt) \
void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                            int16_t *block, int eob)
#define itxfm_funcs(size, opt) \
itxfm_func(idct,  idct,  size, opt); \
itxfm_func(iadst, idct,  size, opt); \
itxfm_func(idct,  iadst, size, opt); \
itxfm_func(iadst, iadst, size, opt)

itxfm_func(idct,  idct,  4, mmxext);
itxfm_func(idct,  iadst, 4, sse2);
itxfm_func(iadst, idct,  4, sse2);
itxfm_func(iadst, iadst, 4, sse2);
itxfm_funcs(4, ssse3);
itxfm_funcs(8, sse2);
itxfm_funcs(8, ssse3);
itxfm_funcs(8, avx);
itxfm_funcs(16, sse2);
itxfm_funcs(16, ssse3);
itxfm_funcs(16, avx);
itxfm_func(idct, idct, 32, sse2);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
itxfm_funcs(16, avx2);
itxfm_func(idct, idct, 32, avx2);

#undef itxfm_func
#undef itxfm_funcs

#define lpf_funcs(size1, size2, opt) \
void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                    int E, int I, int H); \
void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                    int E, int I, int H)

lpf_funcs(4, 8, mmxext);
lpf_funcs(8, 8, mmxext);
lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx);
lpf_funcs(44, 16, sse2);
lpf_funcs(44, 16, ssse3);
lpf_funcs(44, 16, avx);
lpf_funcs(84, 16, sse2);
lpf_funcs(84, 16, ssse3);
lpf_funcs(84, 16, avx);
lpf_funcs(48, 16, sse2);
lpf_funcs(48, 16, ssse3);
lpf_funcs(48, 16, avx);
lpf_funcs(88, 16, sse2);
lpf_funcs(88, 16, ssse3);
lpf_funcs(88, 16, avx);

#undef lpf_funcs

#define ipred_func(size, type, opt) \
void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                   const uint8_t *l, const uint8_t *a)

ipred_func(8, v, mmx);

#define ipred_dc_funcs(size, opt) \
ipred_func(size, dc, opt); \
ipred_func(size, dc_left, opt); \
ipred_func(size, dc_top, opt)

ipred_dc_funcs(4, mmxext);
ipred_dc_funcs(8, mmxext);

#define ipred_dir_tm_funcs(size, opt) \
ipred_func(size, tm, opt); \
ipred_func(size, dl, opt); \
ipred_func(size, dr, opt); \
ipred_func(size, hd, opt); \
ipred_func(size, hu, opt); \
ipred_func(size, vl, opt); \
ipred_func(size, vr, opt)

ipred_dir_tm_funcs(4, mmxext);

ipred_func(16, v, sse);
ipred_func(32, v, sse);

ipred_dc_funcs(16, sse2);
ipred_dc_funcs(32, sse2);

#define ipred_dir_tm_h_funcs(size, opt) \
ipred_dir_tm_funcs(size, opt); \
ipred_func(size, h, opt)

ipred_dir_tm_h_funcs(8, sse2);
ipred_dir_tm_h_funcs(16, sse2);
ipred_dir_tm_h_funcs(32, sse2);

ipred_func(4, h, sse2);

#define ipred_all_funcs(size, opt) \
ipred_dc_funcs(size, opt); \
ipred_dir_tm_h_funcs(size, opt)

// FIXME hd/vl_4x4_ssse3 does not exist
ipred_all_funcs(4, ssse3);
ipred_all_funcs(8, ssse3);
ipred_all_funcs(16, ssse3);
ipred_all_funcs(32, ssse3);

ipred_dir_tm_h_funcs(8, avx);
ipred_dir_tm_h_funcs(16, avx);
ipred_dir_tm_h_funcs(32, avx);

ipred_func(32, v, avx);

ipred_dc_funcs(32, avx2);
ipred_func(32, h, avx2);
ipred_func(32, tm, avx2);

#undef ipred_func
#undef ipred_dir_tm_h_funcs
#undef ipred_dir_tm_funcs
#undef ipred_dc_funcs

#endif /* HAVE_X86ASM */

av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
{
#if HAVE_X86ASM
    int cpu_flags;

    if (bpp == 10) {
        ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
        return;
    } else if (bpp == 12) {
        ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
        return;
    }

    cpu_flags = av_get_cpu_flags();

#define init_lpf(opt) do { \
    dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
    dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
    dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
    dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
    dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
    dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
    dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
    dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
    dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
    dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
} while (0)

#define init_ipred(sz, opt, t, e) \
    dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt

#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
#define init_dir_tm_ipred(sz, opt) do { \
    init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
    init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
    init_ipred(sz, opt, hd, HOR_DOWN); \
    init_ipred(sz, opt, vl, VERT_LEFT); \
    init_ipred(sz, opt, hu, HOR_UP); \
    init_ipred(sz, opt, tm, TM_VP8); \
    init_ipred(sz, opt, vr, VERT_RIGHT); \
} while (0)
#define init_dir_tm_h_ipred(sz, opt) do { \
    init_dir_tm_ipred(sz, opt); \
    init_ipred(sz, opt, h, HOR); \
} while (0)
#define init_dc_ipred(sz, opt) do { \
    init_ipred(sz, opt, dc, DC); \
    init_ipred(sz, opt, dc_left, LEFT_DC); \
    init_ipred(sz, opt, dc_top, TOP_DC); \
} while (0)
#define init_all_ipred(sz, opt) do { \
    init_dc_ipred(sz, opt); \
    init_dir_tm_h_ipred(sz, opt); \
} while (0)

    if (EXTERNAL_MMX(cpu_flags)) {
        init_fpel_func(4, 0, 4, put, , mmx);
        init_fpel_func(3, 0, 8, put, , mmx);
        if (!bitexact) {
            dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
            dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
            dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
            dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
        }
        init_ipred(8, mmx, v, VERT);
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext;
        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext;
        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext;
        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
        init_subpel2(4, 0, 4, put, 8, mmxext);
        init_subpel2(4, 1, 4, avg, 8, mmxext);
        init_fpel_func(4, 1, 4, avg, _8, mmxext);
        init_fpel_func(3, 1, 8, avg, _8, mmxext);
        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
        init_dc_ipred(4, mmxext);
        init_dc_ipred(8, mmxext);
        init_dir_tm_ipred(4, mmxext);
    }

    if (EXTERNAL_SSE(cpu_flags)) {
        init_fpel_func(2, 0, 16, put, , sse);
        init_fpel_func(1, 0, 32, put, , sse);
        init_fpel_func(0, 0, 64, put, , sse);
        init_ipred(16, sse, v, VERT);
        init_ipred(32, sse, v, VERT);
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        init_subpel3_8to64(0, put, 8, sse2);
        init_subpel3_8to64(1, avg, 8, sse2);
        init_fpel_func(2, 1, 16, avg, _8, sse2);
        init_fpel_func(1, 1, 32, avg, _8, sse2);
        init_fpel_func(0, 1, 64, avg, _8, sse2);
        init_lpf(sse2);
        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_sse2;
        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_sse2;
        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
        dsp->itxfm_add[TX_8X8][DCT_DCT]   = ff_vp9_idct_idct_8x8_add_sse2;
        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_sse2;
        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_sse2;
        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_sse2;
        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_sse2;
        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_sse2;
        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
        dsp->itxfm_add[TX_32X32][ADST_ADST] =
        dsp->itxfm_add[TX_32X32][ADST_DCT] =
        dsp->itxfm_add[TX_32X32][DCT_ADST] =
        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
        init_dc_ipred(16, sse2);
        init_dc_ipred(32, sse2);
        init_dir_tm_h_ipred(8, sse2);
        init_dir_tm_h_ipred(16, sse2);
        init_dir_tm_h_ipred(32, sse2);
        init_ipred(4, sse2, h, HOR);
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        init_subpel3(0, put, 8, ssse3);
        init_subpel3(1, avg, 8, ssse3);
        dsp->itxfm_add[TX_4X4][DCT_DCT]   = ff_vp9_idct_idct_4x4_add_ssse3;
        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_ssse3;
        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_ssse3;
        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
        dsp->itxfm_add[TX_8X8][DCT_DCT]   = ff_vp9_idct_idct_8x8_add_ssse3;
        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_ssse3;
        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_ssse3;
        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_ssse3;
        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_ssse3;
        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_ssse3;
        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
        dsp->itxfm_add[TX_32X32][ADST_ADST] =
        dsp->itxfm_add[TX_32X32][ADST_DCT] =
        dsp->itxfm_add[TX_32X32][DCT_ADST] =
        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
        init_lpf(ssse3);
        init_all_ipred(4, ssse3);
        init_all_ipred(8, ssse3);
        init_all_ipred(16, ssse3);
        init_all_ipred(32, ssse3);
    }

    if (EXTERNAL_AVX(cpu_flags)) {
        dsp->itxfm_add[TX_8X8][DCT_DCT]   = ff_vp9_idct_idct_8x8_add_avx;
        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_avx;
        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_avx;
        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_avx;
        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx;
        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx;
        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
        dsp->itxfm_add[TX_32X32][ADST_ADST] =
        dsp->itxfm_add[TX_32X32][ADST_DCT] =
        dsp->itxfm_add[TX_32X32][DCT_ADST] =
        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
        init_lpf(avx);
        init_dir_tm_h_ipred(8, avx);
        init_dir_tm_h_ipred(16, avx);
        init_dir_tm_h_ipred(32, avx);
    }
    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        init_fpel_func(1, 0, 32, put, , avx);
        init_fpel_func(0, 0, 64, put, , avx);
        init_ipred(32, avx, v, VERT);
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        init_fpel_func(1, 1, 32, avg, _8, avx2);
        init_fpel_func(0, 1, 64, avg, _8, avx2);
        if (ARCH_X86_64) {
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
            dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_avx2;
            dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx2;
            dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx2;
            dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
            dsp->itxfm_add[TX_32X32][ADST_ADST] =
            dsp->itxfm_add[TX_32X32][ADST_DCT] =
            dsp->itxfm_add[TX_32X32][DCT_ADST] =
            dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
            init_subpel3_32_64(0, put, 8, avx2);
            init_subpel3_32_64(1, avg, 8, avx2);
#endif
        }
        init_dc_ipred(32, avx2);
        init_ipred(32, avx2, h, HOR);
        init_ipred(32, avx2, tm, TM_VP8);
    }

#undef init_fpel
#undef init_subpel1
#undef init_subpel2
#undef init_subpel3

#endif /* HAVE_X86ASM */
}
189
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vp9dsp_init.h
vendored
Normal file
@ -0,0 +1,189 @@
/*
 * VP9 SIMD optimizations
 *
 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_VP9DSP_INIT_H
#define AVCODEC_X86_VP9DSP_INIT_H

#include "libavcodec/vp9dsp.h"

// hack to force-expand BPC
#define cat(a, bpp, b) a##bpp##b

#define decl_fpel_func(avg, sz, bpp, opt) \
void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                   const uint8_t *src, ptrdiff_t src_stride, \
                                   int h, int mx, int my)

#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                                         const uint8_t *src, ptrdiff_t src_stride, \
                                                         int h, const type (*filter)[f_sz])

#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, v, opt, type, fsz, bpp)

#define decl_ipred_fn(type, sz, bpp, opt) \
void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
                                                       ptrdiff_t stride, \
                                                       const uint8_t *l, \
                                                       const uint8_t *a)

#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
decl_ipred_fn(type,  4, bpp, opt4); \
decl_ipred_fn(type,  8, bpp, opt8_16_32); \
decl_ipred_fn(type, 16, bpp, opt8_16_32); \
decl_ipred_fn(type, 32, bpp, opt8_16_32)

#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
                                                                         ptrdiff_t stride, \
                                                                         int16_t *block, \
                                                                         int eob)

#define decl_itxfm_funcs(size, bpp, opt) \
decl_itxfm_func(idct,  idct,  size, bpp, opt); \
decl_itxfm_func(iadst, idct,  size, bpp, opt); \
decl_itxfm_func(idct,  iadst, size, bpp, opt); \
decl_itxfm_func(iadst, iadst, size, bpp, opt)

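/* mc_rep_func doubles a 1D MC function's block width by calling the
 * half-size version twice, offset by hszb bytes. */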
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                                    const uint8_t *src, ptrdiff_t src_stride, \
                                                    int h, const type (*filter)[f_sz]) \
{ \
    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \
                                                         src_stride, h, filter); \
    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
                                                         src_stride, h, filter); \
}

#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \
mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \
mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \
mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)

#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                                          const uint8_t *src, ptrdiff_t src_stride, \
                                                          int h, int mx, int my) \
{ \
    ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
                                                       h, ff_filters_##f_opt[f][dvar - 1]); \
}

#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp,   dir, dvar, bpp, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth,  dir, dvar, bpp, opt)

#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)

#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op,  8, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op,  4, bpp, opt4, f_opt)

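/* 2D 8-tap MC: the horizontal pass filters h + 7 rows (starting 3 rows
 * above the destination's source) into a 71-row temp buffer, and the
 * vertical pass then filters out of that buffer. */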
#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                                       const uint8_t *src, ptrdiff_t src_stride, \
                                                       int h, int mx, int my) \
{ \
    LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
    ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
                                              src_stride, h + 7, \
                                              ff_filters_##f_opt[f][mx - 1]); \
    ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
                                                 64 * bytes, h, \
                                                 ff_filters_##f_opt[f][my - 1]); \
}

#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp,   align, bpp, bytes, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth,  align, bpp, bytes, opt)

#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op,  8, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op,  4, align, bpp, bytes, opt4, f_opt)

#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt

#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
        type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
        type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = \
        type##_8tap_sharp_##sz##dir##_##bpp##_##opt

#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
    init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
    init_subpel1(idx1, idx2, 0, 1, sz, v,  type, bpp, opt); \
    init_subpel1(idx1, idx2, 1, 0, sz, h,  type, bpp, opt)

#define init_subpel3_32_64(idx, type, bpp, opt) \
    init_subpel2(0, idx, 64, type, bpp, opt); \
    init_subpel2(1, idx, 32, type, bpp, opt)

#define init_subpel3_8to64(idx, type, bpp, opt) \
    init_subpel3_32_64(idx, type, bpp, opt); \
    init_subpel2(2, idx, 16, type, bpp, opt); \
    init_subpel2(3, idx,  8, type, bpp, opt)

#define init_subpel3(idx, type, bpp, opt) \
    init_subpel3_8to64(idx, type, bpp, opt); \
    init_subpel2(4, idx, 4, type, bpp, opt)

#define init_ipred_func(type, enum, sz, bpp, opt) \
    dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
        cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)

#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
    init_ipred_func(type, enum,  8, bpp, opt); \
    init_ipred_func(type, enum, 16, bpp, opt); \
    init_ipred_func(type, enum, 32, bpp, opt)

#define init_ipred_funcs(type, enum, bpp, opt) \
    init_ipred_func(type, enum, 4, bpp, opt); \
    init_8_16_32_ipred_funcs(type, enum, bpp, opt)

void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact);
void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact);
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);

#endif /* AVCODEC_X86_VP9DSP_INIT_H */
25
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vp9dsp_init_10bpp.c
vendored
Normal file
@ -0,0 +1,25 @@
/*
 * VP9 SIMD optimizations
 *
 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define BPC 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
#include "vp9dsp_init_16bpp_template.c"
25
trunk/3rdparty/ffmpeg-4-fit/libavcodec/x86/vp9dsp_init_12bpp.c
vendored
Normal file
@ -0,0 +1,25 @@
/*
 * VP9 SIMD optimizations
 *
 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define BPC 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
#include "vp9dsp_init_16bpp_template.c"
Some files were not shown because too many files have changed in this diff.