mirror of https://github.com/ossrs/srs.git synced 2025-03-09 15:49:59 +00:00

Rename ffmpeg-4.2-fit to ffmpeg-4-fit

winlin 2021-03-02 17:48:40 +08:00
parent b19074721c
commit 27712fdda7
720 changed files with 14 additions and 14 deletions


@@ -0,0 +1,199 @@
OBJS += x86/constants.o \

# subsystems
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
x86/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
x86/mpegvideoencdsp_init.o
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
x86/sbrdsp_init.o
OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
x86/vp9dsp_init_10bpp.o \
x86/vp9dsp_init_12bpp.o \
x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o
# decoders/encoders
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
# subsystems
X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
x86/ac3dsp_downmix.o
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o
X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
x86/h264_deblock_10bit.o \
x86/h264_idct.o \
x86/h264_idct_10bit.o \
x86/h264_weight.o \
x86/h264_weight_10bit.o
X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o \
x86/fpel.o \
x86/qpel.o
X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
x86/fpel.o \
x86/qpel.o
X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
x86/vc1dsp_mc.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \
x86/simple_idct.o
X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
x86/vp8dsp_loopfilter.o
# decoders/encoders
X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
x86/sbrdsp.o
X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
x86/dirac_dwt.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
ifdef CONFIG_GPL
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
endif
X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
x86/hevc_mc.o \
x86/hevc_sao.o \
x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o
X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
x86/vp9itxfm_16bpp.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
x86/vp9mc.o \
x86/vp9mc_16bpp.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o


@@ -0,0 +1,86 @@
;******************************************************************************
;* SIMD optimized AAC encoder DSP functions
;*
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
float_abs_mask: times 4 dd 0x7fffffff
SECTION .text
;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
mova m2, [float_abs_mask]
shl sizeq, 2
add inq, sizeq
add outq, sizeq
neg sizeq
.loop:
andps m0, m2, [inq+sizeq]
sqrtps m1, m0
mulps m0, m1
sqrtps m0, m0
mova [outq+sizeq], m0
add sizeq, mmsize
jl .loop
RET
;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
; int size, int is_signed, int maxval, const float Q34,
; const float rounding)
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
%if UNIX64 == 0
movss m0, Q34m
movss m1, roundingm
cvtsi2ss m3, dword maxvalm
%else
cvtsi2ss m3, maxvald
%endif
shufps m0, m0, 0
shufps m1, m1, 0
shufps m3, m3, 0
shl is_signedd, 31
movd m4, is_signedd
shufps m4, m4, 0
shl sized, 2
add inq, sizeq
add outq, sizeq
add scaledq, sizeq
neg sizeq
.loop:
mulps m2, m0, [scaledq+sizeq]
addps m2, m1
minps m2, m3
andps m5, m4, [inq+sizeq]
orps m2, m5
cvttps2dq m2, m2
mova [outq+sizeq], m2
add sizeq, mmsize
jl .loop
RET
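The two routines above have simple scalar equivalents; the following is a minimal C sketch of what each SIMD loop computes (the *_ref names and this code are illustrative only and are not part of the diff):

#include <math.h>

/* abs_pow34: out[i] = |in[i]|^(3/4), computed as sqrt(|x| * sqrt(|x|)),
 * mirroring the andps/sqrtps/mulps/sqrtps sequence above. */
static void abs_pow34_ref(float *out, const float *in, int size)
{
    for (int i = 0; i < size; i++) {
        float a = fabsf(in[i]);
        out[i] = sqrtf(a * sqrtf(a));
    }
}

/* aac_quantize_bands: scale, round, clamp to maxval, then truncate to int;
 * when is_signed is set, the sign of in[i] is carried over to the result. */
static void quantize_bands_ref(int *out, const float *in, const float *scaled,
                               int size, int is_signed, int maxval,
                               float Q34, float rounding)
{
    for (int i = 0; i < size; i++) {
        float q = scaled[i] * Q34 + rounding;
        if (q > maxval)
            q = maxval;
        if (is_signed && in[i] < 0.0f)
            q = -q;
        out[i] = (int)q;  /* truncation toward zero, like cvttps2dq */
    }
}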


@@ -0,0 +1,43 @@
/*
* AAC encoder assembly optimizations
* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/float_dsp.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/aacenc.h"
void ff_abs_pow34_sse(float *out, const float *in, const int size);
void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
int size, int is_signed, int maxval, const float Q34,
const float rounding);
av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags))
s->abs_pow34 = ff_abs_pow34_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->quant_bands = ff_aac_quantize_bands_sse2;
}


@@ -0,0 +1,487 @@
;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
SECTION .text
;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
shl nd, 3
add srcq, nq
neg nq
align 16
.loop:
movaps m0, [srcq+nq]
movaps m1, [srcq+nq+mmsize]
mulps m0, m0
mulps m1, m1
HADDPS m0, m1, m2
addps m0, [dstq]
movaps [dstq], m0
add dstq, mmsize
add nq, mmsize*2
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3
;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
; float *src1, int n);
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
shl nd, 3
add src1q, nq
add dstq, nq
neg nq
align 16
.loop:
movu m0, [src1q+nq]
movu m1, [src1q+nq+mmsize]
mova m2, [src2q]
mova m3, m2
unpcklps m2, m2
unpckhps m3, m3
mulps m0, m2
mulps m1, m3
mova [dstq+nq], m0
mova [dstq+nq+mmsize], m1
add src2q, mmsize
add nq, mmsize*2
jl .loop
REP_RET
;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [h_stepq]
unpcklps m4, m0, m0
unpckhps m0, m0
unpcklps m5, m1, m1
unpckhps m1, m1
shl nd, 3
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m4, m5
addps m0, m1
movddup m2, [lq+nq]
movddup m3, [rq+nq]
mulps m2, m4
mulps m3, m0
addps m2, m3
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
REP_RET
;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [hq+mmsize]
%if ARCH_X86_64
movaps m8, [h_stepq]
movaps m9, [h_stepq+mmsize]
%define H_STEP0 m8
%define H_STEP1 m9
%else
%define H_STEP0 [h_stepq]
%define H_STEP1 [h_stepq+mmsize]
%endif
shl nd, 3
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m0, H_STEP0
addps m1, H_STEP1
movddup m2, [lq+nq]
movddup m3, [rq+nq]
shufps m4, m2, m2, q2301
shufps m5, m3, m3, q2301
unpcklps m6, m0, m0
unpckhps m7, m0, m0
mulps m2, m6
mulps m3, m7
unpcklps m6, m1, m1
unpckhps m7, m1, m1
mulps m4, m6
mulps m5, m7
addps m2, m3
addsubps m2, m4
addsubps m2, m5
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
REP_RET
;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;**********************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
movsxdifnidn iq, id
mov lend, 32 << 3
lea inq, [inq+iq*4]
mov tmpd, id
shl tmpd, 8
add outq, tmpq
mov tmpd, 64
sub tmpd, id
mov id, tmpd
test id, 1
jne .loop4
test id, 2
jne .loop8
align 16
.loop16:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop16:
movaps m0, [in0q]
movaps m1, [in1q]
movaps m2, [in0q+lenq]
movaps m3, [in1q+lenq]
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [outq], m0
movaps [outq+lenq], m1
movaps [outq+lenq*2], m2
movaps [outq+3*32*2*4], m3
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add inq, 16
add outq, 3*32*2*4
sub id, 4
jg .loop16
RET
align 16
.loop8:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop8:
movlps m0, [in0q]
movlps m1, [in1q]
movhps m0, [in0q+lenq]
movhps m1, [in1q+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
movaps [outq], m0
movaps [outq+lenq], m1
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add inq, 8
add outq, lenq
sub id, 2
jg .loop16
RET
align 16
.loop4:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop4:
movss m0, [in0q]
movss m1, [in1q]
movss m2, [in0q+lenq]
movss m3, [in1q+lenq]
movlhps m0, m1
movlhps m2, m3
shufps m0, m2, q2020
movaps [outq], m0
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add inq, 4
sub id, 1
test id, 2
jne .loop8
cmp id, 4
jge .loop16
RET
;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;***********************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
movsxdifnidn iq, id
mov lend, 32 << 3
lea outq, [outq+iq*4]
mov tmpd, id
shl tmpd, 8
add inq, tmpq
mov tmpd, 64
sub tmpd, id
mov id, tmpd
test id, 1
jne .loop4
test id, 2
jne .loop8
align 16
.loop16:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop16:
movaps m0, [inq]
movaps m1, [inq+lenq]
movaps m2, [inq+lenq*2]
movaps m3, [inq+3*32*2*4]
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [out0q], m0
movaps [out1q], m1
movaps [out0q+lenq], m2
movaps [out1q+lenq], m3
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add outq, 16
add inq, 3*32*2*4
sub id, 4
jg .loop16
RET
align 16
.loop8:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop8:
movaps m0, [inq]
movaps m1, [inq+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
MOVH [out0q], m0
MOVH [out1q], m1
movhps [out0q+lenq], m0
movhps [out1q+lenq], m1
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add outq, 8
add inq, lenq
sub id, 2
jg .loop16
RET
align 16
.loop4:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop4:
movaps m0, [inq]
movss [out0q], m0
%if cpuflag(sse4)
extractps [out1q], m0, 1
extractps [out0q+lenq], m0, 2
extractps [out1q+lenq], m0, 3
%else
movhlps m1, m0
movss [out0q+lenq], m1
shufps m0, m0, 0xb1
movss [out1q], m0
movhlps m1, m0
movss [out1q+lenq], m1
%endif
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add outq, 4
sub id, 1
test id, 2
jne .loop8
cmp id, 4
jge .loop16
RET
%endmacro
INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
; ptrdiff_t stride, int n);
;*******************************************************************
%macro PS_HYBRID_ANALYSIS_LOOP 3
movu %1, [inq+mmsize*%3]
movu m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
pshufd %2, %1, q2301
pshufd m4, m1, q0123
pshufd m1, m1, q1032
pshufd m2, [filterq+nq+mmsize*%3], q2301
addsubps %2, m4
addsubps %1, m1
%else
mova m2, [filterq+nq+mmsize*%3]
mova %2, %1
mova m4, m1
shufps %2, %2, q2301
shufps m4, m4, q0123
shufps m1, m1, q1032
shufps m2, m2, q2301
xorps m4, m7
xorps m1, m7
subps %2, m4
subps %1, m1
%endif
mulps %2, m2
mulps %1, m2
%if %3
addps m3, %2
addps m0, %1
%endif
%endmacro
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
shl strideq, 3
shl nd, 6
add filterq, nq
neg nq
mova m7, [ps_p1m1p1m1]
align 16
.loop:
PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
%if cpuflag(sse3)
pshufd m3, m3, q2301
xorps m0, m7
hsubps m3, m0
pshufd m1, m3, q0020
pshufd m3, m3, q0031
addps m1, m3
movsd m2, [inq+6*8]
%else
mova m1, m3
mova m2, m0
shufps m1, m1, q2301
shufps m2, m2, q2301
subps m1, m3
addps m2, m0
unpcklps m3, m1, m2
unpckhps m1, m2
addps m1, m3
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
movss m3, [filterq+nq+8*6]
SPLATD m3
mulps m2, m3
addps m1, m2
MOVH [outq], m1
add outq, strideq
add nq, 64
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS
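As a reading aid for the assembly above, the first two parametric-stereo kernels boil down to the following scalar loops (a sketch; the *_ref names are made up here and do not appear in this commit):

/* ps_add_squares: accumulate the squared magnitude of each complex sample. */
static void ps_add_squares_ref(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}

/* ps_mul_pair_single: scale each complex pair in src0 by a real gain from src1. */
static void ps_mul_pair_single_ref(float (*dst)[2], float (*src0)[2],
                                   float *src1, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src0[i][0] * src1[i];
        dst[i][1] = src0[i][1] * src1[i];
    }
}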


@@ -0,0 +1,72 @@
/*
* SIMD optimized MPEG-4 Parametric Stereo decoding functions
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/aacpsdsp.h"
void ff_ps_add_squares_sse (float *dst, const float (*src)[2], int n);
void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
float *src1, int n);
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
ptrdiff_t stride, int n);
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
int i, int len);
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
int i, int len);
void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
int i, int len);
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
s->add_squares = ff_ps_add_squares_sse;
s->mul_pair_single = ff_ps_mul_pair_single_sse;
s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
}
if (EXTERNAL_SSE3(cpu_flags)) {
s->add_squares = ff_ps_add_squares_sse3;
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
}
}


@@ -0,0 +1,164 @@
/*
* x86-optimized AC-3 DSP functions
* Copyright (c) 2011 Justin Ruggles
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"
void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!bit_exact) {
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
if (bit_exact) {
c->apply_window_int16 = ff_apply_window_int16_mmxext;
} else {
c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
}
}
if (EXTERNAL_SSE(cpu_flags)) {
c->float_to_fixed24 = ff_float_to_fixed24_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
c->extract_exponents = ff_ac3_extract_exponents_sse2;
if (bit_exact) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
}
}
if (EXTERNAL_SSE2_FAST(cpu_flags)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
if (!bit_exact) {
c->apply_window_int16 = ff_apply_window_int16_round_sse2;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
if (cpu_flags & AV_CPU_FLAG_ATOM) {
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
} else {
c->extract_exponents = ff_ac3_extract_exponents_ssse3;
c->apply_window_int16 = ff_apply_window_int16_ssse3;
}
}
}
#define DOWNMIX_FUNC_OPT(ch, opt) \
void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples, \
float **matrix, int len); \
void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples, \
float **matrix, int len);
#define DOWNMIX_FUNCS(opt) \
DOWNMIX_FUNC_OPT(3, opt) \
DOWNMIX_FUNC_OPT(4, opt) \
DOWNMIX_FUNC_OPT(5, opt) \
DOWNMIX_FUNC_OPT(6, opt)
DOWNMIX_FUNCS(sse)
DOWNMIX_FUNCS(avx)
DOWNMIX_FUNCS(fma3)
void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#define SET_DOWNMIX(ch, suf, SUF) \
if (ch == c->in_channels) { \
if (EXTERNAL_ ## SUF (cpu_flags)) { \
if (c->out_channels == 1) \
c->downmix = ff_ac3_downmix_ ## ch ## _to_1_ ## suf; \
else \
c->downmix = ff_ac3_downmix_ ## ch ## _to_2_ ## suf; \
} \
}
#define SET_DOWNMIX_ALL(suf, SUF) \
SET_DOWNMIX(3, suf, SUF) \
SET_DOWNMIX(4, suf, SUF) \
SET_DOWNMIX(5, suf, SUF) \
SET_DOWNMIX(6, suf, SUF)
SET_DOWNMIX_ALL(sse, SSE)
if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
SET_DOWNMIX_ALL(avx, AVX)
SET_DOWNMIX_ALL(fma3, FMA3)
}
}
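To make the SET_DOWNMIX machinery above easier to follow, expanding SET_DOWNMIX(3, sse, SSE) by hand gives roughly the code below (derived from the macro text in this file, not from generated output):

if (3 == c->in_channels) {
    if (EXTERNAL_SSE(cpu_flags)) {
        if (c->out_channels == 1)
            c->downmix = ff_ac3_downmix_3_to_1_sse;
        else
            c->downmix = ff_ac3_downmix_3_to_2_sse;
    }
}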


@@ -0,0 +1,44 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/alacdsp.h"
#include "config.h"
void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
int decorr_shift, int decorr_left_weight);
void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
int extra_bits, int channels, int nb_samples);
void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
int extra_bits, int channels, int nb_samples);
av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_stereo = ff_alac_decorrelate_stereo_sse4;
}
#endif /* HAVE_X86ASM */
}


@@ -0,0 +1,66 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/audiodsp.h"
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clipf_sse(float *dst, const float *src,
int len, float min, float max);
av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags))
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
if (EXTERNAL_MMXEXT(cpu_flags))
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
if (EXTERNAL_SSE(cpu_flags))
c->vector_clipf = ff_vector_clipf_sse;
if (EXTERNAL_SSE2(cpu_flags)) {
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
if (cpu_flags & AV_CPU_FLAG_ATOM)
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
else
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (EXTERNAL_SSE4(cpu_flags))
c->vector_clip_int32 = ff_vector_clip_int32_sse4;
}


@@ -0,0 +1,60 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/blockdsp.h"
#include "libavcodec/version.h"
void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);
void ff_clear_block_avx(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
void ff_clear_blocks_avx(int16_t *blocks);
av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
AVCodecContext *avctx)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->clear_block = ff_clear_block_mmx;
c->clear_blocks = ff_clear_blocks_mmx;
}
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
return;
if (EXTERNAL_SSE(cpu_flags)) {
c->clear_block = ff_clear_block_sse;
c->clear_blocks = ff_clear_blocks_sse;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
c->clear_block = ff_clear_block_avx;
c->clear_blocks = ff_clear_blocks_avx;
}
#endif /* HAVE_X86ASM */
}


@@ -0,0 +1,40 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/bswapdsp.h"
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags))
c->bswap_buf = ff_bswap32_buf_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
c->bswap_buf = ff_bswap32_buf_ssse3;
if (EXTERNAL_AVX2_FAST(cpu_flags))
c->bswap_buf = ff_bswap32_buf_avx2;
}


@@ -0,0 +1,301 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CABAC_H
#define AVCODEC_X86_CABAC_H
#include "libavcodec/cabac.h"
#include "libavutil/attributes.h"
#include "libavutil/macros.h"
#include "libavutil/x86/asm.h"
#include "config.h"
#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
|| ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
|| (defined(__INTEL_COMPILER) && defined(_MSC_VER))
# define BROKEN_COMPILER 1
#else
# define BROKEN_COMPILER 0
#endif
#if HAVE_INLINE_ASM
#ifndef UNCHECKED_BITSTREAM_READER
#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
#endif
#if UNCHECKED_BITSTREAM_READER
#define END_CHECK(end) ""
#else
#define END_CHECK(end) \
"cmp "end" , %%"FF_REG_c" \n\t"\
"jge 1f \n\t"
#endif
#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%rcx , %%rcx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%rcx , "retq" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
/* P4 Prescott has crappy cmov,sbb,64-bit shift so avoid them */ \
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t"\
"sub %%ecx , "range" \n\t"\
"and "tmp" , "range" \n\t"\
"add %%ecx , "range" \n\t"\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"\
"movslq "ret" , "retq" \n\t"
#endif /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"lea ("ret", "range", 2), %%ecx \n\t"\
"movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
"jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
#else /* BROKEN_RELOCATIONS */
#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
#define RIP_ARG
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%ecx , %%ecx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t" /*lps_mask*/\
"sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
"and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
"add %%ecx , "range" \n\t" /*new range*/\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
" jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
#endif /* BROKEN_RELOCATIONS */
#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
{
int bit, tmp;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
"%2", "%q2", "%3", "%b3",
"%c6(%5)", "%c7(%5)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%8")
: "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
,"1"(c->low), "2"(c->range)
: "%"FF_REG_c, "memory"
);
return bit & 1;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"xor %%edx, %%ecx \n\t"
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
#if UNCHECKED_BITSTREAM_READER
"add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
"mov %1, %c4(%2) \n\t"
#else
"addl %%edx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
#endif
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
: "+c"(val), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
return val;
}
#define get_cabac_bypass get_cabac_bypass_x86
static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
{
x86_reg tmp;
int res;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"inc %%edx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%ecx \n\t"
"bswap %%ecx \n\t"
"shrl $15, %%ecx \n\t"
"addl %%ecx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
: "=&d"(res), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%ecx", "memory"
);
return res;
}
#endif /* !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */
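For orientation, the bypass paths above implement the same decode step as the plain-C bypass reader; a condensed sketch follows (get_cabac_bypass_ref is an illustrative name, CABAC_BITS == 16 and the unchecked reader are assumed, and the refill step is condensed from the asm's own subl/movzwl/bswap/shr sequence):

static int get_cabac_bypass_ref(CABACContext *c)
{
    c->low += c->low;                      /* shift one bypass bit into the coder window */
    if (!(c->low & 0xFFFF)) {              /* low 16 bits consumed: refill from the bytestream */
        c->low += (c->bytestream[0] << 9) + (c->bytestream[1] << 1) - 0xFFFF;
        c->bytestream += 2;
    }
    if (c->low < (c->range << 17))
        return 0;
    c->low -= c->range << 17;
    return 1;
}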


@@ -0,0 +1,463 @@
/*
* Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
*
* MMX-optimized DSP functions, based on H.264 optimizations by
* Michael Niedermayer and Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/cavsdsp.h"
#include "libavcodec/idctdsp.h"
#include "constants.h"
#include "fpel.h"
#include "idctdsp.h"
#include "config.h"
#if HAVE_MMX_EXTERNAL
void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
LOCAL_ALIGNED(16, int16_t, b2, [64]);
ff_cavs_idct8_mmx(b2, block);
ff_add_pixels_clamped_mmx(b2, dst, stride);
}
void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);
static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
LOCAL_ALIGNED(16, int16_t, b2, [64]);
ff_cavs_idct8_sse2(b2, block);
ff_add_pixels_clamped_sse2(b2, dst, stride);
}
#endif /* HAVE_MMX_EXTERNAL */
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
/*****************************************************************************
*
* motion compensation
*
****************************************************************************/
/* vertical filter [-1 -2 96 42 -7 0] */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\
"psllw $3, "#E" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $3, "#E" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#E", %%mm6 \n\t"\
"paddw "#B", "#B" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $1, "#B" \n\t"\
"psubw "#A", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -1 5 5 -1 0] */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -7 42 96 -2 -1] */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm7\n\t"\
"psllw $3, "#B" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $3, "#B" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#B", %%mm6 \n\t"\
"paddw "#E", "#E" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $1, "#E" \n\t"\
"psubw "#F", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
}\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}
#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm5, %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q) \
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
: "memory"\
);\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define CAVS_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMXEXT_OP(a, b, temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
#if HAVE_MMX_EXTERNAL
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels8_mmx(dst, src, stride, 8);
}
static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels8_mmx(dst, src, stride, 8);
}
static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels8_mmxext(dst, src, stride, 8);
}
static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_mmx(dst, src, stride, 16);
}
static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_mmx(dst, src, stride, 16);
}
static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_mmxext(dst, src, stride, 16);
}
static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#endif
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
AVCodecContext *avctx)
{
#if HAVE_MMX_EXTERNAL
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
c->cavs_idct8_add = cavs_idct8_add_mmx;
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
#endif /* HAVE_MMX_EXTERNAL */
}
#define DSPFUNC(PFX, IDX, NUM, EXT) \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT;
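/* Note (added): DSPFUNC fills slots 2, 4, 8 and 12 of the 16-entry qpel table;
 * assuming the usual x + 4*y quarter-pel indexing, these are the mc20 half-pel
 * horizontal case and the mc01/mc02/mc03 vertical cases that CAVS_MC generates
 * above. */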
#if HAVE_MMXEXT_INLINE
QPEL_CAVS(put_, PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)
CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
AVCodecContext *avctx)
{
DSPFUNC(put, 0, 16, 3dnow);
DSPFUNC(put, 1, 8, 3dnow);
DSPFUNC(avg, 0, 16, 3dnow);
DSPFUNC(avg, 1, 8, 3dnow);
}
#endif /* HAVE_AMD3DNOW_INLINE */
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
av_unused int cpu_flags = av_get_cpu_flags();
if (X86_MMX(cpu_flags))
cavsdsp_init_mmx(c, avctx);
#if HAVE_AMD3DNOW_INLINE
if (INLINE_AMD3DNOW(cpu_flags))
cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags)) {
DSPFUNC(put, 0, 16, mmxext);
DSPFUNC(put, 1, 8, mmxext);
DSPFUNC(avg, 0, 16, mmxext);
DSPFUNC(avg, 1, 8, mmxext);
}
#endif
#if HAVE_MMX_EXTERNAL
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
}
#endif
#if HAVE_SSE2_EXTERNAL
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
c->cavs_idct8_add = cavs_idct8_add_sse2;
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
}
#endif
}


@ -0,0 +1,43 @@
/*
* Opus encoder assembly optimizations
* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/opus_pvq.h"
extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N);
av_cold void ff_celt_pvq_init_x86(CeltPVQ *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags))
s->pvq_search = ff_pvq_search_approx_sse2;
if (EXTERNAL_SSE4(cpu_flags))
s->pvq_search = ff_pvq_search_approx_sse4;
if (EXTERNAL_AVX_FAST(cpu_flags))
s->pvq_search = ff_pvq_search_exact_avx;
}
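/* Illustrative sketch (not part of the original file): after this init runs,
 * callers use the selected implementation through the function pointer, e.g.
 *
 *     CeltPVQ *pvq = ...;                      // hypothetical context
 *     float syy = pvq->pvq_search(X, y, K, N); // SSE2/SSE4/AVX chosen above
 *
 * so all three variants stay interchangeable behind one signature. */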


@ -0,0 +1,385 @@
;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "config.asm"
%include "libavutil/x86/x86util.asm"
%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif
SECTION_RODATA 64
const_float_abs_mask: times 8 dd 0x7fffffff
const_align_abs_edge: times 8 dd 0
const_float_0_5: times 8 dd 0.5
const_float_1: times 8 dd 1.0
const_float_sign_mask: times 8 dd 0x80000000
const_int32_offsets:
%rep 8
dd $-const_int32_offsets
%endrep
SECTION .text
;
; Set up a high register to be used
; for holding memory constants
;
; %1 - mov opcode to be used
; %2 - the register to be used; assumes it is >= mm8
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32-bit arch or
; "addps m0, m8" on 64-bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
%define mm_%3 %2
%{1} %2, [%3] ; movaps m8, [const_name]
%else
%define mm_%3 [%3]
%endif
%endmacro
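; Example (added for illustration, mirroring the use further below):
;   SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
; on builds with more than 8 mm registers defines mm_const_float_0_5 as m8 and
; emits "movaps m8, [const_float_0_5]"; otherwise it only defines
; mm_const_float_0_5 as the memory operand [const_float_0_5] and emits nothing.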
;
; Set up the Position Independent Code
; base address of a constant
;
; %1 - the opcode to be used (e.g. lea)
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name + r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled or
; "movaps m0, [constant_name + r4]" if text relocations are used
%macro SET_PIC_BASE 3; reg, const_label
%ifdef PIC
%{1} %2, [%3] ; lea r5, [rip+const]
%define pic_base_%3 %2
%else
%define pic_base_%3 %3
%endif
%endmacro
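; Example (added for illustration, mirroring the use in PVQ_FAST_SEARCH below):
;   SET_PIC_BASE lea, r5, const_align_abs_edge
; with PIC defined emits the lea that loads the constant's address into r5 and
; defines pic_base_const_align_abs_edge as r5; without PIC it only aliases
; pic_base_const_align_abs_edge to the label itself and emits nothing.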
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
pxor m1, m1 ; max_idx
xorps m3, m3 ; p_max
xor r4d, r4d
align 16
%%distortion_search:
movd xm2, dword r4d ; movd zero extends
%ifidn %1,add
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
%if USE_APPROXIMATION == 1
xorps m0, m0
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
%endif
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
%if USE_APPROXIMATION == 1
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
%endif
%else
movaps m5, [tmpY + r4] ; m5 = y[i]
xorps m0, m0 ; m0 = 0;
cmpps m0, m0, m5, 1 ; m0 = (0<y)
subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
subps m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
andps m5, m0 ; (0<y)?m5:0
%endif
%if USE_APPROXIMATION == 1
rsqrtps m4, m4
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
mulps m5, m5
divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
VPBROADCASTD m2, xm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later)
cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
maxps m3, m5 ; m3=max(p_max,p)
; maxps here is faster than blendvps, despite blend having lower latency.
pand m2, m0 ; This version seems faster than sse41 pblendvb
pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4
add r4d, mmsize
cmp r4d, Nd
jb %%distortion_search
por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing
%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)
vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[1x128] : p[0x128]
%endif
; Merge parallel maximums round 4 (2 vs 2)
; m3=p[3210]
movhlps xm5, xm3 ; m5=p[xx32]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
pshufd xm2, xm1, q3232
BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[3,2] : p[1,0]
; Merge parallel maximums final round (1 vs 1)
shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )
pshufd xm2, xm1, q1111
PBLENDVB xm1, xm2, xm0
movd dword r4d, xm1 ; zero extends to the rest of r4q
VBROADCASTSS m3, [tmpX + r4]
%{1}ps m7, m3 ; Sxy += X[max_idx]
VBROADCASTSS m5, [tmpY + r4]
%{1}ps m6, m5 ; Syy += Y[max_idx]
; We have to update a single element in Y[i]
; However, writing 4 bytes and then doing a 16-byte load in the inner loop
; could cause a stall because it breaks store-to-load forwarding.
VPBROADCASTD m1, xm1
pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it
and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load
movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0]
andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
%{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0;
%endmacro
;
; We need one more register for
; PIC-relative addressing; it is
; counted via num_pic_regs in cglobal
;
%ifdef PIC
%define num_pic_regs 1
%else
%define num_pic_regs 0
%endif
;
; Pyramid Vector Quantization Search implementation
;
; float * inX - Unaligned (SIMD) access; it will be over-read,
; but the extra data is masked away.
; int32 * outY - Should be an aligned and padded buffer.
; It is also used as a temp buffer.
; uint32 K - Number of pulses to have after quantization.
; uint32 N - Number of vector elements. Must be 0 < N < 256
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq
movaps m0, [const_float_abs_mask]
shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
mov r4d, Nd
neg r4d
and r4d, mmsize-1
SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
movups m2, [pic_base_const_align_abs_edge + r4 - mmsize]
add Nd, r4d ; N = align(N, mmsize)
lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
movups m1, [inXq + r4]
andps m1, m2
movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] )
align 16
%%loop_abs_sum:
sub r4d, mmsize
jc %%end_loop_abs_sum
movups m2, [inXq + r4]
andps m2, m0
movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
addps m1, m2 ; Sx += abs(X[i])
jmp %%loop_abs_sum
align 16
%%end_loop_abs_sum:
HSUMPS m1, m2 ; m1 = Sx
xorps m0, m0
comiss xm0, xm1 ;
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
%if USE_APPROXIMATION == 1
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
%else
divss xm0, xm1 ; b = K/Sx
; b = K/max_x
%endif
VBROADCASTSS m0, xm0
lea r4d, [Nd - mmsize]
pxor m5, m5 ; Sy ( Sum of abs( y[i]) )
xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
align 16
%%loop_guess:
movaps m1, [tmpX + r4] ; m1 = X[i]
mulps m2, m0, m1 ; m2 = res*X[i]
cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
paddd m5, m2 ; Sy += yt
cvtdq2ps m2, m2 ; yt = (float)yt
mulps m1, m2 ; m1 = X[i]*yt
movaps [tmpY + r4], m2 ; y[i] = m2
addps m7, m1 ; Sxy += m1;
mulps m2, m2 ; m2 = yt*yt
addps m6, m2 ; Syy += m2
sub r4d, mmsize
jnc %%loop_guess
HSUMPS m6, m1 ; Syy_norm
HADDD m5, m4 ; pulses
movd dword r4d, xm5 ; zero extends to the rest of r4q
sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
jz %%finish ; K - pulses == 0
SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1
SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in the distortion parameter calculations.
; This saves pre- and post-calculation corrections to the Y[] values.
; Precision is the same, since the float mantissa is normalized.
; The SQRT approximation does differ, though.
HSUMPS m7, m0 ; Sxy_norm
mulps m6, mm_const_float_0_5
jc %%remove_pulses_loop ; K - pulses < 0
align 16 ; K - pulses > 0
%%add_pulses_loop:
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
sub Kd, 1
jnz %%add_pulses_loop
addps m6, m6 ; Syy*=2
jmp %%finish
align 16
%%remove_pulses_loop:
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
add Kd, 1
jnz %%remove_pulses_loop
addps m6, m6 ; Syy*=2
align 16
%%finish:
lea r4d, [Nd - mmsize]
movaps m2, [const_float_sign_mask]
align 16
%%restore_sign_loop:
movaps m0, [tmpY + r4] ; m0 = Y[i]
movups m1, [inXq + r4] ; m1 = X[i]
andps m1, m2 ; m1 = sign(X[i])
orps m0, m1 ; m0 = Y[i]*sign
cvtps2dq m3, m0 ; m3 = (int)m0
movaps [outYq + r4], m3
sub r4d, mmsize
jnc %%restore_sign_loop
%%return:
%if ARCH_X86_64 == 0 ; sbrdsp
movss r0m, xm6 ; return (float)Syy_norm
fld dword r0m
%else
movaps m0, m6 ; return (float)Syy_norm
%endif
RET
align 16
%%zero_input:
lea r4d, [Nd - mmsize]
xorps m0, m0
%%zero_loop:
movaps [outYq + r4], m0
sub r4d, mmsize
jnc %%zero_loop
movaps m6, [const_float_1]
jmp %%return
%endmacro
; If 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full-precision code about 2% slower.
; Opus also uses the rsqrt approximation in its intrinsics code.
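; Added note: inside PULSES_SEARCH this selects between
;   p = Sxy_new * rsqrtps(Syy_new)      (USE_APPROXIMATION == 1)
;   p = Sxy_new * Sxy_new / Syy_new     (USE_APPROXIMATION == 0)
; For non-negative Sxy_new both orderings agree, so the chosen max_idx is
; normally the same and only the rsqrt rounding differs.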
%define USE_APPROXIMATION 1
INIT_XMM sse2
PVQ_FAST_SEARCH _approx
INIT_XMM sse4
PVQ_FAST_SEARCH _approx
%define USE_APPROXIMATION 0
INIT_XMM avx
PVQ_FAST_SEARCH _exact


@ -0,0 +1,94 @@
/*
* MMX/SSE/AVX constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL,
0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
0x0100010001000100ULL, 0x0100010001000100ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL,
0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
0x0400040004000400ULL, 0x0400040004000400ULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
0x0800080008000800ULL, 0x0800080008000800ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
0x1000100010001000ULL, 0x1000100010001000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
0x2000200020002000ULL, 0x2000200020002000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL,
0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL,
0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x0202020202020202ULL,
0x0202020202020202ULL, 0x0202020202020202ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL,
0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
0x0000000100000001ULL, 0x0000000100000001ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
0x0000001000000010ULL, 0x0000001000000010ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
0x0000002000000020ULL, 0x0000002000000020ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
0x0000200000002000ULL, 0x0000200000002000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };


@ -0,0 +1,72 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CONSTANTS_H
#define AVCODEC_X86_CONSTANTS_H
#include <stdint.h>
#include "libavutil/x86/asm.h"
extern const ymm_reg ff_pw_1;
extern const ymm_reg ff_pw_2;
extern const xmm_reg ff_pw_3;
extern const ymm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
extern const xmm_reg ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
extern const xmm_reg ff_pw_20;
extern const xmm_reg ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const ymm_reg ff_pw_255;
extern const ymm_reg ff_pw_256;
extern const ymm_reg ff_pw_512;
extern const ymm_reg ff_pw_1023;
extern const ymm_reg ff_pw_1024;
extern const ymm_reg ff_pw_2048;
extern const ymm_reg ff_pw_4095;
extern const ymm_reg ff_pw_4096;
extern const ymm_reg ff_pw_8192;
extern const ymm_reg ff_pw_m1;
extern const ymm_reg ff_pb_0;
extern const ymm_reg ff_pb_1;
extern const ymm_reg ff_pb_2;
extern const ymm_reg ff_pb_3;
extern const ymm_reg ff_pb_80;
extern const ymm_reg ff_pb_FE;
extern const uint64_t ff_pb_FC;
extern const xmm_reg ff_ps_neg;
extern const ymm_reg ff_pd_1;
extern const ymm_reg ff_pd_16;
extern const ymm_reg ff_pd_32;
extern const ymm_reg ff_pd_8192;
extern const ymm_reg ff_pd_65535;
#endif /* AVCODEC_X86_CONSTANTS_H */


@ -0,0 +1,52 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"
#define LFE_FIR_FLOAT_FUNC(opt) \
void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
const float *filter_coeff, ptrdiff_t npcmblocks); \
void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
const float *filter_coeff, ptrdiff_t npcmblocks);
LFE_FIR_FLOAT_FUNC(sse)
LFE_FIR_FLOAT_FUNC(sse2)
LFE_FIR_FLOAT_FUNC(sse3)
LFE_FIR_FLOAT_FUNC(avx)
LFE_FIR_FLOAT_FUNC(fma3)
av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
if (EXTERNAL_SSE3(cpu_flags))
s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
if (EXTERNAL_AVX(cpu_flags)) {
s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
}
if (EXTERNAL_FMA3(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}
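/* Note (added): the checks run from oldest to newest extension, so the last
 * matching assignment wins; e.g. on a CPU with SSE2, AVX and FMA3 the
 * lfe_fir_float[0] pointer ends up at ff_lfe_fir0_float_fma3, while
 * lfe_fir_float[1] keeps the AVX version. */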


@ -0,0 +1,41 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
av_cold void ff_dct_init_x86(DCTContext *s)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags))
s->dct32 = ff_dct32_float_sse;
#endif
if (EXTERNAL_SSE2(cpu_flags))
s->dct32 = ff_dct32_float_sse2;
if (EXTERNAL_AVX_FAST(cpu_flags))
s->dct32 = ff_dct32_float_avx;
}


@ -0,0 +1,229 @@
/*
* x86 optimized discrete wavelet transform
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dirac_dwt.h"
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
\
static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
int16_t *b2 = (int16_t *)_b2; \
\
for(i=width_align; i<width; i++) \
b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
int16_t *b2 = (int16_t *)_b2; \
\
for(i=width_align; i<width; i++) \
b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
uint8_t *_b3, uint8_t *_b4, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
int16_t *b2 = (int16_t *)_b2; \
int16_t *b3 = (int16_t *)_b3; \
int16_t *b4 = (int16_t *)_b4; \
\
for(i=width_align; i<width; i++) \
b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
uint8_t *_b3, uint8_t *_b4, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
int16_t *b2 = (int16_t *)_b2; \
int16_t *b3 = (int16_t *)_b3; \
int16_t *b4 = (int16_t *)_b4; \
\
for(i=width_align; i<width; i++) \
b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
\
for(i=width_align; i<width; i++) { \
b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
} \
\
ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
int w2= w>>1;\
int x= w2 - (w2&(align-1));\
int16_t *b = (int16_t *)_b; \
int16_t *tmp = (int16_t *)_tmp; \
\
ff_horizontal_compose_haar0i##ext(b, tmp, w);\
\
for (; x < w2; x++) {\
b[2*x ] = tmp[x];\
b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
}\
}\
static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
int w2= w>>1;\
int x= w2 - (w2&(align-1));\
int16_t *b = (int16_t *)_b; \
int16_t *tmp = (int16_t *)_tmp; \
\
ff_horizontal_compose_haar1i##ext(b, tmp, w);\
\
for (; x < w2; x++) {\
b[2*x ] = (tmp[x] + 1)>>1;\
b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
}\
}
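/* Note (added): the vertical wrappers above hand only the aligned prefix
 * (width_align) to the corresponding ff_ assembly routine and finish the
 * remaining columns in scalar C, while the horizontal haar wrappers call the
 * assembly on the full width and then compute the last (w2 & (align-1))
 * output pairs in C. */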
#if HAVE_X86ASM
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)
void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);
static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
{
int w2= w>>1;
int x= w2 - (w2&7);
int16_t *b = (int16_t *)_b;
int16_t *tmp = (int16_t *)_tmp;
ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
for (; x < w2; x++) {
b[2*x ] = (tmp[x] + 1)>>1;
b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
}
}
#endif
void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
{
#if HAVE_X86ASM
int mm_flags = av_get_cpu_flags();
#if !ARCH_X86_64
if (!(mm_flags & AV_CPU_FLAG_MMX))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
break;
case DWT_DIRAC_DD13_7:
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_mmx;
d->horizontal_compose = horizontal_compose_haar0i_mmx;
break;
case DWT_DIRAC_HAAR1:
d->vertical_compose = (void*)vertical_compose_haar_mmx;
d->horizontal_compose = horizontal_compose_haar1i_mmx;
break;
}
#endif
if (!(mm_flags & AV_CPU_FLAG_SSE2))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
break;
case DWT_DIRAC_DD13_7:
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
d->horizontal_compose = horizontal_compose_haar0i_sse2;
break;
case DWT_DIRAC_HAAR1:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
d->horizontal_compose = horizontal_compose_haar1i_sse2;
break;
}
if (!(mm_flags & AV_CPU_FLAG_SSSE3))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->horizontal_compose = horizontal_compose_dd97i_ssse3;
break;
}
#endif // HAVE_X86ASM
}


@ -0,0 +1,195 @@
/*
* Copyright (C) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/cpu.h"
#include "libavcodec/diracdsp.h"
#include "fpel.h"
DECL_DIRAC_PIXOP(put, mmx);
DECL_DIRAC_PIXOP(avg, mmx);
DECL_DIRAC_PIXOP(avg, mmxext);
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
#if HAVE_X86ASM
#define HPEL_FILTER(MMSIZE, EXT) \
void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
\
static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
const uint8_t *src, int stride, int width, int height) \
{ \
while( height-- ) \
{ \
ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
\
dsth += stride; \
dstv += stride; \
dstc += stride; \
src += stride; \
} \
}
#define PIXFUNC(PFX, IDX, EXT) \
/*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
else\
OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
else\
OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3) {\
ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
} else {\
OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}\
}
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
DIRAC_PIXOP(avg, ff_avg, mmxext)
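/* Note (added): like the SSE2 wrappers below, the functions generated above
 * fall back to the C versions whenever (h & 3) != 0, so the SIMD paths only
 * handle heights that are a multiple of 4 rows. */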
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3)
ff_put_dirac_pixels16_c(dst, src, stride, h);
else
ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3)
ff_avg_dirac_pixels16_c(dst, src, stride, h);
else
ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3) {
ff_put_dirac_pixels32_c(dst, src, stride, h);
} else {
ff_put_pixels16_sse2(dst , src[0] , stride, h);
ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3) {
ff_avg_dirac_pixels32_c(dst, src, stride, h);
} else {
ff_avg_pixels16_sse2(dst , src[0] , stride, h);
ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
}
#else // HAVE_X86ASM
#define HPEL_FILTER(MMSIZE, EXT) \
void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
const uint8_t *src, int stride, int width, int height);
#define PIXFUNC(PFX, IDX, EXT) do {} while (0)
#endif // HAVE_X86ASM
#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)
void ff_diracdsp_init_x86(DiracDSPContext* c)
{
int mm_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(mm_flags)) {
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
#if !ARCH_X86_64
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
c->dirac_hpel_filter = dirac_hpel_filter_mmx;
c->add_rect_clamped = ff_add_rect_clamped_mmx;
c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
#endif
PIXFUNC(put, 0, mmx);
PIXFUNC(avg, 0, mmx);
}
if (EXTERNAL_MMXEXT(mm_flags)) {
PIXFUNC(avg, 0, mmxext);
}
if (EXTERNAL_SSE2(mm_flags)) {
c->dirac_hpel_filter = dirac_hpel_filter_sse2;
c->add_rect_clamped = ff_add_rect_clamped_sse2;
c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
}
if (EXTERNAL_SSE4(mm_flags)) {
c->dequant_subband[1] = ff_dequant_subband_32_sse4;
c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
}
}


@ -0,0 +1,37 @@
/*
* VC3/DNxHD SIMD functions
* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dnxhdenc.h"
void ff_get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
if (EXTERNAL_SSE2(av_get_cpu_flags())) {
if (ctx->cid_table->bit_depth == 8)
ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_sse2;
}
}


@ -0,0 +1,52 @@
/*
* OpenEXR (.exr) image decoder
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/exrdsp.h"
void ff_reorder_pixels_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
void ff_reorder_pixels_avx2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
void ff_predictor_ssse3(uint8_t *src, ptrdiff_t size);
void ff_predictor_avx(uint8_t *src, ptrdiff_t size);
void ff_predictor_avx2(uint8_t *src, ptrdiff_t size);
av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->reorder_pixels = ff_reorder_pixels_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
dsp->predictor = ff_predictor_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
dsp->predictor = ff_predictor_avx;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
dsp->reorder_pixels = ff_reorder_pixels_avx2;
dsp->predictor = ff_predictor_avx2;
}
}


@ -0,0 +1,594 @@
/*
* SIMD-optimized forward DCT
* The gcc porting is Copyright (c) 2001 Fabrice Bellard.
* cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
*
* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
*
* Intel Application Note AP-922 - fast, precise implementation of DCT
* http://developer.intel.com/vtune/cbts/appnotes.htm
*
* Also of inspiration:
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
* Skal's fdct at http://skal.planet-d.net/coding/dct.html
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "fdct.h"
#if HAVE_MMX_INLINE
//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries! Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
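/* Worked values (added for reference): with BITS_FRW_ACC == 3 the shifts are
 * SHIFT_FRW_COL == 3 and SHIFT_FRW_ROW == 3 + 17 - 3 == 17, so
 * RND_FRW_ROW == 1 << 16 == 65536, i.e. row results are rounded to nearest
 * before the final 17-bit arithmetic right shift. */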
#define X8(x) x,x,x,x,x,x,x,x
//concatenated table, for forward DCT transformation
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
X8(13036), // tg * (2<<16) + 0.5
X8(27146), // tg * (2<<16) + 0.5
X8(-21746) // tg * (2<<16) + 0.5
};
DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
X8(23170) //cos * (2<<15) + 0.5
};
DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
static const struct
{
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
{{
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
};
static const struct
{
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
C4, C4, C5, C7, C2, C6, C3, -C7, \
-C4, C4, C7, C3, C6, -C2, C7, -C5, \
C4, -C4, C5, -C1, C2, -C6, C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};
#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
__asm__ volatile (\
#mov" 16(%0), %%"#mm"0 \n\t" \
#mov" 96(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"0, %%"#mm"2 \n\t" \
#mov" 32(%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" 80(%0), %%"#mm"4 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
#mov" (%0), %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
"paddsw 112(%0), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
#mov" %%"#mm"0, %%"#mm"6 \n\t" \
"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
#mov" 16(%1), %%"#mm"1 \n\t" \
"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
#mov" 48(%0), %%"#mm"7 \n\t" \
"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
"paddsw 64(%0), %%"#mm"7 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
#mov" %%"#mm"5, %%"#mm"4 \n\t" \
"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
"por (%2), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
"pmulhw 16(%1), %%"#mm"5 \n\t" \
#mov" %%"#mm"4, %%"#mm"7 \n\t" \
"psubsw 80(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" %%"#mm"1, 32(%3) \n\t" \
"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" 48(%0), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
"psubsw 64(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"2, %%"#mm"6 \n\t" \
#mov" %%"#mm"4, 64(%3) \n\t" \
"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
"pmulhw (%4), %%"#mm"2 \n\t" \
"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
"pmulhw (%4), %%"#mm"6 \n\t" \
"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
"por (%2), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
"por (%2), %%"#mm"2 \n\t" \
#mov" %%"#mm"1, %%"#mm"4 \n\t" \
#mov" (%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
"psubsw 112(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" (%1), %%"#mm"0 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
#mov" 32(%1), %%"#mm"6 \n\t" \
"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" %%"#mm"7, (%3) \n\t" \
"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
#mov" %%"#mm"5, 96(%3) \n\t" \
#mov" %%"#mm"3, %%"#mm"7 \n\t" \
#mov" 32(%1), %%"#mm"5 \n\t" \
"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"pmulhw (%1), %%"#mm"3 \n\t" \
"por (%2), %%"#mm"0 \n\t" \
"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" %%"#mm"0, 16(%3) \n\t" \
"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
#mov" %%"#mm"7, 48(%3) \n\t" \
"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
#mov" %%"#mm"5, 80(%3) \n\t" \
#mov" %%"#mm"3, 112(%3) \n\t" \
: \
: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
"r" (out + offset), "r" (ocos_4_16)); \
}
FDCT_COL(mmx, mm, movq)
FDCT_COL(sse2, xmm, movdqa)
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
__asm__ volatile(
#define FDCT_ROW_SSE2_H1(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t" \
"movdqa " #t "(%1), %%xmm4 \n\t" \
"movdqa " #t "+16(%1), %%xmm5 \n\t"
#define FDCT_ROW_SSE2_H2(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t"
#define FDCT_ROW_SSE2(i) \
"movq %%xmm2, %%xmm1 \n\t" \
"pshuflw $27, %%xmm0, %%xmm0 \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"psubsw %%xmm0, %%xmm2 \n\t" \
"punpckldq %%xmm2, %%xmm1 \n\t" \
"pshufd $78, %%xmm1, %%xmm2 \n\t" \
"pmaddwd %%xmm2, %%xmm3 \n\t" \
"pmaddwd %%xmm1, %%xmm7 \n\t" \
"pmaddwd %%xmm5, %%xmm2 \n\t" \
"pmaddwd %%xmm4, %%xmm1 \n\t" \
"paddd %%xmm7, %%xmm3 \n\t" \
"paddd %%xmm2, %%xmm1 \n\t" \
"paddd %%xmm6, %%xmm3 \n\t" \
"paddd %%xmm6, %%xmm1 \n\t" \
"psrad %3, %%xmm3 \n\t" \
"psrad %3, %%xmm1 \n\t" \
"packssdw %%xmm3, %%xmm1 \n\t" \
"movdqa %%xmm1, " #i "(%4) \n\t"
"movdqa (%2), %%xmm6 \n\t"
FDCT_ROW_SSE2_H1(0,0)
FDCT_ROW_SSE2(0)
FDCT_ROW_SSE2_H2(64,0)
FDCT_ROW_SSE2(64)
FDCT_ROW_SSE2_H1(16,64)
FDCT_ROW_SSE2(16)
FDCT_ROW_SSE2_H2(112,64)
FDCT_ROW_SSE2(112)
FDCT_ROW_SSE2_H1(32,128)
FDCT_ROW_SSE2(32)
FDCT_ROW_SSE2_H2(96,128)
FDCT_ROW_SSE2(96)
FDCT_ROW_SSE2_H1(48,192)
FDCT_ROW_SSE2(48)
FDCT_ROW_SSE2_H2(80,192)
FDCT_ROW_SSE2(80)
:
: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
"r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
const int16_t *table)
{
__asm__ volatile (
"pshufw $0x1B, 8(%0), %%mm5 \n\t"
"movq (%0), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"psubsw %%mm5, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm1, %%mm0 \n\t"
"punpckhdq %%mm1, %%mm2 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, (%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
__asm__ volatile(
"movd 12(%0), %%mm1 \n\t"
"punpcklwd 8(%0), %%mm1 \n\t"
"movq %%mm1, %%mm2 \n\t"
"psrlq $0x20, %%mm1 \n\t"
"movq 0(%0), %%mm0 \n\t"
"punpcklwd %%mm2, %%mm1 \n\t"
"movq %%mm0, %%mm5 \n\t"
"paddsw %%mm1, %%mm0 \n\t"
"psubsw %%mm1, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm5, %%mm0 \n\t"
"punpckhdq %%mm5, %%mm2 \n\t"
"movq 0(%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, 0(%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
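/* Full 8x8 forward DCT: two column passes (left and right halves of the
 * block) into a temporary buffer, then one row pass per line, with the
 * coefficient table advancing by 32 int16_t entries per row. */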
void ff_fdct_mmx(int16_t *block)
{
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
int16_t * block1= (int16_t*)align_tmp;
const int16_t *table= tab_frw_01234567;
int i;
fdct_col_mmx(block, block1, 0);
fdct_col_mmx(block, block1, 4);
for(i=8;i>0;i--) {
fdct_row_mmx(block1, block, table);
block1 += 8;
table += 32;
block += 8;
}
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
void ff_fdct_mmxext(int16_t *block)
{
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
int16_t *block1= (int16_t*)align_tmp;
const int16_t *table= tab_frw_01234567;
int i;
fdct_col_mmx(block, block1, 0);
fdct_col_mmx(block, block1, 4);
for(i=8;i>0;i--) {
fdct_row_mmxext(block1, block, table);
block1 += 8;
table += 32;
block += 8;
}
}
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_SSE2_INLINE
void ff_fdct_sse2(int16_t *block)
{
DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
int16_t * const block1= (int16_t*)align_tmp;
fdct_col_sse2(block, block1, 0);
fdct_row_sse2(block1, block);
}
#endif /* HAVE_SSE2_INLINE */

View file

@ -0,0 +1,28 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FDCT_H
#define AVCODEC_X86_FDCT_H
#include <stdint.h>
void ff_fdct_mmx(int16_t *block);
void ff_fdct_mmxext(int16_t *block);
void ff_fdct_sse2(int16_t *block);
#endif /* AVCODEC_X86_FDCT_H */

View file

@ -0,0 +1,44 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fdctdsp.h"
#include "fdct.h"
av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
const int dct_algo = avctx->dct_algo;
if (!high_bit_depth) {
if ((dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) {
if (INLINE_MMX(cpu_flags))
c->fdct = ff_fdct_mmx;
if (INLINE_MMXEXT(cpu_flags))
c->fdct = ff_fdct_mmxext;
if (INLINE_SSE2(cpu_flags))
c->fdct = ff_fdct_sse2;
}
}
}

File diff suppressed because it is too large

View file

@ -0,0 +1,38 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H
#include "libavcodec/fft.h"
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
#endif /* AVCODEC_X86_FFT_H */

View file

@ -0,0 +1,61 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "fft.h"
av_cold void ff_fft_init_x86(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (s->nbits > 16)
return;
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
s->imdct_calc = ff_imdct_calc_3dnow;
s->imdct_half = ff_imdct_half_3dnow;
s->fft_calc = ff_fft_calc_3dnow;
}
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
s->imdct_calc = ff_imdct_calc_3dnowext;
s->imdct_half = ff_imdct_half_3dnowext;
s->fft_calc = ff_fft_calc_3dnowext;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_sse;
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
}
if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
s->imdct_half = ff_imdct_half_avx;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
}
}

View file

@ -0,0 +1,115 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/flacdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
#define DECORRELATE_FUNCS(fmt, opt) \
void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift)
DECORRELATE_FUNCS(16, sse2);
DECORRELATE_FUNCS(16, avx);
DECORRELATE_FUNCS(32, sse2);
DECORRELATE_FUNCS(32, avx);
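/* Decorrelation dispatch: the independent-channel kernels are picked by
 * sample format and channel count, with the 8-channel variants restricted to
 * x86-64; the LPC routines are selected separately (SSE4, then XOP). */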
av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
int bps)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if CONFIG_FLAC_DECODER
if (EXTERNAL_SSE2(cpu_flags)) {
if (fmt == AV_SAMPLE_FMT_S16) {
if (channels == 2)
c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
else if (channels == 4)
c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
else if (channels == 6)
c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
else if (ARCH_X86_64 && channels == 8)
c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
} else if (fmt == AV_SAMPLE_FMT_S32) {
if (channels == 2)
c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
else if (channels == 4)
c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
else if (channels == 6)
c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
else if (ARCH_X86_64 && channels == 8)
c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
}
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->lpc32 = ff_flac_lpc_32_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
if (fmt == AV_SAMPLE_FMT_S16) {
if (ARCH_X86_64 && channels == 8)
c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
} else if (fmt == AV_SAMPLE_FMT_S32) {
if (channels == 4)
c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
else if (channels == 6)
c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
else if (ARCH_X86_64 && channels == 8)
c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
}
}
if (EXTERNAL_XOP(cpu_flags)) {
c->lpc32 = ff_flac_lpc_32_xop;
}
#endif
#if CONFIG_FLAC_ENCODER
if (EXTERNAL_SSE4(cpu_flags)) {
if (CONFIG_GPL)
c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
}
#endif
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,55 @@
/*
* Format Conversion Utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/fmtconvert.h"
#if HAVE_X86ASM
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
const float *mul, int len);
void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
const float *mul, int len);
#endif /* HAVE_X86ASM */
av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
}
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,49 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FPEL_H
#define AVCODEC_X86_FPEL_H
#include <stddef.h>
#include <stdint.h>
void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#endif /* AVCODEC_X86_FPEL_H */

View file

@ -0,0 +1,35 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/g722dsp.h"
void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags))
dsp->apply_qmf = ff_g722_apply_qmf_sse2;
}

View file

@ -0,0 +1,39 @@
/*
* Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h263dsp.h"
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
av_cold void ff_h263dsp_init_x86(H263DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
}
}

View file

@ -0,0 +1,208 @@
/*
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* H.264 / AVC / MPEG-4 part10 codec.
* non-SIMD x86-specific optimizations for H.264
* @author Michael Niedermayer <michaelni@gmx.at>
*/
#include <stddef.h>
#include "libavcodec/cabac.h"
#include "cabac.h"
#if HAVE_INLINE_ASM
#if ARCH_X86_64
#define REG64 "r"
#else
#define REG64 "m"
#endif
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
#if HAVE_7REGS && !BROKEN_COMPILER
#define decode_significance decode_significance_x86
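/* Decode the significant_coeff_flag run of a residual block with the
 * branchless CABAC reader kept in registers; the position of each significant
 * coefficient is stored into index[] and the number of stored entries is
 * returned. */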
static int decode_significance_x86(CABACContext *c, int max_coeff,
uint8_t *significant_coeff_ctx_base,
int *index, x86_reg last_off){
void *end= significant_coeff_ctx_base + max_coeff - 1;
int minusstart= -(intptr_t)significant_coeff_ctx_base;
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
__asm__ volatile(
"3: \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"FF_REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
"test $1, %4 \n\t"
" jnz 5f \n\t"
"add"FF_OPSIZE" $4, %2 \n\t"
"4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"FF_REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
"5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
: "%"FF_REG_c, "memory"
);
return coeff_count;
}
#define decode_significance_8x8 decode_significance_8x8_x86
static int decode_significance_8x8_x86(CABACContext *c,
uint8_t *significant_coeff_ctx_base,
int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
x86_reg last=0;
x86_reg state;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
__asm__ volatile(
"mov %1, %6 \n\t"
"3: \n\t"
"mov %10, %0 \n\t"
"movzb (%0, %6), %6 \n\t"
"add %9, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
"mov %1, %6 \n\t"
"test $1, %4 \n\t"
" jz 4f \n\t"
#ifdef BROKEN_RELOCATIONS
"movzb %c14(%15, %q6), %6\n\t"
#else
"movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
#endif
"add %11, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
"mov %2, %0 \n\t"
"mov %1, %6 \n\t"
"mov %k6, (%0) \n\t"
"test $1, %4 \n\t"
" jnz 5f \n\t"
"add"FF_OPSIZE" $4, %2 \n\t"
"4: \n\t"
"add $1, %6 \n\t"
"mov %6, %1 \n\t"
"cmp $63, %6 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"mov %k6, (%0) \n\t"
"5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
REG64(sig_off), REG64(last_coeff_ctx_base),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
: "%"FF_REG_c, "memory"
);
return coeff_count;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */

View file

@ -0,0 +1,410 @@
/*
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"
#define PRED4x4(TYPE, DEPTH, OPT) \
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
const uint8_t *topright, \
ptrdiff_t stride);
PRED4x4(dc, 10, mmxext)
PRED4x4(down_left, 10, sse2)
PRED4x4(down_left, 10, avx)
PRED4x4(down_right, 10, sse2)
PRED4x4(down_right, 10, ssse3)
PRED4x4(down_right, 10, avx)
PRED4x4(vertical_left, 10, sse2)
PRED4x4(vertical_left, 10, avx)
PRED4x4(vertical_right, 10, sse2)
PRED4x4(vertical_right, 10, ssse3)
PRED4x4(vertical_right, 10, avx)
PRED4x4(horizontal_up, 10, mmxext)
PRED4x4(horizontal_down, 10, sse2)
PRED4x4(horizontal_down, 10, ssse3)
PRED4x4(horizontal_down, 10, avx)
#define PRED8x8(TYPE, DEPTH, OPT) \
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
PRED8x8(dc, 10, mmxext)
PRED8x8(dc, 10, sse2)
PRED8x8(top_dc, 10, sse2)
PRED8x8(plane, 10, sse2)
PRED8x8(vertical, 10, sse2)
PRED8x8(horizontal, 10, sse2)
#define PRED8x8L(TYPE, DEPTH, OPT)\
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
int has_topleft, \
int has_topright, \
ptrdiff_t stride);
PRED8x8L(dc, 10, sse2)
PRED8x8L(dc, 10, avx)
PRED8x8L(128_dc, 10, mmxext)
PRED8x8L(128_dc, 10, sse2)
PRED8x8L(top_dc, 10, sse2)
PRED8x8L(top_dc, 10, avx)
PRED8x8L(vertical, 10, sse2)
PRED8x8L(vertical, 10, avx)
PRED8x8L(horizontal, 10, sse2)
PRED8x8L(horizontal, 10, ssse3)
PRED8x8L(horizontal, 10, avx)
PRED8x8L(down_left, 10, sse2)
PRED8x8L(down_left, 10, ssse3)
PRED8x8L(down_left, 10, avx)
PRED8x8L(down_right, 10, sse2)
PRED8x8L(down_right, 10, ssse3)
PRED8x8L(down_right, 10, avx)
PRED8x8L(vertical_right, 10, sse2)
PRED8x8L(vertical_right, 10, ssse3)
PRED8x8L(vertical_right, 10, avx)
PRED8x8L(horizontal_up, 10, sse2)
PRED8x8L(horizontal_up, 10, ssse3)
PRED8x8L(horizontal_up, 10, avx)
#define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
PRED16x16(dc, 10, mmxext)
PRED16x16(dc, 10, sse2)
PRED16x16(top_dc, 10, mmxext)
PRED16x16(top_dc, 10, sse2)
PRED16x16(128_dc, 10, mmxext)
PRED16x16(128_dc, 10, sse2)
PRED16x16(left_dc, 10, mmxext)
PRED16x16(left_dc, 10, sse2)
PRED16x16(vertical, 10, mmxext)
PRED16x16(vertical, 10, sse2)
PRED16x16(horizontal, 10, mmxext)
PRED16x16(horizontal, 10, sse2)
/* 8-bit versions */
PRED16x16(vertical, 8, mmx)
PRED16x16(vertical, 8, sse)
PRED16x16(horizontal, 8, mmx)
PRED16x16(horizontal, 8, mmxext)
PRED16x16(horizontal, 8, ssse3)
PRED16x16(dc, 8, mmxext)
PRED16x16(dc, 8, sse2)
PRED16x16(dc, 8, ssse3)
PRED16x16(plane_h264, 8, mmx)
PRED16x16(plane_h264, 8, mmxext)
PRED16x16(plane_h264, 8, sse2)
PRED16x16(plane_h264, 8, ssse3)
PRED16x16(plane_rv40, 8, mmx)
PRED16x16(plane_rv40, 8, mmxext)
PRED16x16(plane_rv40, 8, sse2)
PRED16x16(plane_rv40, 8, ssse3)
PRED16x16(plane_svq3, 8, mmx)
PRED16x16(plane_svq3, 8, mmxext)
PRED16x16(plane_svq3, 8, sse2)
PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
PRED8x8(dc, 8, mmxext)
PRED8x8(vertical, 8, mmx)
PRED8x8(horizontal, 8, mmx)
PRED8x8(horizontal, 8, mmxext)
PRED8x8(horizontal, 8, ssse3)
PRED8x8(plane, 8, mmx)
PRED8x8(plane, 8, mmxext)
PRED8x8(plane, 8, sse2)
PRED8x8(plane, 8, ssse3)
PRED8x8(tm_vp8, 8, mmx)
PRED8x8(tm_vp8, 8, mmxext)
PRED8x8(tm_vp8, 8, sse2)
PRED8x8(tm_vp8, 8, ssse3)
PRED8x8L(top_dc, 8, mmxext)
PRED8x8L(top_dc, 8, ssse3)
PRED8x8L(dc, 8, mmxext)
PRED8x8L(dc, 8, ssse3)
PRED8x8L(horizontal, 8, mmxext)
PRED8x8L(horizontal, 8, ssse3)
PRED8x8L(vertical, 8, mmxext)
PRED8x8L(vertical, 8, ssse3)
PRED8x8L(down_left, 8, mmxext)
PRED8x8L(down_left, 8, sse2)
PRED8x8L(down_left, 8, ssse3)
PRED8x8L(down_right, 8, mmxext)
PRED8x8L(down_right, 8, sse2)
PRED8x8L(down_right, 8, ssse3)
PRED8x8L(vertical_right, 8, mmxext)
PRED8x8L(vertical_right, 8, sse2)
PRED8x8L(vertical_right, 8, ssse3)
PRED8x8L(vertical_left, 8, sse2)
PRED8x8L(vertical_left, 8, ssse3)
PRED8x8L(horizontal_up, 8, mmxext)
PRED8x8L(horizontal_up, 8, ssse3)
PRED8x8L(horizontal_down, 8, mmxext)
PRED8x8L(horizontal_down, 8, sse2)
PRED8x8L(horizontal_down, 8, ssse3)
PRED4x4(dc, 8, mmxext)
PRED4x4(down_left, 8, mmxext)
PRED4x4(down_right, 8, mmxext)
PRED4x4(vertical_left, 8, mmxext)
PRED4x4(vertical_right, 8, mmxext)
PRED4x4(horizontal_up, 8, mmxext)
PRED4x4(horizontal_down, 8, mmxext)
PRED4x4(tm_vp8, 8, mmx)
PRED4x4(tm_vp8, 8, mmxext)
PRED4x4(tm_vp8, 8, ssse3)
PRED4x4(vertical_vp8, 8, mmxext)
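/* Intra-prediction dispatch: the 8-bit and 10-bit tables are filled in
 * separate branches; within each branch, later (wider) instruction-set blocks
 * overwrite the pointers installed by the earlier ones. */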
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx;
if (chroma_format_idc <= 1) {
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
}
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
if (codec_id == AV_CODEC_ID_SVQ3) {
if (cpu_flags & AV_CPU_FLAG_CMOV)
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
}
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
if (chroma_format_idc <= 1)
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext;
h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext;
h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext;
h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext;
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 ||
codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
}
if (codec_id != AV_CODEC_ID_RV40) {
h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
if (chroma_format_idc <= 1) {
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
}
}
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
} else {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
}
}
}
if (EXTERNAL_SSE(cpu_flags)) {
h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
}
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3;
if (chroma_format_idc <= 1)
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3;
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
}
}
}
if(EXTERNAL_AVX2(cpu_flags)){
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
}
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
if (chroma_format_idc <= 1)
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2;
if (chroma_format_idc <= 1) {
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
}
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
}
}
}

View file

@ -0,0 +1,634 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
* Copyright (c) 2011 Daniel Kang
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dec.h"
#include "libavcodec/h264qpel.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#if HAVE_X86ASM
void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
#define ff_put_pixels4_mmxext ff_put_pixels4_mmx
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);
DEF_QPEL(avg)
DEF_QPEL(put)
static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_mmxext(int16_t *tmp, const uint8_t *src, int tmpStride, int srcStride, int size)
{
int w = (size + 8) >> 2;
src -= 2 * srcStride + 2;
while (w--) {
ff_put_h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);
tmp += 4;
src += 4;
}
}
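/* The QPEL_H264 family of macros composes the C-visible lowpass helpers from
 * the assembly primitives declared above; the 16-wide variants are built from
 * two or four calls to the corresponding 8-wide routines. */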
#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
tmp += 4;\
src += 4;\
}\
tmp -= 3*4;\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
src += 4;\
dst += 4;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
tmp += 8;\
dst += 8;\
}while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\

void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
const uint8_t *src,
int tmpStride,
int srcStride,
int size)
{
int w = (size+8)>>3;
src -= 2*srcStride+2;
while(w--){
ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
tmp += 8;
src += 8;
}
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

QPEL_H264(put_, PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
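/* The mcXY suffix encodes the quarter-pel offset of the motion vector:
 * X is the horizontal and Y the vertical fraction in 1/4-pel units, so
 * mc00 is the full-pel copy, mc20 the horizontal half-pel case, mc22 the
 * centre (both half-pel) case, and so on. The H264_MC_* macros above
 * instantiate all 16 combinations per block size and CPU flavour. */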
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
#define LUMA_MC_816(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)
LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)
#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride){\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
src += 8*stride;\
dst += 8*stride;\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}
#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)
#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)
#if ARCH_X86_32 // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif
#endif /* HAVE_X86ASM */
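/* SET_QPEL_FUNCS fills all 16 entries of one pixels_tab row; the table is
 * indexed by x + 4*y, where (x, y) is the quarter-pel offset, matching the
 * mc00..mc33 ordering used above. */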
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
#define H264_QPEL_FUNCS(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
} while (0)
#define H264_QPEL_FUNCS_10(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
} while (0)
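/* Note on the init function below: the EXTERNAL_* blocks are checked in
 * ascending order of capability, so a later block (e.g. SSSE3) overwrites
 * pointers already set by an earlier one (e.g. MMXEXT/SSE2) and the best
 * available implementation wins. */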
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
{
#if HAVE_X86ASM
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (!high_bit_depth) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
} else if (bit_depth == 10) {
#if ARCH_X86_32
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
#endif
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
H264_QPEL_FUNCS(0, 3, sse2);
H264_QPEL_FUNCS(1, 1, sse2);
H264_QPEL_FUNCS(1, 2, sse2);
H264_QPEL_FUNCS(1, 3, sse2);
H264_QPEL_FUNCS(2, 1, sse2);
H264_QPEL_FUNCS(2, 2, sse2);
H264_QPEL_FUNCS(2, 3, sse2);
H264_QPEL_FUNCS(3, 1, sse2);
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
if (bit_depth == 10) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
}
}
if (EXTERNAL_SSE2_FAST(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 0, sse2);
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
H264_QPEL_FUNCS(1, 2, ssse3);
H264_QPEL_FUNCS(1, 3, ssse3);
H264_QPEL_FUNCS(2, 0, ssse3);
H264_QPEL_FUNCS(2, 1, ssse3);
H264_QPEL_FUNCS(2, 2, ssse3);
H264_QPEL_FUNCS(2, 3, ssse3);
H264_QPEL_FUNCS(3, 0, ssse3);
H264_QPEL_FUNCS(3, 1, ssse3);
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
}
}
if (EXTERNAL_AVX(cpu_flags)) {
/* AVX implies 64 byte cache lines without the need to avoid unaligned
* memory accesses that cross the boundary between two cache lines.
* TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
* having to treat SSE2 functions with such properties as AVX. */
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, sse2);
H264_QPEL_FUNCS_10(2, 0, sse2);
H264_QPEL_FUNCS_10(3, 0, sse2);
}
}
#endif
}

@ -0,0 +1,117 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264chroma.h"
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride, int h, int x, int y);
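/* x and y are the 1/8-pel chroma fractional offsets (0..7) and h the block
 * height; the *_pixels_tab arrays below are indexed by block width:
 * [0] = 8-, [1] = 4-, [2] = 2-pixel-wide blocks. */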
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
{
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
}
if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
}
if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
}
if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
// AVX implies !cache64.
// TODO: Port cache(32|64) detection from x264.
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
}
}

@ -0,0 +1,448 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"
/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int16_t *block, \
int stride);
IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 8, sse2)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 8, sse2)
IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
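/* The add16/add8/add16intra variants walk a group of 4x4 (or 8x8) sub-blocks:
 * block_offset gives the destination offset of each sub-block and the nnzc
 * (non-zero coefficient count) table lets sub-blocks with nothing coded be
 * skipped. */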
IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t **dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
/***********************************/
/* deblocking */
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
int8_t ref[2][40],
int16_t mv[2][40][2],
int bidir, int edges, int step,
int mask_mv0, int mask_mv1, int field);
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
ptrdiff_t stride, \
int alpha, \
int beta, \
int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
ptrdiff_t stride, \
int alpha, \
int beta);
#define LF_FUNCS(type, depth) \
LF_FUNC(h, chroma, depth, mmxext) \
LF_IFUNC(h, chroma_intra, depth, mmxext) \
LF_FUNC(h, chroma422, depth, mmxext) \
LF_IFUNC(h, chroma422_intra, depth, mmxext) \
LF_FUNC(v, chroma, depth, mmxext) \
LF_IFUNC(v, chroma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, mmxext) \
LF_IFUNC(h, luma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, sse2) \
LF_IFUNC(h, luma_intra, depth, sse2) \
LF_FUNC(v, luma, depth, sse2) \
LF_IFUNC(v, luma_intra, depth, sse2) \
LF_FUNC(h, chroma, depth, sse2) \
LF_IFUNC(h, chroma_intra, depth, sse2) \
LF_FUNC(h, chroma422, depth, sse2) \
LF_IFUNC(h, chroma422_intra, depth, sse2) \
LF_FUNC(v, chroma, depth, sse2) \
LF_IFUNC(v, chroma_intra, depth, sse2) \
LF_FUNC(h, luma, depth, avx) \
LF_IFUNC(h, luma_intra, depth, avx) \
LF_FUNC(v, luma, depth, avx) \
LF_IFUNC(v, luma_intra, depth, avx) \
LF_FUNC(h, chroma, depth, avx) \
LF_IFUNC(h, chroma_intra, depth, avx) \
LF_FUNC(h, chroma422, depth, avx) \
LF_IFUNC(h, chroma422_intra, depth, avx) \
LF_FUNC(v, chroma, depth, avx) \
LF_IFUNC(v, chroma_intra, depth, avx)
LF_FUNC(h, luma_mbaff, 8, sse2)
LF_FUNC(h, luma_mbaff, 8, avx)
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
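/* The mmxext kernel filters 8 pixels per call, so this wrapper splits the
 * 16-pixel luma edge into two halves; a half is skipped only when both of
 * its tc0 values are negative (the bitwise AND keeps the sign bit set only
 * if both are < 0). */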
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0)
{
if ((tc0[0] & tc0[1]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
if ((tc0[2] & tc0[3]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
int alpha, int beta)
{
ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
/***********************************/
/* weighted prediction */
#define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride, \
int height, int log2_denom, \
int weight, int offset);
#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride, int height, \
int log2_denom, int weightd, \
int weights, int offset);
#define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, mmxext) \
H264_BIWEIGHT(W, mmxext)
#define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, sse2) \
H264_BIWEIGHT(W, sse2) \
H264_BIWEIGHT(W, ssse3)
H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)
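/* weight_h264_pixels_tab[i] / biweight_h264_pixels_tab[i] are indexed by
 * block width: i = 0 -> 16, 1 -> 8, 2 -> 4 pixels wide (see the init
 * function below). */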
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
ptrdiff_t stride, \
int height, \
int log2_denom, \
int weight, \
int offset);
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
uint8_t *src, \
ptrdiff_t stride, \
int height, \
int log2_denom, \
int weightd, \
int weights, \
int offset);
#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10(W, DEPTH, sse2) \
H264_WEIGHT_10(W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, DEPTH, sse4)
H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8, 10)
H264_BIWEIGHT_10_SSE(4, 10)
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_8_mmx;
c->h264_idct8_dc_add =
c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#if ARCH_X86_64
c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
#endif
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_sse2;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
}
c->h264_idct_add = ff_h264_idct_add_8_sse2;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#if ARCH_X86_64
c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
#endif
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
}
c->h264_idct_add = ff_h264_idct_add_8_avx;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
}
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct_add = ff_h264_idct_add_10_sse2;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_10_avx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_avx;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
}
}
#endif
}

@ -0,0 +1,259 @@
/*
* HEVC video decoder
*
* Copyright (C) 2012 - 2013 Guillaume Martres
* Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
*
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_HEVCDSP_H
#define AVCODEC_X86_HEVCDSP_H
#include <stddef.h>
#include <stdint.h>
#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
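/* PEL_LINK wires one SIMD kernel into the five HEVC MC dispatch tables at
 * once: plain (dst), bidirectional (_bi), unidirectional (_uni) and the
 * weighted variants (_uni_w, _bi_w). */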
#define PEL_PROTOTYPE(name, D, opt) \
void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
///////////////////////////////////////////////////////////////////////////////
// MC functions
///////////////////////////////////////////////////////////////////////////////
#define EPEL_PROTOTYPES(fname, bitd, opt) \
PEL_PROTOTYPE(fname##4, bitd, opt); \
PEL_PROTOTYPE(fname##6, bitd, opt); \
PEL_PROTOTYPE(fname##8, bitd, opt); \
PEL_PROTOTYPE(fname##12, bitd, opt); \
PEL_PROTOTYPE(fname##16, bitd, opt); \
PEL_PROTOTYPE(fname##24, bitd, opt); \
PEL_PROTOTYPE(fname##32, bitd, opt); \
PEL_PROTOTYPE(fname##48, bitd, opt); \
PEL_PROTOTYPE(fname##64, bitd, opt)
#define QPEL_PROTOTYPES(fname, bitd, opt) \
PEL_PROTOTYPE(fname##4, bitd, opt); \
PEL_PROTOTYPE(fname##8, bitd, opt); \
PEL_PROTOTYPE(fname##12, bitd, opt); \
PEL_PROTOTYPE(fname##16, bitd, opt); \
PEL_PROTOTYPE(fname##24, bitd, opt); \
PEL_PROTOTYPE(fname##32, bitd, opt); \
PEL_PROTOTYPE(fname##48, bitd, opt); \
PEL_PROTOTYPE(fname##64, bitd, opt)
#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)
#define WEIGHTING_PROTOTYPES(bitd, opt) \
WEIGHTING_PROTOTYPE(2, bitd, opt); \
WEIGHTING_PROTOTYPE(4, bitd, opt); \
WEIGHTING_PROTOTYPE(6, bitd, opt); \
WEIGHTING_PROTOTYPE(8, bitd, opt); \
WEIGHTING_PROTOTYPE(12, bitd, opt); \
WEIGHTING_PROTOTYPE(16, bitd, opt); \
WEIGHTING_PROTOTYPE(24, bitd, opt); \
WEIGHTING_PROTOTYPE(32, bitd, opt); \
WEIGHTING_PROTOTYPE(48, bitd, opt); \
WEIGHTING_PROTOTYPE(64, bitd, opt)
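/* The uni_w/bi_w helpers read the intermediate int16_t samples produced by
 * the MC kernels, apply the weight/offset (scaled by denom) and store the
 * clipped result; one prototype is emitted per supported block width. */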
///////////////////////////////////////////////////////////////////////////////
// QPEL_PIXELS EPEL_PIXELS
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(pel_pixels , 8, sse4);
EPEL_PROTOTYPES(pel_pixels , 10, sse4);
EPEL_PROTOTYPES(pel_pixels , 12, sse4);
void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
///////////////////////////////////////////////////////////////////////////////
// EPEL
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(epel_h , 8, sse4);
EPEL_PROTOTYPES(epel_h , 10, sse4);
EPEL_PROTOTYPES(epel_h , 12, sse4);
EPEL_PROTOTYPES(epel_v , 8, sse4);
EPEL_PROTOTYPES(epel_v , 10, sse4);
EPEL_PROTOTYPES(epel_v , 12, sse4);
EPEL_PROTOTYPES(epel_hv , 8, sse4);
EPEL_PROTOTYPES(epel_hv , 10, sse4);
EPEL_PROTOTYPES(epel_hv , 12, sse4);
PEL_PROTOTYPE(epel_h16, 8, avx2);
PEL_PROTOTYPE(epel_h24, 8, avx2);
PEL_PROTOTYPE(epel_h32, 8, avx2);
PEL_PROTOTYPE(epel_h48, 8, avx2);
PEL_PROTOTYPE(epel_h64, 8, avx2);
PEL_PROTOTYPE(epel_h16,10, avx2);
PEL_PROTOTYPE(epel_h24,10, avx2);
PEL_PROTOTYPE(epel_h32,10, avx2);
PEL_PROTOTYPE(epel_h48,10, avx2);
PEL_PROTOTYPE(epel_h64,10, avx2);
PEL_PROTOTYPE(epel_v16, 8, avx2);
PEL_PROTOTYPE(epel_v24, 8, avx2);
PEL_PROTOTYPE(epel_v32, 8, avx2);
PEL_PROTOTYPE(epel_v48, 8, avx2);
PEL_PROTOTYPE(epel_v64, 8, avx2);
PEL_PROTOTYPE(epel_v16,10, avx2);
PEL_PROTOTYPE(epel_v24,10, avx2);
PEL_PROTOTYPE(epel_v32,10, avx2);
PEL_PROTOTYPE(epel_v48,10, avx2);
PEL_PROTOTYPE(epel_v64,10, avx2);
PEL_PROTOTYPE(epel_hv16, 8, avx2);
PEL_PROTOTYPE(epel_hv24, 8, avx2);
PEL_PROTOTYPE(epel_hv32, 8, avx2);
PEL_PROTOTYPE(epel_hv48, 8, avx2);
PEL_PROTOTYPE(epel_hv64, 8, avx2);
PEL_PROTOTYPE(epel_hv16,10, avx2);
PEL_PROTOTYPE(epel_hv24,10, avx2);
PEL_PROTOTYPE(epel_hv32,10, avx2);
PEL_PROTOTYPE(epel_hv48,10, avx2);
PEL_PROTOTYPE(epel_hv64,10, avx2);
///////////////////////////////////////////////////////////////////////////////
// QPEL
///////////////////////////////////////////////////////////////////////////////
QPEL_PROTOTYPES(qpel_h , 8, sse4);
QPEL_PROTOTYPES(qpel_h , 10, sse4);
QPEL_PROTOTYPES(qpel_h , 12, sse4);
QPEL_PROTOTYPES(qpel_v, 8, sse4);
QPEL_PROTOTYPES(qpel_v, 10, sse4);
QPEL_PROTOTYPES(qpel_v, 12, sse4);
QPEL_PROTOTYPES(qpel_hv, 8, sse4);
QPEL_PROTOTYPES(qpel_hv, 10, sse4);
QPEL_PROTOTYPES(qpel_hv, 12, sse4);
PEL_PROTOTYPE(qpel_h16, 8, avx2);
PEL_PROTOTYPE(qpel_h24, 8, avx2);
PEL_PROTOTYPE(qpel_h32, 8, avx2);
PEL_PROTOTYPE(qpel_h48, 8, avx2);
PEL_PROTOTYPE(qpel_h64, 8, avx2);
PEL_PROTOTYPE(qpel_h16,10, avx2);
PEL_PROTOTYPE(qpel_h24,10, avx2);
PEL_PROTOTYPE(qpel_h32,10, avx2);
PEL_PROTOTYPE(qpel_h48,10, avx2);
PEL_PROTOTYPE(qpel_h64,10, avx2);
PEL_PROTOTYPE(qpel_v16, 8, avx2);
PEL_PROTOTYPE(qpel_v24, 8, avx2);
PEL_PROTOTYPE(qpel_v32, 8, avx2);
PEL_PROTOTYPE(qpel_v48, 8, avx2);
PEL_PROTOTYPE(qpel_v64, 8, avx2);
PEL_PROTOTYPE(qpel_v16,10, avx2);
PEL_PROTOTYPE(qpel_v24,10, avx2);
PEL_PROTOTYPE(qpel_v32,10, avx2);
PEL_PROTOTYPE(qpel_v48,10, avx2);
PEL_PROTOTYPE(qpel_v64,10, avx2);
PEL_PROTOTYPE(qpel_hv16, 8, avx2);
PEL_PROTOTYPE(qpel_hv24, 8, avx2);
PEL_PROTOTYPE(qpel_hv32, 8, avx2);
PEL_PROTOTYPE(qpel_hv48, 8, avx2);
PEL_PROTOTYPE(qpel_hv64, 8, avx2);
PEL_PROTOTYPE(qpel_hv16,10, avx2);
PEL_PROTOTYPE(qpel_hv24,10, avx2);
PEL_PROTOTYPE(qpel_hv32,10, avx2);
PEL_PROTOTYPE(qpel_hv48,10, avx2);
PEL_PROTOTYPE(qpel_hv64,10, avx2);
WEIGHTING_PROTOTYPES(8, sse4);
WEIGHTING_PROTOTYPES(10, sse4);
WEIGHTING_PROTOTYPES(12, sse4);
///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////
void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
#endif // AVCODEC_X86_HEVCDSP_H

File diff suppressed because it is too large

@ -0,0 +1,57 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_HPELDSP_H
#define AVCODEC_X86_HPELDSP_H
#include <stddef.h>
#include <stdint.h>
#include "libavcodec/hpeldsp.h"
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);
#endif /* AVCODEC_X86_HPELDSP_H */

@ -0,0 +1,313 @@
/*
* SIMD-optimized halfpel functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#include "hpeldsp.h"
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx ff_avg_pixels16_mmx
#define avg_pixels8_xy2_mmx ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx ff_avg_pixels16_xy2_mmx
#define put_pixels8_mmx ff_put_pixels8_mmx
#define put_pixels16_mmx ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx ff_put_pixels16_xy2_mmx
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
#if HAVE_INLINE_ASM
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define STATIC static
#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef STATIC
#if HAVE_MMX
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#endif
/***********************************/
/* MMX rounding */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "hpeldsp_rnd_template.c"
#undef DEF
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define STATIC
#include "rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#if HAVE_MMX
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
#endif
#endif /* HAVE_INLINE_ASM */
#if HAVE_X86ASM
#define HPELDSP_AVG_PIXELS16(CPUEXT) \
CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)
#endif /* HAVE_X86ASM */
#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
if (HAVE_MMX_EXTERNAL) \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;
#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
#else
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
} while (0)
#endif
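/* Half-pel dispatch layout: pixels_tab[size][dxy], where size 0 selects
 * 16x16 and size 1 8x8 blocks, and dxy 0..3 selects full-pel, horizontal
 * half-pel (x2), vertical half-pel (y2) and diagonal half-pel (xy2). */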
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
SET_HPEL_FUNCS(put, [0], 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
SET_HPEL_FUNCS(avg, [0], 16, mmx);
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
SET_HPEL_FUNCS(put, [1], 8, mmx);
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
if (HAVE_MMX_EXTERNAL) {
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
}
#if HAVE_MMX_INLINE
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
#endif
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
if (!(flags & AV_CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
}
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
{
#if HAVE_SSE2_EXTERNAL
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
{
#if HAVE_SSSE3_EXTERNAL
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
#endif
}
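/* As in the other DSP init functions, the checks below run from least to
 * most capable instruction set, so later assignments override earlier ones. */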
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
hpeldsp_init_mmx(c, flags);
if (EXTERNAL_AMD3DNOW(cpu_flags))
hpeldsp_init_3dnow(c, flags);
if (EXTERNAL_MMXEXT(cpu_flags))
hpeldsp_init_mmxext(c, flags);
if (EXTERNAL_SSE2_FAST(cpu_flags))
hpeldsp_init_sse2_fast(c, flags);
if (EXTERNAL_SSSE3(cpu_flags))
hpeldsp_init_ssse3(c, flags);
if (CONFIG_VP3_DECODER)
ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
}

@ -0,0 +1,202 @@
/*
* SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
// put_pixels
av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"),%%mm2\n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"),%%mm0\n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, (%2) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, 8(%2) \n\t"
"add %3, %1 \n\t"
"add %3, %2 \n\t"
"subl $1, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:"memory");
}
av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm2, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
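/*
 * Editorial reference (not part of the original file): the MMX loops above
 * average each byte with its right neighbour (x2) or with the byte one line
 * below (y2), with rounding, via the PAVGBP macro. A scalar equivalent of the
 * put/x2 case is roughly the following; the function name is illustrative.
 */
static void put_pixels8_x2_rnd_sketch(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = (pixels[j] + pixels[j + 1] + 1) >> 1; /* rounded average */
        pixels += line_size;
        block  += line_size;
    }
}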

View file

@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp.h"
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
{
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (flags & AV_CODEC_FLAG_BITEXACT) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (flags & AV_CODEC_FLAG_BITEXACT) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
}
}
}

View file

@ -0,0 +1,61 @@
/*
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"
void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_avx2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
intptr_t w, uint8_t *left);
void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
intptr_t w, uint8_t *left);
void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
{
int cpu_flags = av_get_cpu_flags();
const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
c->add_int16 = ff_add_int16_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->add_int16 = ff_add_int16_sse2;
c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->add_int16 = ff_add_int16_avx2;
}
}
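/*
 * Editorial reference (not part of the original file): the add_int16 kernels
 * chosen above are understood to perform a masked element-wise addition,
 * roughly equivalent to this scalar loop; the name is illustrative.
 */
static void add_int16_sketch(uint16_t *dst, const uint16_t *src,
                             unsigned mask, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = (dst[i] + src[i]) & mask; /* mask limits values to the bit depth */
}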

View file

@ -0,0 +1,60 @@
/*
* SIMD-optimized HuffYUV encoding functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvencdsp.h"
void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w);
void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w);
void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w);
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w, int *left, int *left_top);
av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
{
av_unused int cpu_flags = av_get_cpu_flags();
const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->diff_int16 = ff_diff_int16_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->diff_int16 = ff_diff_int16_sse2;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->diff_int16 = ff_diff_int16_avx2;
}
}
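/*
 * Editorial reference (not part of the original file): diff_int16 is the
 * encoder-side counterpart of add_int16, a masked element-wise subtraction,
 * roughly as below; the name is illustrative.
 */
static void diff_int16_sketch(uint16_t *dst, const uint16_t *src1,
                              const uint16_t *src2, unsigned mask, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = (src1[i] - src2[i]) & mask;
}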

View file

@ -0,0 +1,39 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_IDCTDSP_H
#define AVCODEC_X86_IDCTDSP_H
#include <stddef.h>
#include <stdint.h>
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
#endif /* AVCODEC_X86_IDCTDSP_H */

View file

@ -0,0 +1,162 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idctdsp.h"
#include "simple_idct.h"
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
enum idct_permutation_type perm_type)
{
int i;
switch (perm_type) {
case FF_IDCT_PERM_SIMPLE:
for (i = 0; i < 64; i++)
idct_permutation[i] = simple_mmx_permutation[i];
return 1;
case FF_IDCT_PERM_SSE2:
for (i = 0; i < 64; i++)
idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
return 1;
}
return 0;
}
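/*
 * Editorial sketch (not part of the original file): the permutation filled in
 * above remaps coefficient positions to the order the SIMD IDCT expects. It is
 * presumably combined with a zigzag scan table roughly as below; the names
 * src_scantable and permutated are illustrative.
 */
static void permute_scantable_sketch(uint8_t *permutated,
                                     const uint8_t *src_scantable,
                                     const uint8_t *idct_permutation)
{
    for (int i = 0; i < 64; i++)
        permutated[i] = idct_permutation[src_scantable[i]];
}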
av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth &&
avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct_put = ff_simple_idct_put_mmx;
c->idct_add = ff_simple_idct_add_mmx;
c->idct = ff_simple_idct_mmx;
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
c->put_pixels_clamped = ff_put_pixels_clamped_sse2;
c->add_pixels_clamped = ff_add_pixels_clamped_sse2;
if (!high_bit_depth &&
avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
if (ARCH_X86_64 &&
!high_bit_depth &&
avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
avctx->idct_algo == FF_IDCT_SIMPLE)) {
c->idct = ff_simple_idct8_sse2;
c->idct_put = ff_simple_idct8_put_sse2;
c->idct_add = ff_simple_idct8_add_sse2;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
}
if (ARCH_X86_64 && avctx->lowres == 0) {
if (EXTERNAL_AVX(cpu_flags) &&
!high_bit_depth &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
avctx->idct_algo == FF_IDCT_SIMPLE)) {
c->idct = ff_simple_idct8_avx;
c->idct_put = ff_simple_idct8_put_avx;
c->idct_add = ff_simple_idct8_add_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
if (avctx->bits_per_raw_sample == 10 &&
avctx->codec_id != AV_CODEC_ID_MPEG4 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLE)) {
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_simple_idct10_put_sse2;
c->idct_add = NULL;
c->idct = ff_simple_idct10_sse2;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->idct_put = ff_simple_idct10_put_avx;
c->idct_add = NULL;
c->idct = ff_simple_idct10_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
}
if (avctx->bits_per_raw_sample == 12 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_simple_idct12_put_sse2;
c->idct_add = NULL;
c->idct = ff_simple_idct12_sse2;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->idct_put = ff_simple_idct12_put_avx;
c->idct_add = NULL;
c->idct = ff_simple_idct12_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
}
}
}

View file

@ -0,0 +1,100 @@
/*
* inline assembly helper macros
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_INLINE_ASM_H
#define AVCODEC_X86_INLINE_ASM_H
#include "constants.h"
#define MOVQ_WONE(regd) \
__asm__ volatile ( \
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
"psrlw $15, %%" #regd ::)
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
#ifndef PIC
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
#else
// for shared libraries it is better to build this constant in registers than
// to reference it from memory (PIC access); pcmpeqd sets every bit, i.e. -1
#define MOVQ_WTWO(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"psllw $1, %%"#regd" \n\t"::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"por "#regb", "#regr" \n\t" \
"por "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
#endif /* AVCODEC_X86_INLINE_ASM_H */
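/*
 * Editorial reference (not part of the original header): the PAVGB and PAVGBP
 * macros above use the classic SWAR averaging identities, with regfe / mm6
 * holding 0xfefefefefefefefe so the carry bits shifted out of each byte lane
 * are discarded:
 *
 *   no rounding: (a + b) >> 1     == (a & b) + (((a ^ b) & 0xfe) >> 1)
 *   rounding:    (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1)
 *
 * A byte-wise C check of both identities:
 */
static inline int pavgb_identities_hold(void)
{
    for (unsigned a = 0; a < 256; a++)
        for (unsigned b = 0; b < 256; b++) {
            unsigned no_rnd = (a & b) + (((a ^ b) & 0xfe) >> 1);
            unsigned rnd    = (a | b) - (((a ^ b) & 0xfe) >> 1);
            if (no_rnd != ((a + b) >> 1) || rnd != ((a + b + 1) >> 1))
                return 0;
        }
    return 1;
}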

View file

@ -0,0 +1,60 @@
/*
* SIMD optimized JPEG 2000 DSP functions
* Copyright (c) 2015 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/jpeg2000dsp.h"
void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma3(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma4(void *src0, void *src1, void *src2, int csize);
void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
c->mct_decode[FF_DWT97] = ff_ict_float_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
c->mct_decode[FF_DWT97] = ff_ict_float_avx;
}
if (EXTERNAL_FMA4(cpu_flags)) {
c->mct_decode[FF_DWT97] = ff_ict_float_fma4;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
c->mct_decode[FF_DWT97] = ff_ict_float_fma3;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
}
}
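/*
 * Editorial sketch (not part of the original file): mct_decode[FF_DWT53] above
 * is the reversible (integer) colour transform. Assuming src0 carries the
 * luma-like component and src1/src2 the two chroma-like components, the
 * inverse RCT is roughly the following; the channel ordering is an assumption.
 */
static void rct_int_sketch(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
{
    for (int i = 0; i < csize; i++) {
        int32_t y = src0[i], u = src1[i], v = src2[i];
        int32_t g = y - ((u + v) >> 2);  /* green-like component */
        src0[i] = v + g;                 /* red-like             */
        src1[i] = g;
        src2[i] = u + g;                 /* blue-like            */
    }
}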

View file

@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lossless_audiodsp.h"
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
const int16_t *v3,
int order, int mul);
av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags))
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
if (EXTERNAL_SSE2(cpu_flags))
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (EXTERNAL_SSSE3(cpu_flags) &&
!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
#endif /* HAVE_X86ASM */
}
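/*
 * Editorial reference (not part of the original file): the kernels selected
 * above fuse a dot product with an in-place multiply-accumulate, roughly as in
 * this scalar loop; the name is illustrative.
 */
static int32_t scalarproduct_and_madd_int16_sketch(int16_t *v1, const int16_t *v2,
                                                   const int16_t *v3,
                                                   int order, int mul)
{
    int32_t res = 0;
    for (int i = 0; i < order; i++) {
        res   += v1[i] * v2[i]; /* scalar product term         */
        v1[i] += mul * v3[i];   /* in-place madd applied to v1 */
    }
    return res;
}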

View file

@ -0,0 +1,128 @@
/*
* Lossless video DSP utils
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/asm.h"
#include "../lossless_videodsp.h"
#include "libavutil/x86/cpu.h"
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_avx2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top);
void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top);
int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top)
{
x86_reg w2 = -w;
x86_reg x;
int l = *left & 0xff;
int tl = *left_top & 0xff;
int t;
__asm__ volatile (
"mov %7, %3 \n"
"1: \n"
"movzbl (%3, %4), %2 \n"
"mov %2, %k3 \n"
"sub %b1, %b3 \n"
"add %b0, %b3 \n"
"mov %2, %1 \n"
"cmp %0, %2 \n"
"cmovg %0, %2 \n"
"cmovg %1, %0 \n"
"cmp %k3, %0 \n"
"cmovg %k3, %0 \n"
"mov %7, %3 \n"
"cmp %2, %0 \n"
"cmovl %2, %0 \n"
"add (%6, %4), %b0 \n"
"mov %b0, (%5, %4) \n"
"inc %4 \n"
"jl 1b \n"
: "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
: "r"(dst + w), "r"(diff + w), "rm"(top + w)
);
*left = l;
*left_top = tl;
}
#endif
void ff_llviddsp_init_x86(LLVidDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->add_median_pred = add_median_pred_cmov;
#endif
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->add_bytes = ff_add_bytes_mmx;
}
if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
/* slower than cmov version on AMD */
if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
c->add_median_pred = ff_add_median_pred_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->add_bytes = ff_add_bytes_sse2;
c->add_median_pred = ff_add_median_pred_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->add_left_pred = ff_add_left_pred_ssse3;
c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
c->add_gradient_pred = ff_add_gradient_pred_ssse3;
}
if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->add_bytes = ff_add_bytes_avx2;
c->add_left_pred = ff_add_left_pred_unaligned_avx2;
c->add_gradient_pred = ff_add_gradient_pred_avx2;
}
}
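/*
 * Editorial reference (not part of the original file): both the cmov path and
 * the SIMD add_median_pred kernels above implement the HuffYUV-style median
 * predictor. The scalar logic is roughly the following; the name is
 * illustrative.
 */
static void add_median_pred_sketch(uint8_t *dst, const uint8_t *top,
                                   const uint8_t *diff, ptrdiff_t w,
                                   int *left, int *left_top)
{
    int l = *left, tl = *left_top;
    for (ptrdiff_t i = 0; i < w; i++) {
        int t  = top[i];
        int g  = (l + t - tl) & 0xFF;               /* gradient prediction */
        int lo = l < t ? l : t, hi = l < t ? t : l;
        int pred = g < lo ? lo : (g > hi ? hi : g); /* median of l, t, g   */
        l  = dst[i] = (diff[i] + pred) & 0xFF;
        tl = t;
    }
    *left     = l;
    *left_top = tl;
}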

View file

@ -0,0 +1,111 @@
/*
* SIMD-optimized lossless video encoding functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lossless_videoencdsp.h"
#include "libavcodec/mathops.h"
void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
intptr_t w);
void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
intptr_t w);
void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
intptr_t w);
void ff_sub_left_predict_avx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, ptrdiff_t width, int height);
#if HAVE_INLINE_ASM
static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
const uint8_t *src2, intptr_t w,
int *left, int *left_top)
{
x86_reg i = 0;
uint8_t l, lt;
__asm__ volatile (
"movq (%1, %0), %%mm0 \n\t" // LT
"psllq $8, %%mm0 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm1 \n\t" // T
"movq -1(%2, %0), %%mm2 \n\t" // L
"movq (%2, %0), %%mm3 \n\t" // X
"movq %%mm2, %%mm4 \n\t" // L
"psubb %%mm0, %%mm2 \n\t"
"paddb %%mm1, %%mm2 \n\t" // L + T - LT
"movq %%mm4, %%mm5 \n\t" // L
"pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
"pminub %%mm5, %%mm1 \n\t" // min(T, L)
"pminub %%mm2, %%mm4 \n\t"
"pmaxub %%mm1, %%mm4 \n\t"
"psubb %%mm4, %%mm3 \n\t" // dst - pred
"movq %%mm3, (%3, %0) \n\t"
"add $8, %0 \n\t"
"movq -1(%1, %0), %%mm0 \n\t" // LT
"cmp %4, %0 \n\t"
" jb 1b \n\t"
: "+r" (i)
: "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));
l = *left;
lt = *left_top;
dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);
*left_top = src1[w - 1];
*left = src2[w - 1];
}
#endif /* HAVE_INLINE_ASM */
av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->diff_bytes = ff_diff_bytes_mmx;
}
#if HAVE_INLINE_ASM
if (INLINE_MMXEXT(cpu_flags)) {
c->sub_median_pred = sub_median_pred_mmxext;
}
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_SSE2(cpu_flags)) {
c->diff_bytes = ff_diff_bytes_sse2;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->sub_left_predict = ff_sub_left_predict_avx;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->diff_bytes = ff_diff_bytes_avx2;
}
}

View file

@ -0,0 +1,162 @@
/*
* SIMD-optimized LPC functions
* Copyright (c) 2007 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lpc.h"
DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 };
#if HAVE_SSE2_INLINE
static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
double *w_data)
{
double c = 2.0 / (len-1.0);
int n2 = len>>1;
x86_reg i = -n2*sizeof(int32_t);
x86_reg j = n2*sizeof(int32_t);
__asm__ volatile(
"movsd %4, %%xmm7 \n\t"
"movapd "MANGLE(pd_1)", %%xmm6 \n\t"
"movapd "MANGLE(pd_2)", %%xmm5 \n\t"
"movlhps %%xmm7, %%xmm7 \n\t"
"subpd %%xmm5, %%xmm7 \n\t"
"addsd %%xmm6, %%xmm7 \n\t"
"test $1, %5 \n\t"
"jz 2f \n\t"
#define WELCH(MOVPD, offset)\
"1: \n\t"\
"movapd %%xmm7, %%xmm1 \n\t"\
"mulpd %%xmm1, %%xmm1 \n\t"\
"movapd %%xmm6, %%xmm0 \n\t"\
"subpd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
"cvtpi2pd (%3,%0), %%xmm2 \n\t"\
"cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
"mulpd %%xmm0, %%xmm2 \n\t"\
"mulpd %%xmm1, %%xmm3 \n\t"\
"movapd %%xmm2, (%2,%0,2) \n\t"\
MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
"subpd %%xmm5, %%xmm7 \n\t"\
"sub $8, %1 \n\t"\
"add $8, %0 \n\t"\
"jl 1b \n\t"\

WELCH("movupd", -1)
"jmp 3f \n\t"
"2: \n\t"
WELCH("movapd", -2)
"3: \n\t"
:"+&r"(i), "+&r"(j)
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm5", "%xmm6", "%xmm7")
);
#undef WELCH
}
static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
double *autoc)
{
int j;
if((x86_reg)data & 15)
data++;
for(j=0; j<lag; j+=2){
x86_reg i = -len*sizeof(double);
if(j == lag-2) {
__asm__ volatile(
"movsd "MANGLE(pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(pd_1)", %%xmm1 \n\t"
"movsd "MANGLE(pd_1)", %%xmm2 \n\t"
"1: \n\t"
"movapd (%2,%0), %%xmm3 \n\t"
"movupd -8(%3,%0), %%xmm4 \n\t"
"movapd (%3,%0), %%xmm5 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm5 \n\t"
"mulpd -16(%3,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm5, %%xmm0 \n\t"
"addpd %%xmm3, %%xmm2 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"movhlps %%xmm2, %%xmm5 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"addsd %%xmm5, %%xmm2 \n\t"
"movsd %%xmm0, (%1) \n\t"
"movsd %%xmm1, 8(%1) \n\t"
"movsd %%xmm2, 16(%1) \n\t"
:"+&r"(i)
:"r"(autoc+j), "r"(data+len), "r"(data+len-j)
NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
:"memory"
);
} else {
__asm__ volatile(
"movsd "MANGLE(pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(pd_1)", %%xmm1 \n\t"
"1: \n\t"
"movapd (%3,%0), %%xmm3 \n\t"
"movupd -8(%4,%0), %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd (%4,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm3, %%xmm0 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"movsd %%xmm0, %1 \n\t"
"movsd %%xmm1, %2 \n\t"
:"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
:"r"(data+len), "r"(data+len-j)
NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
);
}
}
}
#endif /* HAVE_SSE2_INLINE */
av_cold void ff_lpc_init_x86(LPCContext *c)
{
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_SSE2(cpu_flags) || INLINE_SSE2_SLOW(cpu_flags)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
#endif /* HAVE_SSE2_INLINE */
}
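/*
 * Editorial sketch (not part of the original file): the SSE2 code above first
 * weights the samples with a Welch window, w(n) = 1 - ((n - (len-1)/2) *
 * 2/(len-1))^2, and then accumulates autocorrelation sums two lags at a time.
 * A scalar view of the autocorrelation step is roughly the following; the
 * accumulator starting at 1.0 mirrors the pd_1 constant loaded by the asm.
 */
static void lpc_compute_autocorr_sketch(const double *data, int len, int lag,
                                        double *autoc)
{
    for (int j = 0; j < lag; j++) {
        double sum = 1.0;                 /* small bias, as in the asm */
        for (int i = j; i < len; i++)
            sum += data[i] * data[i - j]; /* product at lag j          */
        autoc[j] = sum;
    }
}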

View file

@ -0,0 +1,133 @@
/*
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_MATHOPS_H
#define AVCODEC_X86_MATHOPS_H
#include "config.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#if HAVE_INLINE_ASM
#if ARCH_X86_32
#define MULL MULL
static av_always_inline av_const int MULL(int a, int b, unsigned shift)
{
int rt, dummy;
__asm__ (
"imull %3 \n\t"
"shrdl %4, %%edx, %%eax \n\t"
:"=a"(rt), "=d"(dummy)
:"a"(a), "rm"(b), "ci"((uint8_t)shift)
);
return rt;
}
#define MULH MULH
static av_always_inline av_const int MULH(int a, int b)
{
int rt, dummy;
__asm__ (
"imull %3"
:"=d"(rt), "=a"(dummy)
:"a"(a), "rm"(b)
);
return rt;
}
#define MUL64 MUL64
static av_always_inline av_const int64_t MUL64(int a, int b)
{
int64_t rt;
__asm__ (
"imull %2"
:"=A"(rt)
:"a"(a), "rm"(b)
);
return rt;
}
#endif /* ARCH_X86_32 */
#if HAVE_I686
/* median of 3 */
#define mid_pred mid_pred
static inline av_const int mid_pred(int a, int b, int c)
{
int i=b;
__asm__ (
"cmp %2, %1 \n\t"
"cmovg %1, %0 \n\t"
"cmovg %2, %1 \n\t"
"cmp %3, %1 \n\t"
"cmovl %3, %1 \n\t"
"cmp %1, %0 \n\t"
"cmovg %1, %0 \n\t"
:"+&r"(i), "+&r"(a)
:"r"(b), "r"(c)
);
return i;
}
#if HAVE_6REGS
#define COPY3_IF_LT(x, y, a, b, c, d)\
__asm__ volatile(\
"cmpl %0, %3 \n\t"\
"cmovl %3, %0 \n\t"\
"cmovl %4, %1 \n\t"\
"cmovl %5, %2 \n\t"\
: "+&r" (x), "+&r" (a), "+r" (c)\
: "r" (y), "r" (b), "r" (d)\
);
#endif /* HAVE_6REGS */
#endif /* HAVE_I686 */
#define MASK_ABS(mask, level) \
__asm__ ("cdq \n\t" \
"xorl %1, %0 \n\t" \
"subl %1, %0 \n\t" \
: "+a"(level), "=&d"(mask))
// x86 shifts mask the count to 5 bits, so -s behaves like 32-s and the
// explicit "32 - s" addition can be avoided (gcc should do that ...)
#define NEG_SSR32 NEG_SSR32
static inline int32_t NEG_SSR32( int32_t a, int8_t s){
__asm__ ("sarl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}
#define NEG_USR32 NEG_USR32
static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
__asm__ ("shrl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_MATHOPS_H */
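/*
 * Editorial note (not part of the original header): the x86-32 inline-asm
 * MULL/MULH/MUL64 above are specializations of the portable forms, which take
 * a full 32x32->64-bit product and keep the requested part, roughly:
 *
 *   MULL(a, b, s)  ==  (int)(((int64_t)(a) * (b)) >> (s))
 *   MULH(a, b)     ==  (int)(((int64_t)(a) * (b)) >> 32)
 *   MUL64(a, b)    ==  (int64_t)(a) * (b)
 */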

View file

@ -0,0 +1,221 @@
;******************************************************************************
;* SIMD optimized non-power-of-two MDCT functions
;*
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
sign_adjust_r: times 4 dd 0x80000000, 0x00000000
sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
SECTION .text
%if ARCH_X86_64
;*****************************************************************************************
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
;*****************************************************************************************
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
VBROADCASTSD m0, [inq + %1] ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
movsd xm1, [inq + 1*16 + 8 + %1] ; in[ 3].re, in[ 3].im, 0, 0
movsd xm4, [inq + 6*16 + 0 + %1] ; in[12].re, in[12].im, 0, 0
movhps xm1, [inq + 3*16 + 0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
movhps xm4, [inq + 4*16 + 8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
subps xm2, xm1, xm4 ; t[2].im, t[2].re, t[3].im, t[3].re
addps xm1, xm4 ; t[0].re, t[0].im, t[1].re, t[1].im
movhlps %2, xm1 ; t[0].re, t[1].re, t[0].im, t[1].im
addps %2, xm1
addps %2, xm0 ; DC[0].re, DC[0].im, junk...
movlhps %2, %2 ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
shufps xm3, xm1, xm2, q0110 ; t[0].re, t[0].im, t[2].re, t[2].im
shufps xm1, xm2, q2332 ; t[1].re, t[1].im, t[3].re, t[3].im
mulps xm%3, xm1, xm5
mulps xm4, xm3, xm6
mulps xm1, xm6
xorps xm1, xm7
mulps xm3, xm5
addsubps xm3, xm1 ; t[0].re, t[0].im, t[2].re, t[2].im
subps xm%3, xm4 ; t[4].re, t[4].im, t[5].re, t[5].im
movhlps xm2, xm%3, xm3 ; t[2].re, t[2].im, t[5].re, t[5].im
movlhps xm3, xm%3 ; t[0].re, t[0].im, t[4].re, t[4].im
xorps xm2, xm7
addps xm%3, xm2, xm3
subps xm3, xm2
shufps xm3, xm3, q1032
vinsertf128 m%3, m%3, xm3, 1 ; All ACs (tmp[1] through to tmp[4])
addps m%3, m%3, m0 ; Finally offset with DCs
%endmacro
%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
mulps xm0, xm9, [exptabq + %1 + 16*0]
mulps xm1, xm10, [exptabq + %1 + 16*1]
haddps xm0, xm1
movhlps xm1, xm0 ; t[0].re, t[1].re, t[0].im, t[1].im
addps xm0, xm1
addps xm0, xm8
movsd [outq], xm0
%endmacro
%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
mulps m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
addps m0, m0, m2
addps m1, m1, m3
addps m0, m0, m11
shufps m1, m1, m1, q2301
addps m0, m0, m1
vextractf128 xm1, m0, 1
movlps [outq + strideq*1], xm0
movhps [outq + strideq*2], xm0
movlps [outq + stride3q], xm1
movhps [outq + strideq*4], xm1
%endmacro
INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
shl strideq, 3
movaps xm5, [exptabq + 480 + 16*0]
movaps xm6, [exptabq + 480 + 16*1]
movaps xm7, [sign_adjust_5]
FFT5 0, xm8, 11
FFT5 8, xm9, 12
FFT5 16, xm10, 13
%define stride3q inq
lea stride3q, [strideq + strideq*2]
lea stride5q, [strideq + strideq*4]
BUTTERFLIES_DC (8*6 + 4*0)*2*4
BUTTERFLIES_AC (8*0 + 0*0)*2*4
add outq, stride5q
BUTTERFLIES_DC (8*6 + 4*1)*2*4
BUTTERFLIES_AC (8*2 + 0*0)*2*4
add outq, stride5q
BUTTERFLIES_DC (8*6 + 4*2)*2*4
BUTTERFLIES_AC (8*4 + 0*0)*2*4
RET
%endif ; ARCH_X86_64
;*******************************************************************************************************
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
;*******************************************************************************************************
%macro LUT_LOAD_4D 3
mov r4d, [lutq + %3q*4 + 0]
movsd xmm%1, [inq + r4q*8]
mov r4d, [lutq + %3q*4 + 4]
movhps xmm%1, [inq + r4q*8]
%if cpuflag(avx2)
mov r4d, [lutq + %3q*4 + 8]
movsd %2, [inq + r4q*8]
mov r4d, [lutq + %3q*4 + 12]
movhps %2, [inq + r4q*8]
vinsertf128 %1, %1, %2, 1
%endif
%endmacro
%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n
xor offset_nq, offset_nq
lea offset_pq, [len8q*2 - %1]
movaps m7, [sign_adjust_r]
%if cpuflag(avx2)
movaps m8, [perm_pos]
movaps m9, [perm_neg]
%endif
.loop:
movups m0, [expq + offset_pq*8] ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
movups m1, [expq + offset_nq*8] ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im
LUT_LOAD_4D m3, xm4, offset_p ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
LUT_LOAD_4D m4, xm5, offset_n ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im
mulps m5, m3, m0 ; in[p].reim * exp[p].reim
mulps m6, m4, m1 ; in[n].reim * exp[n].reim
xorps m5, m7 ; in[p].re *= -1, in[p].im *= 1
xorps m6, m7 ; in[n].re *= -1, in[n].im *= 1
shufps m3, m3, m3, q2301 ; in[p].imre
shufps m4, m4, m4, q2301 ; in[n].imre
mulps m3, m0 ; in[p].imre * exp[p].reim
mulps m4, m1 ; in[n].imre * exp[n].reim
haddps m3, m6 ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
haddps m5, m4 ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im
%if cpuflag(avx2)
vpermps m3, m9, m3 ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
vpermps m5, m8, m5 ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
%else
shufps m3, m3, m3, q0312
shufps m5, m5, m5, q2130
%endif
movups [outq + offset_nq*8], m3
movups [outq + offset_pq*8], m5
sub offset_pq, %1
add offset_nq, %1
cmp offset_nq, offset_pq
jle .loop
REP_RET
%endmacro
INIT_XMM sse3
POSTROTATE_FN 2
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif

View file

@ -0,0 +1,99 @@
/*
* SIMD optimized non-power-of-two MDCT functions
*
* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mdct15.h"
void ff_mdct15_postreindex_sse3(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
void ff_mdct15_postreindex_avx2(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
static void perm_twiddles(MDCT15Context *s)
{
int k;
FFTComplex tmp[30];
/* 5-point FFT twiddles */
s->exptab[60].re = s->exptab[60].im = s->exptab[19].re;
s->exptab[61].re = s->exptab[61].im = s->exptab[19].im;
s->exptab[62].re = s->exptab[62].im = s->exptab[20].re;
s->exptab[63].re = s->exptab[63].im = s->exptab[20].im;
/* 15-point FFT twiddles */
for (k = 0; k < 5; k++) {
tmp[6*k + 0] = s->exptab[k + 0];
tmp[6*k + 2] = s->exptab[k + 5];
tmp[6*k + 4] = s->exptab[k + 10];
tmp[6*k + 1] = s->exptab[2 * (k + 0)];
tmp[6*k + 3] = s->exptab[2 * (k + 5)];
tmp[6*k + 5] = s->exptab[2 * k + 5 ];
}
for (k = 0; k < 6; k++) {
FFTComplex ac_exp[] = {
{ tmp[6*1 + k].re, tmp[6*1 + k].re },
{ tmp[6*2 + k].re, tmp[6*2 + k].re },
{ tmp[6*3 + k].re, tmp[6*3 + k].re },
{ tmp[6*4 + k].re, tmp[6*4 + k].re },
{ tmp[6*1 + k].im, -tmp[6*1 + k].im },
{ tmp[6*2 + k].im, -tmp[6*2 + k].im },
{ tmp[6*3 + k].im, -tmp[6*3 + k].im },
{ tmp[6*4 + k].im, -tmp[6*4 + k].im },
};
memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex));
}
/* Special case when k = 0 */
for (k = 0; k < 3; k++) {
FFTComplex dc_exp[] = {
{ tmp[2*k + 0].re, -tmp[2*k + 0].im },
{ tmp[2*k + 0].im, tmp[2*k + 0].re },
{ tmp[2*k + 1].re, -tmp[2*k + 1].im },
{ tmp[2*k + 1].im, tmp[2*k + 1].re },
};
memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex));
}
}
av_cold void ff_mdct15_init_x86(MDCT15Context *s)
{
int adjust_twiddles = 0;
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE3(cpu_flags))
s->postreindex = ff_mdct15_postreindex_sse3;
if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
s->fft15 = ff_fft15_avx;
adjust_twiddles = 1;
}
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
s->postreindex = ff_mdct15_postreindex_avx2;
if (adjust_twiddles)
perm_twiddles(s);
}

View file

@ -0,0 +1,651 @@
/*
* SIMD-optimized motion estimation
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
uint8_t *src2, ptrdiff_t stride, int h); \
int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
uint8_t *src2, ptrdiff_t stride, int h);
hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
#if HAVE_X86ASM
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int score1, score2;
if (c)
score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
else
score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
- ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
else
return score1 + FFABS(score2) * 8;
}
static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
ff_hf_noise8_mmx(pix2, stride, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
else
return score1 + FFABS(score2) * 8;
}
#endif /* HAVE_X86ASM */
#if HAVE_INLINE_ASM
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
ptrdiff_t stride, int h)
{
int tmp;
av_assert2((((int) pix) & 7) == 0);
av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
"movq 8(%0), %%mm3\n" \
"add %2,%0\n" \
"movq %%mm2, " #out0 "\n" \
"movq %%mm3, " #out1 "\n" \
"psubusb " #in0 ", %%mm2\n" \
"psubusb " #in1 ", %%mm3\n" \
"psubusb " #out0 ", " #in0 "\n" \
"psubusb " #out1 ", " #in1 "\n" \
"por %%mm2, " #in0 "\n" \
"por %%mm3, " #in1 "\n" \
"movq " #in0 ", %%mm2\n" \
"movq " #in1 ", %%mm3\n" \
"punpcklbw %%mm7, " #in0 "\n" \
"punpcklbw %%mm7, " #in1 "\n" \
"punpckhbw %%mm7, %%mm2\n" \
"punpckhbw %%mm7, %%mm3\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw %%mm3, %%mm2\n" \
"paddw %%mm2, " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
__asm__ volatile (
"movl %3, %%ecx\n"
"pxor %%mm6, %%mm6\n"
"pxor %%mm7, %%mm7\n"
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"add %2, %0\n"
"jmp 2f\n"
"1:\n"
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"subl $2, %%ecx\n"
"jnz 1b\n"
"movq %%mm6, %%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6, %%mm0\n"
"movq %%mm0, %%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6, %%mm0\n"
"movd %%mm0, %1\n"
: "+r" (pix), "=r" (tmp)
: "r" (stride), "m" (h)
: "%ecx");
return tmp & 0xFFFF;
}
#undef SUM
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int tmp;
av_assert2((((int) pix1) & 7) == 0);
av_assert2((((int) pix2) & 7) == 0);
av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
"movq (%1), " #out0 "\n" \
"movq 8(%0), %%mm3\n" \
"movq 8(%1), " #out1 "\n" \
"add %3, %0\n" \
"add %3, %1\n" \
"psubb " #out0 ", %%mm2\n" \
"psubb " #out1 ", %%mm3\n" \
"pxor %%mm7, %%mm2\n" \
"pxor %%mm7, %%mm3\n" \
"movq %%mm2, " #out0 "\n" \
"movq %%mm3, " #out1 "\n" \
"psubusb " #in0 ", %%mm2\n" \
"psubusb " #in1 ", %%mm3\n" \
"psubusb " #out0 ", " #in0 "\n" \
"psubusb " #out1 ", " #in1 "\n" \
"por %%mm2, " #in0 "\n" \
"por %%mm3, " #in1 "\n" \
"movq " #in0 ", %%mm2\n" \
"movq " #in1 ", %%mm3\n" \
"punpcklbw %%mm7, " #in0 "\n" \
"punpcklbw %%mm7, " #in1 "\n" \
"punpckhbw %%mm7, %%mm2\n" \
"punpckhbw %%mm7, %%mm3\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw %%mm3, %%mm2\n" \
"paddw %%mm2, " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
__asm__ volatile (
"movl %4, %%ecx\n"
"pxor %%mm6, %%mm6\n"
"pcmpeqw %%mm7, %%mm7\n"
"psllw $15, %%mm7\n"
"packsswb %%mm7, %%mm7\n"
"movq (%0), %%mm0\n"
"movq (%1), %%mm2\n"
"movq 8(%0), %%mm1\n"
"movq 8(%1), %%mm3\n"
"add %3, %0\n"
"add %3, %1\n"
"psubb %%mm2, %%mm0\n"
"psubb %%mm3, %%mm1\n"
"pxor %%mm7, %%mm0\n"
"pxor %%mm7, %%mm1\n"
"jmp 2f\n"
"1:\n"
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"subl $2, %%ecx\n"
"jnz 1b\n"
"movq %%mm6, %%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6, %%mm0\n"
"movq %%mm0, %%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6, %%mm0\n"
"movd %%mm0, %2\n"
: "+r" (pix1), "+r" (pix2), "=r" (tmp)
: "r" (stride), "m" (h)
: "%ecx");
return tmp & 0x7FFF;
}
#undef SUM
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
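/* Plain MMX SAD helpers. Each sad8_* routine accumulates 16-bit partial sums
 * in mm6 (mm7 must be zero on entry); round_tab supplies the rounding term
 * for the averaging variants, and sum_mmx() folds mm6 into the scalar
 * result. */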
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
"add %3, %%"FF_REG_a" \n\t"
"psubusb %%mm0, %%mm2 \n\t"
"psubusb %%mm4, %%mm0 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %3, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
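/* Half-pel SAD: averages blk1a and blk1b (the two neighbouring reference
 * positions) with the rounding constant preloaded into mm5, then takes the
 * SAD of that average against blk2. Used for the x2 and y2 half-pixel
 * positions. */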
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"psrlw $1, %%mm1 \n\t"
"psrlw $1, %%mm3 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm2, %%mm1 \n\t"
"por %%mm4, %%mm1 \n\t"
"movq %%mm1, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %4, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
"r" (stride));
}
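/* Quarter-position (xy half-pel) SAD: averages four neighbouring reference
 * pixels (blk1, blk1+1 and the same pair one row below) with round_tab[2],
 * shifts by 2, and takes the SAD against blk2. The previous row's partial
 * sums are carried in mm0/mm1 so each row is only loaded once. */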
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
"movq (%1, %%"FF_REG_a"), %%mm0\n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%2, %%"FF_REG_a"), %%mm2\n\t"
"movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"movq %5, %%mm5 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm5, %%mm0 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"psubusb %%mm0, %%mm4 \n\t"
"psubusb %%mm5, %%mm0 \n\t"
"por %%mm4, %%mm0 \n\t"
"movq %%mm0, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm3, %%mm1 \n\t"
"add %4, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
"r" (stride), "m" (round_tab[2]));
}
static inline int sum_mmx(void)
{
int ret;
__asm__ volatile (
"movq %%mm6, %%mm0 \n\t"
"psrlq $32, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psrlq $16, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
: "=r" (ret));
return ret & 0xFFFF;
}
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
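/* PIX_SAD expands to the full set of me_cmp entry points (sad8/sad16 at the
 * integer, x2, y2 and xy2 positions) built from the helpers above: zero the
 * mm6/mm7 accumulator registers, optionally load the rounding constant into
 * mm5, run the 8-pixel helper once or twice, then read back the sum. */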
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
:); \
\
sad8_1_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
::); \
\
sad8_4_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
:); \
\
sad8_1_ ## suf(blk1, blk2, stride, h); \
sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_x2a_ ## suf(blk1, blk2, stride, h); \
sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_y2a_ ## suf(blk1, blk2, stride, h); \
sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
::); \
\
sad8_4_ ## suf(blk1, blk2, stride, h); \
sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
PIX_SAD(mmx)
#endif /* HAVE_INLINE_ASM */

av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
c->pix_abs[0][0] = sad16_mmx;
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
c->pix_abs[0][3] = sad16_xy2_mmx;
c->pix_abs[1][0] = sad8_mmx;
c->pix_abs[1][1] = sad8_x2_mmx;
c->pix_abs[1][2] = sad8_y2_mmx;
c->pix_abs[1][3] = sad8_xy2_mmx;
c->sad[0] = sad16_mmx;
c->sad[1] = sad8_mmx;
c->vsad[4] = vsad_intra16_mmx;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->vsad[0] = vsad16_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_MMX(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
c->sse[0] = ff_sse16_mmx;
c->sse[1] = ff_sse8_mmx;
#if HAVE_X86ASM
c->nsse[0] = nsse16_mmx;
c->nsse[1] = nsse8_mmx;
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
c->sad[0] = ff_sad16_mmxext;
c->sad[1] = ff_sad8_mmxext;
c->pix_abs[0][0] = ff_sad16_mmxext;
c->pix_abs[0][1] = ff_sad16_x2_mmxext;
c->pix_abs[0][2] = ff_sad16_y2_mmxext;
c->pix_abs[1][0] = ff_sad8_mmxext;
c->pix_abs[1][1] = ff_sad8_x2_mmxext;
c->pix_abs[1][2] = ff_sad8_y2_mmxext;
c->vsad[4] = ff_vsad_intra16_mmxext;
c->vsad[5] = ff_vsad_intra8_mmxext;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
c->vsad[0] = ff_vsad16_approx_mmxext;
c->vsad[1] = ff_vsad8_approx_mmxext;
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->sse[0] = ff_sse16_sse2;
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0] = ff_sad16_sse2;
c->pix_abs[0][0] = ff_sad16_sse2;
c->pix_abs[0][1] = ff_sad16_x2_sse2;
c->pix_abs[0][2] = ff_sad16_y2_sse2;
c->vsad[4] = ff_vsad_intra16_sse2;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
c->vsad[0] = ff_vsad16_approx_sse2;
}
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
}
}

View file

@ -0,0 +1,204 @@
/*
* MLP DSP functions x86-optimized
* Copyright (c) 2009 Ramiro Polla
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mlpdsp.h"
#include "libavcodec/mlp.h"
#define REMATRIX_CHANNEL_FUNC(opt) \
void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
const int32_t *coeffs, \
const uint8_t *bypassed_lsbs, \
const int8_t *noise_buffer, \
int index, \
unsigned int dest_ch, \
uint16_t blockpos, \
unsigned int maxchan, \
int matrix_noise_shift, \
int access_unit_size_pow2, \
int32_t mask);
REMATRIX_CHANNEL_FUNC(sse4)
REMATRIX_CHANNEL_FUNC(avx2_bmi2)
#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
extern char ff_mlp_firorder_8;
extern char ff_mlp_firorder_7;
extern char ff_mlp_firorder_6;
extern char ff_mlp_firorder_5;
extern char ff_mlp_firorder_4;
extern char ff_mlp_firorder_3;
extern char ff_mlp_firorder_2;
extern char ff_mlp_firorder_1;
extern char ff_mlp_firorder_0;
extern char ff_mlp_iirorder_4;
extern char ff_mlp_iirorder_3;
extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_0;
static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
&ff_mlp_firorder_2, &ff_mlp_firorder_3,
&ff_mlp_firorder_4, &ff_mlp_firorder_5,
&ff_mlp_firorder_6, &ff_mlp_firorder_7,
&ff_mlp_firorder_8 };
static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
&ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
&ff_mlp_iirorder_4 };
#if ARCH_X86_64
#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"movslq "offset"+"offs"(%0), %%rax\n\t" \
"movslq "offset"+"offc"(%1), %%rdx\n\t" \
"imul %%rdx, %%rax\n\t" \
"add %%rax, %%rsi\n\t"
#define FIRMULREG(label, offset, firc)\
LABEL_MANGLE(label)": \n\t" \
"movslq "#offset"(%0), %%rax\n\t" \
"imul %"#firc", %%rax\n\t" \
"add %%rax, %%rsi\n\t"
#define CLEAR_ACCUM \
"xor %%rsi, %%rsi\n\t"
#define SHIFT_ACCUM \
"shr %%cl, %%rsi\n\t"
#define ACCUM "%%rdx"
#define RESULT "%%rsi"
#define RESULT32 "%%esi"
#else /* if ARCH_X86_32 */
#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"mov "offset"+"offs"(%0), %%eax\n\t" \
"imull "offset"+"offc"(%1) \n\t" \
"add %%eax , %%esi\n\t" \
"adc %%edx , %%ecx\n\t"
#define FIRMULREG(label, offset, firc) \
MLPMUL(label, #offset, "0", "0")
#define CLEAR_ACCUM \
"xor %%esi, %%esi\n\t" \
"xor %%ecx, %%ecx\n\t"
#define SHIFT_ACCUM \
"mov %%ecx, %%edx\n\t" \
"mov %%esi, %%eax\n\t" \
"movzbl %7 , %%ecx\n\t" \
"shrd %%cl, %%edx, %%eax\n\t" \
#define ACCUM "%%edx"
#define RESULT "%%eax"
#define RESULT32 "%%eax"
#endif /* !ARCH_X86_64 */
#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
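/* MLP FIR/IIR filtering in inline asm, in outline: firorder/iirorder index
 * firtable[]/iirtable[], and the "jmp *%5"/"jmp *%6" computed jumps enter
 * the unrolled multiply-accumulate chains at the right tap count. Each
 * output is built in a 64-bit accumulator (a register pair on 32-bit x86),
 * shifted by filter_shift, masked, and written back into both the filter
 * state and the sample buffer. */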
static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer)
{
const void *firjump = firtable[firorder];
const void *iirjump = iirtable[iirorder];
blocksize = -blocksize;
__asm__ volatile(
"1: \n\t"
CLEAR_ACCUM
"jmp *%5 \n\t"
FIRMUL (ff_mlp_firorder_8, 0x1c )
FIRMUL (ff_mlp_firorder_7, 0x18 )
FIRMUL (ff_mlp_firorder_6, 0x14 )
FIRMUL (ff_mlp_firorder_5, 0x10 )
FIRMUL (ff_mlp_firorder_4, 0x0c )
FIRMUL (ff_mlp_firorder_3, 0x08 )
FIRMUL (ff_mlp_firorder_2, 0x04 )
FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
"jmp *%6 \n\t"
IIRMUL (ff_mlp_iirorder_4, 0x0c )
IIRMUL (ff_mlp_iirorder_3, 0x08 )
IIRMUL (ff_mlp_iirorder_2, 0x04 )
IIRMUL (ff_mlp_iirorder_1, 0x00 )
LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
SHIFT_ACCUM
"mov "RESULT" ,"ACCUM" \n\t"
"add (%2) ,"RESULT" \n\t"
"and %4 ,"RESULT" \n\t"
"sub $4 , %0 \n\t"
"mov "RESULT32", (%0) \n\t"
"mov "RESULT32", (%2) \n\t"
"add $"BINC" , %2 \n\t"
"sub "ACCUM" ,"RESULT" \n\t"
"mov "RESULT32","IOFFS"(%0) \n\t"
"incl %3 \n\t"
"js 1b \n\t"
: /* 0*/"+r"(state),
/* 1*/"+r"(coeff),
/* 2*/"+r"(sample_buffer),
#if ARCH_X86_64
/* 3*/"+r"(blocksize)
: /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
/* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
, /* 8*/"r"((int64_t)coeff[0])
: "rax", "rdx", "rsi"
#else /* ARCH_X86_32 */
/* 3*/"+m"(blocksize)
: /* 4*/"m"( mask), /* 5*/"m"(firjump),
/* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
: "eax", "edx", "esi", "ecx"
#endif /* !ARCH_X86_64 */
);
}
#endif /* HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS */
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
if (INLINE_MMX(cpu_flags))
c->mlp_filter_channel = mlp_filter_channel_x86;
#endif
if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
}

View file

@ -0,0 +1,289 @@
/*
* SIMD-optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
#if HAVE_X86ASM
#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
float *tmpbuf);
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
#if HAVE_6REGS && HAVE_SSE_INLINE
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
#define SUM8(op, sum, w, p) \
{ \
op(sum, (w)[0 * 64], (p)[0 * 64]); \
op(sum, (w)[1 * 64], (p)[1 * 64]); \
op(sum, (w)[2 * 64], (p)[2 * 64]); \
op(sum, (w)[3 * 64], (p)[3 * 64]); \
op(sum, (w)[4 * 64], (p)[4 * 64]); \
op(sum, (w)[5 * 64], (p)[5 * 64]); \
op(sum, (w)[6 * 64], (p)[6 * 64]); \
op(sum, (w)[7 * 64], (p)[7 * 64]); \
}
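/* apply_window(): SSE windowed dot products. A negative byte index counts up
 * to zero over the buffers (all pointers are pre-offset by len); each loop
 * iteration produces four sum1 and four sum2 outputs from eight window taps,
 * one tap per MULT() step, four floats at a time. */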
static void apply_window(const float *buf, const float *win1,
const float *win2, float *sum1, float *sum2, int len)
{
x86_reg count = - 4*len;
const float *win1a = win1+len;
const float *win2a = win2+len;
const float *bufa = buf+len;
float *sum1a = sum1+len;
float *sum2a = sum2+len;
#define MULT(a, b) \
"movaps " #a "(%1,%0), %%xmm1 \n\t" \
"movaps " #a "(%3,%0), %%xmm2 \n\t" \
"mulps %%xmm2, %%xmm1 \n\t" \
"subps %%xmm1, %%xmm0 \n\t" \
"mulps " #b "(%2,%0), %%xmm2 \n\t" \
"subps %%xmm2, %%xmm4 \n\t" \
__asm__ volatile(
"1: \n\t"
"xorps %%xmm0, %%xmm0 \n\t"
"xorps %%xmm4, %%xmm4 \n\t"
MULT( 0, 0)
MULT( 256, 64)
MULT( 512, 128)
MULT( 768, 192)
MULT(1024, 256)
MULT(1280, 320)
MULT(1536, 384)
MULT(1792, 448)
"movaps %%xmm0, (%4,%0) \n\t"
"movaps %%xmm4, (%5,%0) \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
:"+&r"(count)
:"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
);
#undef MULT
}
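/* apply_window_mp3(): MPEG audio synthesis windowing. The input buffer is
 * first copied past its end to avoid wraparound handling, apply_window()
 * builds the four partial sums (suma/sumb/sumc/sumd), and the SUMS() blocks
 * (or the scalar loop for incr != 1) combine them into the 32 output
 * samples. */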
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
ptrdiff_t incr)
{
LOCAL_ALIGNED_16(float, suma, [17]);
LOCAL_ALIGNED_16(float, sumb, [17]);
LOCAL_ALIGNED_16(float, sumc, [17]);
LOCAL_ALIGNED_16(float, sumd, [17]);
float sum;
/* copy to avoid wrap */
__asm__ volatile(
"movaps 0(%0), %%xmm0 \n\t" \
"movaps 16(%0), %%xmm1 \n\t" \
"movaps 32(%0), %%xmm2 \n\t" \
"movaps 48(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 0(%1) \n\t" \
"movaps %%xmm1, 16(%1) \n\t" \
"movaps %%xmm2, 32(%1) \n\t" \
"movaps %%xmm3, 48(%1) \n\t" \
"movaps 64(%0), %%xmm0 \n\t" \
"movaps 80(%0), %%xmm1 \n\t" \
"movaps 96(%0), %%xmm2 \n\t" \
"movaps 112(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 64(%1) \n\t" \
"movaps %%xmm1, 80(%1) \n\t" \
"movaps %%xmm2, 96(%1) \n\t" \
"movaps %%xmm3, 112(%1) \n\t"
::"r"(in), "r"(in+512)
:"memory"
);
apply_window(in + 16, win , win + 512, suma, sumc, 16);
apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
SUM8(MACS, suma[0], win + 32, in + 48);
sumc[ 0] = 0;
sumb[16] = 0;
sumd[16] = 0;
#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
"movups " #sumd "(%4), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"subps " #suma "(%1), %%xmm0 \n\t" \
"movaps %%xmm0," #out1 "(%0) \n\t" \
\
"movups " #sumc "(%3), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"addps " #sumb "(%2), %%xmm0 \n\t" \
"movaps %%xmm0," #out2 "(%0) \n\t"
if (incr == 1) {
__asm__ volatile(
SUMS( 0, 48, 4, 52, 0, 112)
SUMS(16, 32, 20, 36, 16, 96)
SUMS(32, 16, 36, 20, 32, 80)
SUMS(48, 0, 52, 4, 48, 64)
:"+&r"(out)
:"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
:"memory"
);
out += 16*incr;
} else {
int j;
float *out2 = out + 32 * incr;
out[0 ] = -suma[ 0];
out += incr;
out2 -= incr;
for(j=1;j<16;j++) {
*out = -suma[ j] + sumd[16-j];
*out2 = sumb[16-j] + sumc[ j];
out += incr;
out2 -= incr;
}
}
sum = 0;
SUM8(MLSS, sum, win + 16 + 32, in + 32);
*out = sum;
}
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
#if HAVE_X86ASM
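/* imdct36_blocks_* dispatcher: blocks are processed four at a time with the
 * interleaved four-wide IMDCT (ff_four_imdct36_float_*) using the
 * mdct_win_sse tables initialised below, falling back to the scalar
 * ff_imdct36_float_* routine for the remaining 1-3 blocks. */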
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
int count, int switch_point, int block_type) \
{ \
int align_end = count - (count & 3); \
int j; \
for (j = 0; j < align_end; j+= 4) { \
LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
in += 4*18; \
buf += 4*18; \
out += 4; \
} \
for (; j < count; j++) { \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
int win_idx = (switch_point && j < 2) ? 0 : block_type; \
float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
\
ff_imdct36_float_ ## CPU1(out, buf, in, win); \
\
in += 18; \
buf++; \
out++; \
} \
}
#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
av_unused int cpu_flags = av_get_cpu_flags();
int i, j;
for (j = 0; j < 4; j++) {
for (i = 0; i < 40; i ++) {
mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
}
}
#if HAVE_6REGS && HAVE_SSE_INLINE
if (INLINE_SSE(cpu_flags)) {
s->apply_window_float = apply_window_mp3;
}
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
#if HAVE_X86ASM
#if HAVE_SSE
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
}
if (EXTERNAL_SSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse3;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_ssse3;
}
#endif
#if HAVE_AVX_EXTERNAL
if (EXTERNAL_AVX(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_avx;
}
#endif
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,469 @@
/*
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
* H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mpegvideodata.h"
#if HAVE_MMX_INLINE
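/* H.263 dequantisation, roughly:
 *   level = sign(b) * (|b| * 2*qscale + qadd), with qadd = (qscale - 1) | 1,
 * implemented branch-free with pcmpgtw sign masks; zero coefficients stay
 * zero via the pcmpeqw/pandn pair. The intra DC term is handled separately
 * in C (y_dc_scale / c_dc_scale, or passed through unchanged for AIC). */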
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg level, qmul, qadd, nCoeffs;
qmul = qscale << 1;
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
if (!s->h263_aic) {
if (n < 4)
level = block[0] * s->y_dc_scale;
else
level = block[0] * s->c_dc_scale;
qadd = (qscale - 1) | 1;
}else{
qadd = 0;
level= block[0];
}
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"
"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
block[0]= level;
}
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg qmul, qadd, nCoeffs;
qmul = qscale << 1;
qadd = (qscale - 1) | 1;
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"
"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
}
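/* MPEG-1 intra dequantisation, roughly:
 *   level = sign(b) * ((|b| * qscale * quant_matrix[i]) >> 3),
 * with the result forced odd (the psubw/por steps with mm7 = 1) for
 * mismatch control, zeros preserved via pcmpeqw/pandn, and the DC
 * coefficient rescaled in C. */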
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;
av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
/* XXX: only MPEG-1 */
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
);
block[0]= block0;
}
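/* MPEG-1 inter dequantisation, roughly:
 *   level = sign(b) * (((2*|b| + 1) * qscale * quant_matrix[i]) >> 4),
 * again forced odd for mismatch control. */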
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
);
}
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;
av_assert2(s->block_last_index[n]>=0);
if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
else qscale <<= 1;
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
);
block[0]= block0;
//Note, we do not do mismatch control for intra as errors cannot accumulate
}
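/* MPEG-2 inter dequantisation, roughly:
 *   level = sign(b) * (((2*|b| + 1) * q) >> 5), q = qscale * quant_matrix[i].
 * Mismatch control here works differently from MPEG-1: the parity of every
 * output word is XORed into mm7 and, after the loop, one bit of the last
 * coefficient is toggled so the coefficient sum keeps the required parity. */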
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
av_assert2(s->block_last_index[n]>=0);
if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
else qscale <<= 1;
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlq $48, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psrlw $5, %%mm0 \n\t"
"psrlw $5, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"jng 1b \n\t"
"movd 124(%0, %3), %%mm0 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $16, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"pslld $31, %%mm7 \n\t"
"psrlq $15, %%mm7 \n\t"
"pxor %%mm7, %%mm0 \n\t"
"movd %%mm0, 124(%0, %3) \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
);
}
#endif /* HAVE_MMX_INLINE */
av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
{
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT))
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
}
#endif /* HAVE_MMX_INLINE */
}

View file

@ -0,0 +1,161 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideodsp.h"
#include "libavcodec/videodsp.h"
#if HAVE_INLINE_ASM
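/* gmc_mmx(): global motion compensation. Each destination pixel blends its
 * four surrounding source pixels bilinearly, roughly
 *   dst = (s00*(S-dx)*(S-dy) + s01*dx*(S-dy) + s10*(S-dx)*dy + s11*dx*dy + r)
 *         >> (2*shift), with S = 1 << shift,
 * falling back to ff_gmc_c() for non-constant full-pel offsets, motion
 * vectors needing more than 16 bits of sub-pel precision, or when edge
 * emulation would not fit the local buffer. */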
static void gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
{
const int w = 8;
const int ix = ox >> (16 + shift);
const int iy = oy >> (16 + shift);
const int oxs = ox >> 4;
const int oys = oy >> 4;
const int dxxs = dxx >> 4;
const int dxys = dxy >> 4;
const int dyxs = dyx >> 4;
const int dyys = dyy >> 4;
const uint16_t r4[4] = { r, r, r, r };
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
const uint64_t shift2 = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H 8U
uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
int x, y;
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
const int dxh = dxy * (h - 1);
const int dyw = dyx * (w - 1);
int need_emu = (unsigned) ix >= width - w || width < w ||
               (unsigned) iy >= height - h || height < h;
if ( // non-constant fullpel offset (3% of blocks)
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
// uses more than 16 bits of subpel mv (only at huge resolution)
(dxx | dxy | dyx | dyy) & 15 ||
(need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
// FIXME could still use mmx for some of the rows
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
shift, r, width, height);
return;
}
src += ix + iy * stride;
if (need_emu) {
ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
src = edge_buf;
}
__asm__ volatile (
"movd %0, %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
:: "r" (1 << shift));
for (x = 0; x < w; x += 4) {
uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
oxs - dxys + dxxs * (x + 1),
oxs - dxys + dxxs * (x + 2),
oxs - dxys + dxxs * (x + 3) };
uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
oys - dyys + dyxs * (x + 1),
oys - dyys + dyxs * (x + 2),
oys - dyys + dyxs * (x + 3) };
for (y = 0; y < h; y++) {
__asm__ volatile (
"movq %0, %%mm4 \n\t"
"movq %1, %%mm5 \n\t"
"paddw %2, %%mm4 \n\t"
"paddw %3, %%mm5 \n\t"
"movq %%mm4, %0 \n\t"
"movq %%mm5, %1 \n\t"
"psrlw $12, %%mm4 \n\t"
"psrlw $12, %%mm5 \n\t"
: "+m" (*dx4), "+m" (*dy4)
: "m" (*dxy4), "m" (*dyy4));
__asm__ volatile (
"movq %%mm6, %%mm2 \n\t"
"movq %%mm6, %%mm1 \n\t"
"psubw %%mm4, %%mm2 \n\t"
"psubw %%mm5, %%mm1 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm4, %%mm3 \n\t"
"pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
"pmullw %%mm5, %%mm3 \n\t" // dx * dy
"pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
"pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
"movd %4, %%mm5 \n\t"
"movd %3, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
"pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
"movd %2, %%mm5 \n\t"
"movd %1, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
"pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
"paddw %5, %%mm1 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"psrlw %6, %%mm0 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"movd %%mm0, %0 \n\t"
: "=m" (dst[x + y * stride])
: "m" (src[0]), "m" (src[1]),
"m" (src[stride]), "m" (src[stride + 1]),
"m" (*r4), "m" (shift2));
src += stride;
}
src += 4 - h * stride;
}
}
#endif /* HAVE_INLINE_ASM */
av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
c->gmc = gmc_mmx;
#endif /* HAVE_INLINE_ASM */
}

View file

@ -0,0 +1,244 @@
/*
* The simplest mpeg encoder (well, it was the simplest!)
* Copyright (c) 2000,2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dct.h"
#include "libavcodec/mpegvideo.h"
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
1, 2, 6, 7, 15, 16, 28, 29,
3, 5, 8, 14, 17, 27, 30, 43,
4, 9, 13, 18, 26, 31, 42, 44,
10, 12, 19, 25, 32, 41, 45, 54,
11, 20, 24, 33, 40, 46, 53, 55,
21, 23, 34, 39, 47, 52, 56, 61,
22, 35, 38, 48, 51, 57, 60, 62,
36, 37, 49, 50, 58, 59, 63, 64,
};
#if HAVE_6REGS
#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#define RENAME(a) a ## _mmx
#define RENAME_FDCT(a) a ## _mmx
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
#undef COMPILE_TEMPLATE_SSSE3
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _mmxext
#define RENAME_FDCT(a) a ## _mmxext
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_SSE2_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _sse2
#define RENAME_FDCT(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSE2_INLINE */
#if HAVE_SSSE3_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 1
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _ssse3
#define RENAME_FDCT(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_6REGS */
#if HAVE_INLINE_ASM
#if HAVE_MMX_INLINE
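/* denoise_dct: per-coefficient soft thresholding used by the encoder's noise
 * reduction option. Each |coefficient| has dct_offset[] subtracted
 * (saturating at zero) before quantisation, while the original magnitudes
 * are accumulated into dct_error_sum[] so the offsets can be updated. */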
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];
s->dct_count[intra]++;
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"movq (%0), %%mm2 \n\t"
"movq 8(%0), %%mm3 \n\t"
"pcmpgtw %%mm2, %%mm0 \n\t"
"pcmpgtw %%mm3, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psubusw (%2), %%mm2 \n\t"
"psubusw 8(%2), %%mm3 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm3, 8(%0) \n\t"
"movq %%mm4, %%mm2 \n\t"
"movq %%mm5, %%mm3 \n\t"
"punpcklwd %%mm7, %%mm4 \n\t"
"punpckhwd %%mm7, %%mm2 \n\t"
"punpcklwd %%mm7, %%mm5 \n\t"
"punpckhwd %%mm7, %%mm3 \n\t"
"paddd (%1), %%mm4 \n\t"
"paddd 8(%1), %%mm2 \n\t"
"paddd 16(%1), %%mm5 \n\t"
"paddd 24(%1), %%mm3 \n\t"
"movq %%mm4, (%1) \n\t"
"movq %%mm2, 8(%1) \n\t"
"movq %%mm5, 16(%1) \n\t"
"movq %%mm3, 24(%1) \n\t"
"add $16, %0 \n\t"
"add $32, %1 \n\t"
"add $16, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
);
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_SSE2_INLINE
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];
s->dct_count[intra]++;
__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"1: \n\t"
"pxor %%xmm0, %%xmm0 \n\t"
"pxor %%xmm1, %%xmm1 \n\t"
"movdqa (%0), %%xmm2 \n\t"
"movdqa 16(%0), %%xmm3 \n\t"
"pcmpgtw %%xmm2, %%xmm0 \n\t"
"pcmpgtw %%xmm3, %%xmm1 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, %%xmm4 \n\t"
"movdqa %%xmm3, %%xmm5 \n\t"
"psubusw (%2), %%xmm2 \n\t"
"psubusw 16(%2), %%xmm3 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm3, 16(%0) \n\t"
"movdqa %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm5, %%xmm0 \n\t"
"punpcklwd %%xmm7, %%xmm4 \n\t"
"punpckhwd %%xmm7, %%xmm6 \n\t"
"punpcklwd %%xmm7, %%xmm5 \n\t"
"punpckhwd %%xmm7, %%xmm0 \n\t"
"paddd (%1), %%xmm4 \n\t"
"paddd 16(%1), %%xmm6 \n\t"
"paddd 32(%1), %%xmm5 \n\t"
"paddd 48(%1), %%xmm0 \n\t"
"movdqa %%xmm4, (%1) \n\t"
"movdqa %%xmm6, 16(%1) \n\t"
"movdqa %%xmm5, 32(%1) \n\t"
"movdqa %%xmm0, 48(%1) \n\t"
"add $32, %0 \n\t"
"add $64, %1 \n\t"
"add $32, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
#endif /* HAVE_SSE2_INLINE */
#endif /* HAVE_INLINE_ASM */
av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
{
const int dct_algo = s->avctx->dct_algo;
if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
#if HAVE_6REGS
s->dct_quantize = dct_quantize_mmx;
#endif
s->denoise_dct = denoise_dct_mmx;
}
#endif
#if HAVE_6REGS && HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
s->dct_quantize = dct_quantize_mmxext;
#endif
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(cpu_flags)) {
#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
#endif
s->denoise_dct = denoise_dct_sse2;
}
#endif
#if HAVE_6REGS && HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_ssse3;
#endif
}
}

View file

@ -0,0 +1,109 @@
/*
* QNS functions are compiled 3 times for MMX/3DNOW/SSSE3
* Copyright (c) 2004 Michael Niedermayer
*
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "inline_asm.h"
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
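/* QNS helpers, in outline: try_8x8basis() returns the weighted squared error
 * that would result from adding basis[]*scale to the current residual rem[]
 * (all in the transform domain), and add_8x8basis() applies that update.
 * Both use PMULHRW for the rounded fixed-point scale multiply; the C
 * fallback in add_8x8basis() covers scales too large for the 16-bit path. */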
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
{
x86_reg i=0;
av_assert2(FFABS(scale) < MAX_ABS);
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"movd %4, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"psraw $6, %%mm0 \n\t"
"psraw $6, %%mm1 \n\t"
"pmullw (%3, %0), %%mm0 \n\t"
"pmullw 8(%3, %0), %%mm1 \n\t"
"pmaddwd %%mm0, %%mm0 \n\t"
"pmaddwd %%mm1, %%mm1 \n\t"
"paddd %%mm1, %%mm0 \n\t"
"psrld $4, %%mm0 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" //FIXME optimize & bench
" jb 1b \n\t"
PHADDD(%%mm7, %%mm6)
"psrld $2, %%mm7 \n\t"
"movd %%mm7, %0 \n\t"
: "+r" (i)
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
);
return i;
}
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
{
x86_reg i=0;
if(FFABS(scale) < MAX_ABS){
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
__asm__ volatile(
"movd %3, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"movq %%mm0, (%2, %0) \n\t"
"movq %%mm1, 8(%2, %0) \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" // FIXME optimize & bench
" jb 1b \n\t"
: "+r" (i)
: "r"(basis), "r"(rem), "g"(scale)
);
}else{
for(i=0; i<8*8; i++){
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
}
}
}

View file

@ -0,0 +1,423 @@
/*
* MPEG video MMX templates
*
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/mpegutils.h"
#include "libavcodec/mpegvideo.h"
#include "fdct.h"
#undef MMREG_WIDTH
#undef MM
#undef MOVQ
#undef SPREADW
#undef PMAXW
#undef PMAX
#undef SAVE_SIGN
#undef RESTORE_SIGN
#if COMPILE_TEMPLATE_SSE2
#define MMREG_WIDTH "16"
#define MM "%%xmm"
#define MOVQ "movdqa"
#define SPREADW(a) \
"pshuflw $0, "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"movhlps "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define MMREG_WIDTH "8"
#define MM "%%mm"
#define MOVQ "movq"
#if COMPILE_TEMPLATE_MMXEXT
#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"pshufw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshufw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define SPREADW(a) \
"punpcklwd "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) \
"psubusw "a", "b" \n\t"\
"paddw "a", "b" \n\t"
#define PMAX(a,b) \
"movq "a", "b" \n\t"\
"psrlq $32, "a" \n\t"\
PMAXW(b, a)\
"movq "a", "b" \n\t"\
"psrlq $16, "a" \n\t"\
PMAXW(b, a)
#endif
#endif
#if COMPILE_TEMPLATE_SSSE3
#define SAVE_SIGN(a,b) \
"movdqa "b", "a" \n\t"\
"pabsw "b", "b" \n\t"
#define RESTORE_SIGN(a,b) \
"psignw "a", "b" \n\t"
#else
#define SAVE_SIGN(a,b) \
"pxor "a", "a" \n\t"\
"pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" /* ABS(block[i]) */
#define RESTORE_SIGN(a,b) \
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
#endif
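/* dct_quantize template, in outline: run the forward DCT, then scale each
 * |block[i]| by qmat[i] with the rounding/dead-zone bias (see the per-line
 * comments), shift right by 16 and restore the sign, writing the results
 * into temp_block while tracking the largest value (for the overflow flag)
 * and the last nonzero index via inv_zigzag_direct16. The coefficients are
 * finally copied back into block[] in the permutation order expected by the
 * selected IDCT; the intra DC term gets its own divide by the DC scale. */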
static int RENAME(dct_quantize)(MpegEncContext *s,
int16_t *block, int n,
int qscale, int *overflow)
{
x86_reg last_non_zero_p1;
int level=0, q; //=0 is because gcc says uninitialized ...
const uint16_t *qmat, *bias;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
//s->fdct (block);
RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...
if(s->dct_error_sum)
s->denoise_dct(s, block);
if (s->mb_intra) {
int dummy;
if (n < 4){
q = s->y_dc_scale;
bias = s->q_intra_matrix16[qscale][1];
qmat = s->q_intra_matrix16[qscale][0];
}else{
q = s->c_dc_scale;
bias = s->q_chroma_intra_matrix16[qscale][1];
qmat = s->q_chroma_intra_matrix16[qscale][0];
}
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
__asm__ volatile (
"mul %%ecx \n\t"
: "=d" (level), "=a"(dummy)
: "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
);
} else
/* For AIC we skip quant/dequant of INTRADC */
level = (block[0] + 4)>>3;
block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
} else {
last_non_zero_p1 = 0;
bias = s->q_inter_matrix16[qscale][1];
qmat = s->q_inter_matrix16[qscale][0];
}
if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
__asm__ volatile(
"movd %%"FF_REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
MOVQ" (%2), "MM"5 \n\t" // qmat[0]
"pxor "MM"6, "MM"6 \n\t"
"psubw (%3), "MM"6 \n\t" // -bias[0]
"mov $-128, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"FF_REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
"psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"FF_REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"FF_REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"FF_REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"FF_REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"FF_REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}else{ // FMT_H263
__asm__ volatile(
"movd %%"FF_REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
"mov $-128, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"FF_REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
MOVQ" (%3, %%"FF_REG_a"), "MM"6 \n\t" // bias[0]
"paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
MOVQ" (%2, %%"FF_REG_a"), "MM"5 \n\t" // qmat[i]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"FF_REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"FF_REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"FF_REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"FF_REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"FF_REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
__asm__ volatile(
"movd %1, "MM"1 \n\t" // max_qcoeff
SPREADW(MM"1")
"psubusw "MM"1, "MM"4 \n\t"
"packuswb "MM"4, "MM"4 \n\t"
#if COMPILE_TEMPLATE_SSE2
"packsswb "MM"4, "MM"4 \n\t"
#endif
"movd "MM"4, %0 \n\t" // *overflow
: "=g" (*overflow)
: "g" (s->max_qcoeff)
);
if(s->mb_intra) block[0]= level;
else block[0]= temp_block[0];
if (s->idsp.perm_type == FF_IDCT_PERM_SIMPLE) {
if(last_non_zero_p1 <= 1) goto end;
block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
block[0x20] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
block[0x09] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
block[0x0C] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
if(last_non_zero_p1 <= 1) goto end;
block[0x04] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
block[0x05] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1C] = temp_block[0x19];
block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
} else if (s->idsp.perm_type == FF_IDCT_PERM_NONE) {
if(last_non_zero_p1 <= 1) goto end;
block[0x01] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
block[0x03] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x19] = temp_block[0x19];
block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
} else if (s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE) {
if(last_non_zero_p1 <= 1) goto end;
block[0x08] = temp_block[0x01];
block[0x01] = temp_block[0x08]; block[0x02] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x09] = temp_block[0x09]; block[0x10] = temp_block[0x02];
block[0x18] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x11] = temp_block[0x0A]; block[0x0A] = temp_block[0x11];
block[0x03] = temp_block[0x18]; block[0x04] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x0B] = temp_block[0x19];
block[0x12] = temp_block[0x12]; block[0x19] = temp_block[0x0B];
block[0x20] = temp_block[0x04]; block[0x28] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x21] = temp_block[0x0C]; block[0x1A] = temp_block[0x13];
block[0x13] = temp_block[0x1A]; block[0x0C] = temp_block[0x21];
block[0x05] = temp_block[0x28]; block[0x06] = temp_block[0x30];
block[0x0D] = temp_block[0x29]; block[0x14] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x22] = temp_block[0x14];
block[0x29] = temp_block[0x0D]; block[0x30] = temp_block[0x06];
block[0x38] = temp_block[0x07]; block[0x31] = temp_block[0x0E];
block[0x2A] = temp_block[0x15]; block[0x23] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x1C] = temp_block[0x23]; block[0x15] = temp_block[0x2A];
block[0x0E] = temp_block[0x31]; block[0x07] = temp_block[0x38];
block[0x0F] = temp_block[0x39]; block[0x16] = temp_block[0x32];
block[0x1D] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x2B] = temp_block[0x1D]; block[0x32] = temp_block[0x16];
block[0x39] = temp_block[0x0F]; block[0x3A] = temp_block[0x17];
block[0x33] = temp_block[0x1E]; block[0x2C] = temp_block[0x25];
block[0x25] = temp_block[0x2C]; block[0x1E] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x17] = temp_block[0x3A]; block[0x1F] = temp_block[0x3B];
block[0x26] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
block[0x34] = temp_block[0x26]; block[0x3B] = temp_block[0x1F];
block[0x3C] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x2E] = temp_block[0x35]; block[0x27] = temp_block[0x3C];
block[0x2F] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x3E] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
} else {
av_log(s, AV_LOG_DEBUG, "s->idsp.perm_type: %d\n",
(int)s->idsp.perm_type);
av_assert0(s->idsp.perm_type == FF_IDCT_PERM_NONE ||
s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2 ||
s->idsp.perm_type == FF_IDCT_PERM_SIMPLE ||
s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE);
}
end:
return last_non_zero_p1 - 1;
}


@ -0,0 +1,272 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM
#define PHADDD(a, t) \
"movq " #a ", " #t " \n\t" \
"psrlq $32, " #a " \n\t" \
"paddd " #t ", " #a " \n\t"
/*
* pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
* pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
* pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
*/
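/*
 * For example (illustrative values), with src = 0x4000 (0.5 in Q15) and
 * dst = 3, the definitions above give different results:
 *   pmulhw  : (3 * 0x4000)          >> 16 = 0
 *   pmulhrw : (3 * 0x4000 + 0x8000) >> 16 = 1
 *   pmulhrsw: (3 * 0x4000 + 0x4000) >> 15 = 2
 */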
#define PMULHRW(x, y, s, o) \
"pmulhw " #s ", " #x " \n\t" \
"pmulhw " #s ", " #y " \n\t" \
"paddw " #o ", " #x " \n\t" \
"paddw " #o ", " #y " \n\t" \
"psraw $1, " #x " \n\t" \
"psraw $1, " #y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1
#include "mpegvideoenc_qns_template.c"
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o) \
"pmulhrw " #s ", " #x " \n\t" \
"pmulhrw " #s ", " #y " \n\t"
#include "mpegvideoenc_qns_template.c"
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t) \
"pshufw $0x0E, " #a ", " #t " \n\t" \
/* faster than phaddd on core2 */ \
"paddd " #t ", " #a " \n\t"
#define PMULHRW(x, y, s, o) \
"pmulhrsw " #s ", " #x " \n\t" \
"pmulhrsw " #s ", " #y " \n\t"
#include "mpegvideoenc_qns_template.c"
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */
/* Draw the edges of width 'w' of an image of size width, height.
 * This MMX version only handles w == 4, w == 8 or w == 16
 * (see the av_assert1() in the final branch). */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides)
{
uint8_t *ptr, *last_line;
int i;
last_line = buf + (height - 1) * wrap;
/* left and right */
ptr = buf;
if (w == 8) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
} else if (w == 16) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq %%mm0, -16(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"movq %%mm1, 8(%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
);
} else {
av_assert1(w == 4);
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"movd %%mm0, -4(%0) \n\t"
"movd -4(%0, %2), %%mm1 \n\t"
"punpcklbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movd %%mm1, (%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
}
/* top and bottom (and hopefully also the corners) */
if (sides & EDGE_TOP) {
for (i = 0; i < h; i += 4) {
ptr = buf - (i + 1) * wrap - w;
__asm__ volatile (
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $8, %0 \n\t"
"cmp %4, %0 \n\t"
"jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) buf - (x86_reg) ptr - w),
"r" ((x86_reg) - wrap), "r" ((x86_reg) - wrap * 3),
"r" (ptr + width + 2 * w));
}
}
if (sides & EDGE_BOTTOM) {
for (i = 0; i < h; i += 4) {
ptr = last_line + (i + 1) * wrap - w;
__asm__ volatile (
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $8, %0 \n\t"
"cmp %4, %0 \n\t"
"jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) last_line - (x86_reg) ptr - w),
"r" ((x86_reg) wrap), "r" ((x86_reg) wrap * 3),
"r" (ptr + width + 2 * w));
}
}
}
#endif /* HAVE_INLINE_ASM */
av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmxext;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2;
}
if (EXTERNAL_XOP(cpu_flags)) {
c->pix_sum = ff_pix_sum16_xop;
}
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_mmx;
}
c->add_8x8basis = add_8x8basis_mmx;
if (avctx->bits_per_raw_sample <= 8) {
c->draw_edges = draw_edges_mmx;
}
}
if (INLINE_AMD3DNOW(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_3dnow;
}
c->add_8x8basis = add_8x8basis_3dnow;
}
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_ssse3;
}
c->add_8x8basis = add_8x8basis_ssse3;
}
#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_INLINE_ASM */
}


@ -0,0 +1,35 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/opusdsp.h"
void ff_opus_postfilter_fma3(float *data, int period, float *gains, int len);
float ff_opus_deemphasis_fma3(float *out, float *in, float coeff, int len);
av_cold void ff_opus_dsp_init_x86(OpusDSP *ctx)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
ctx->postfilter = ff_opus_postfilter_fma3;
ctx->deemphasis = ff_opus_deemphasis_fma3;
}
}


@ -0,0 +1,52 @@
/*
* SIMD-optimized pixel operations
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pixblockdsp.h"
void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride);
void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride);
av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_mmx;
c->diff_pixels_unaligned =
c->diff_pixels = ff_diff_pixels_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_sse2;
c->diff_pixels_unaligned =
c->diff_pixels = ff_diff_pixels_sse2;
}
}


@ -0,0 +1,50 @@
/*
* x86 PNG optimizations.
* Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pngdsp.h"
void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
uint8_t *top, int w, int bpp);
void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
uint8_t *top, int w, int bpp);
void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
uint8_t *src2, int w);
void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1,
uint8_t *src2, int w);
av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags))
dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
#endif
if (EXTERNAL_MMXEXT(cpu_flags))
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
if (EXTERNAL_SSE2(cpu_flags))
dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
}


@ -0,0 +1,50 @@
/*
* Apple ProRes compatible decoder
*
* Copyright (c) 2010-2011 Maxim Poliakovski
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/idctdsp.h"
#include "libavcodec/proresdsp.h"
void ff_prores_idct_put_10_sse2(uint16_t *dst, ptrdiff_t linesize,
int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_avx (uint16_t *dst, ptrdiff_t linesize,
int16_t *block, const int16_t *qmat);
av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
if (avctx->bits_per_raw_sample == 10){
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
dsp->idct_put = ff_prores_idct_put_10_sse2;
}
if (EXTERNAL_AVX(cpu_flags)) {
dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
dsp->idct_put = ff_prores_idct_put_10_avx;
}
}
#endif /* ARCH_X86_64 */
}


@ -0,0 +1,544 @@
/*
* quarterpel DSP functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pixels.h"
#include "libavcodec/qpeldsp.h"
#include "fpel.h"
void ff_put_pixels8_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
#if HAVE_X86ASM
#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
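/* Note: QPEL_OP() generates the sixteen motion-compensation functions for one
 * block size. The mcXY suffix encodes the quarter-pel phase, X horizontally
 * and Y vertically: mc00 is a plain copy, mc20/mc02 are the pure half-pel
 * lowpass cases, and the remaining positions combine the h/v lowpass filters
 * with an average (the pixels*_l2 helpers) against the nearest full- or
 * half-pel samples. */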
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
8, stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[9]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel16_mc00_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
stride, stride, 16);\
} \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t *const half = (uint8_t*) temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
}
QPEL_OP(put_, _, mmxext)
QPEL_OP(avg_, _, mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
#endif /* HAVE_X86ASM */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (X86_MMXEXT(cpu_flags)) {
#if HAVE_MMXEXT_EXTERNAL
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
#endif /* HAVE_MMXEXT_EXTERNAL */
}
}


@ -0,0 +1,175 @@
/*
* SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "inline_asm.h"
// put_pixels
av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
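/* Note: each output pixel is the average of a 2x2 input neighbourhood,
 *   dst[x] = (p[x] + p[x+1] + p_below[x] + p_below[x+1] + rnd) >> 2,
 * where rnd is 2 for the rounding and 1 for the no-rounding variant (see
 * SET_RND below). The loop works on 8 pixels at a time in 16-bit lanes and
 * processes two rows per iteration, reusing the horizontal sums of the
 * previous row kept in mm4/mm5 (or mm0/mm1). */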
MOVQ_ZERO(mm7);
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddusw %%mm2, %%mm0 \n\t"
"paddusw %%mm3, %%mm1 \n\t"
"paddusw %%mm6, %%mm4 \n\t"
"paddusw %%mm6, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"movq %%mm4, (%2, %%"FF_REG_a") \n\t"
"add %3, %%"FF_REG_a" \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm2, %%mm4 \n\t"
"paddusw %%mm3, %%mm5 \n\t"
"paddusw %%mm6, %%mm0 \n\t"
"paddusw %%mm6, %%mm1 \n\t"
"paddusw %%mm4, %%mm0 \n\t"
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%2, %%"FF_REG_a") \n\t"
"add %3, %%"FF_REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels)
:"D"(block), "r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
// avg_pixels
// this routine is 'slightly' suboptimal but mostly unused
av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddusw %%mm2, %%mm0 \n\t"
"paddusw %%mm3, %%mm1 \n\t"
"paddusw %%mm6, %%mm4 \n\t"
"paddusw %%mm6, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
"movq %%mm5, (%2, %%"FF_REG_a") \n\t"
"add %3, %%"FF_REG_a" \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm2, %%mm4 \n\t"
"paddusw %%mm3, %%mm5 \n\t"
"paddusw %%mm6, %%mm0 \n\t"
"paddusw %%mm6, %%mm1 \n\t"
"paddusw %%mm4, %%mm0 \n\t"
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
"movq %%mm1, (%2, %%"FF_REG_a") \n\t"
"add %3, %%"FF_REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels)
:"D"(block), "r"((x86_reg)line_size)
:FF_REG_a, "memory");
}


@ -0,0 +1,48 @@
/*
* RV30/40 MMX/SSE2 optimizations
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/rv34dsp.h"
void ff_rv34_idct_dc_mmxext(int16_t *block);
void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse2(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
{
int cpu_flags = av_get_cpu_flags();
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
c->rv34_idct_add = ff_rv34_idct_add_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse2;
if (EXTERNAL_SSE4(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
}


@ -0,0 +1,278 @@
/*
* RV40 decoder motion compensation functions x86-optimised
* Copyright (c) 2008 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* RV40 decoder motion compensation functions x86-optimised
* 2,0 and 0,2 have h264 equivalents.
* 3,3 is bugged in the rv40 format and maps to _xy2 version
*/
#include "libavcodec/rv34dsp.h"
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "hpeldsp.h"
#define DEFINE_FN(op, size, insn) \
static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
}
#if HAVE_X86ASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmxext)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
/** @{ */
/**
* Define one qpel function.
* LOOPSIZE must be already set to the number of pixels processed per
* iteration in the inner loop of the called functions.
* COFF(x) must be already defined so as to provide the offset into any
* array of coeffs used by the called function for the qpel position x.
*/
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
int i; \
if (PH && PV) { \
LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]); \
uint8_t *tmpptr = tmp + SIZE * 2; \
src -= stride * 2; \
\
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
SIZE + 5, HCOFF(PH)); \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
SIZE, SIZE, VCOFF(PV)); \
} else if (PV) { \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
stride, SIZE, VCOFF(PV)); \
} else { \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
stride, SIZE, HCOFF(PH)); \
} \
}
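/* For example, QPEL_FUNCS_DECL(put_, 1, 0, _sse2) below expands to
 * put_rv40_qpel8_mc10_sse2() and put_rv40_qpel16_mc10_sse2(), which run only
 * the horizontal filter path with the coefficient offset HCOFF(1). */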
/** Declare functions for sizes 8 and 16 and given operations
* and qpel position. */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
/** Declare all functions for all sizes and qpel positions */
#define QPEL_MC_DECL(OP, OPT) \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
const uint8_t *src, \
ptrdiff_t srcStride, \
int len, int m); \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
const uint8_t *src, \
ptrdiff_t srcStride, \
int len, int m); \
QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
/** @} */
#define LOOPSIZE 8
#define HCOFF(x) (32 * ((x) - 1))
#define VCOFF(x) (32 * ((x) - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 8
#define HCOFF(x) (64 * ((x) - 1))
#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)
#if ARCH_X86_32
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 4
#define HCOFF(x) (64 * ((x) - 1))
#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _mmx)
#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _mmxext)
#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _3dnow)
#endif
/** @{ */
/** Set one function */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
/** Set all functions for all sizes and qpel positions */
#define QPEL_MC_SET(OP, OPT) \
QPEL_FUNCS_SET (OP, 0, 1, OPT) \
QPEL_FUNCS_SET (OP, 0, 3, OPT) \
QPEL_FUNCS_SET (OP, 1, 0, OPT) \
QPEL_FUNCS_SET (OP, 1, 1, OPT) \
QPEL_FUNCS_SET (OP, 1, 2, OPT) \
QPEL_FUNCS_SET (OP, 1, 3, OPT) \
QPEL_FUNCS_SET (OP, 2, 1, OPT) \
QPEL_FUNCS_SET (OP, 2, 2, OPT) \
QPEL_FUNCS_SET (OP, 2, 3, OPT) \
QPEL_FUNCS_SET (OP, 3, 0, OPT) \
QPEL_FUNCS_SET (OP, 3, 1, OPT) \
QPEL_FUNCS_SET (OP, 3, 2, OPT)
/** @} */
DEFINE_FN(put, 8, ssse3)
DEFINE_FN(put, 16, sse2)
DEFINE_FN(put, 16, ssse3)
DEFINE_FN(avg, 8, mmxext)
DEFINE_FN(avg, 8, ssse3)
DEFINE_FN(avg, 16, sse2)
DEFINE_FN(avg, 16, ssse3)
#endif /* HAVE_X86ASM */
#if HAVE_MMX_INLINE
DEFINE_FN(put, 8, mmx)
DEFINE_FN(avg, 8, mmx)
DEFINE_FN(put, 16, mmx)
DEFINE_FN(avg, 16, mmx)
#endif
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags)) {
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx;
c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx;
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx;
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_X86ASM
if (EXTERNAL_MMX(cpu_flags)) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
#if ARCH_X86_32
QPEL_MC_SET(put_, _mmx)
#endif
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
#if ARCH_X86_32
QPEL_MC_SET(avg_, _3dnow)
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmxext;
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
#if ARCH_X86_32
QPEL_MC_SET(avg_, _mmxext)
#endif
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_sse2;
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_sse2;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
QPEL_MC_SET(put_, _sse2)
QPEL_MC_SET(avg_, _sse2)
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3;
c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3;
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3;
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_ssse3;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
QPEL_MC_SET(put_, _ssse3)
QPEL_MC_SET(avg_, _ssse3)
}
#endif /* HAVE_X86ASM */
}


@ -0,0 +1,51 @@
/*
* Bluetooth low-complexity, subband codec (SBC)
*
* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2008-2010 Nokia Corporation
* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* SBC MMX optimization for some basic "building bricks"
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbcdsp.h"
void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8],
uint32_t scale_factor[2][8],
int blocks, int channels, int subbands);
av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
}
}

View file

@@ -0,0 +1,548 @@
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_mask3 dd 0, 0, 0, 1<<31
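; Editorial note (not in the original source): xorps against a lane holding
; 1<<31 flips only the IEEE-754 sign bit of that float, e.g.
;     xorps m0, [ps_mask]   ; (a, b, c, d) -> (-a, b, -c, d)
;     xorps m0, [ps_mask2]  ; (a, b, c, d) -> (a, -b, c, -d)
; which is why these masks stand in for a multiply by -1.0/1.0.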
ps_noise0 times 2 dd 1.0, 0.0
ps_noise2 times 2 dd -1.0, 0.0
ps_noise13 dd 0.0, 1.0, 0.0, -1.0
dd 0.0, -1.0, 0.0, 1.0
dd 0.0, 1.0, 0.0, -1.0
cextern sbr_noise_table
cextern ps_neg
SECTION .text
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
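; Editorial note (not in the original source): following the x86inc.asm
; convention used throughout this file, the operands of cglobal are
; <name>, <argument count>, <GPRs used>, <XMM registers used>, so this entry
; point takes two arguments (buffer pointer and length) and may clobber up
; to three GPRs and six XMM registers.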
mov r2d, r1d
xorps m0, m0
xorps m1, m1
sar r2, 3
jz .prepare
.loop:
movu m2, [r0 + 0]
movu m3, [r0 + 16]
movu m4, [r0 + 32]
movu m5, [r0 + 48]
mulps m2, m2
mulps m3, m3
mulps m4, m4
mulps m5, m5
addps m0, m2
addps m1, m3
addps m0, m4
addps m1, m5
add r0, 64
dec r2
jnz .loop
.prepare:
and r1, 7
sar r1, 1
jz .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
movu m2, [r0]
add r0, 16
mulps m2, m2
dec r1
addps m0, m2
jnz .endloop
.end:
addps m0, m1
movhlps m2, m0
addps m0, m2
movss m1, m0
shufps m0, m0, 1
addss m0, m1
%if ARCH_X86_64 == 0
movss r0m, m0
fld dword r0m
%endif
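; Editorial note (not in the original source): on 32-bit x86 the ABI returns
; floats in st(0), so the scalar result is spilled to the first argument's
; stack slot and reloaded with fld; on x86-64 the sum is already in xmm0,
; the float return register, so nothing extra is needed.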
RET
%define STEP 40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
mov r5, r3
and r3, 0xFC
lea r2, [r2 + r3*4]
lea r0, [r0 + r3*8]
neg r3
jz .loop1
.loop4:
movlps m0, [r2 + 4*r3 + 0]
movlps m1, [r2 + 4*r3 + 8]
movlps m2, [r1 + 0*STEP]
movlps m3, [r1 + 2*STEP]
movhps m2, [r1 + 1*STEP]
movhps m3, [r1 + 3*STEP]
unpcklps m0, m0
unpcklps m1, m1
mulps m0, m2
mulps m1, m3
movu [r0 + 8*r3 + 0], m0
movu [r0 + 8*r3 + 16], m1
add r1, 4*STEP
add r3, 4
jnz .loop4
and r5, 3 ; number of single element loops
jz .end
.loop1: ; element 0 and 1 can be computed at the same time
movss m0, [r2]
movlps m2, [r1]
unpcklps m0, m0
mulps m2, m0
movlps [r0], m2
add r0, 8
add r2, 4
add r1, STEP
dec r5
jnz .loop1
.end:
RET
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
; const float alpha0[2], const float alpha1[2],
; float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
movss bw, BWm
%endif
movlps m2, [alpha1q]
movlps m1, [alpha0q]
shufps bw, bw, 0
mulps m2, bw ; (a1[0] a1[1])*bw
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
; start and end 6th and 7th args on stack
mov r2d, Sm
mov r3d, Em
DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
movsxd startq, startd
movsxd endq, endd
%endif
sub startq, endq ; neg num of loops
lea X_highq, [X_highq + endq*2*4]
lea X_lowq, [X_lowq + endq*2*4 - 2*2*4]
shl startq, 3 ; offset from num loops
mova m0, [X_lowq + startq]
shufps m3, m3, q1111
shufps m4, m4, q1111
xorps m3, [ps_mask]
shufps m1, m1, q0000
shufps m2, m2, q0000
xorps m4, [ps_mask]
.loop2:
movu m7, [X_lowq + startq + 8] ; BbCc
mova m6, m0
mova m5, m7
shufps m0, m0, q2301 ; aAbB
shufps m7, m7, q2301 ; bBcC
mulps m0, m4
mulps m7, m3
mulps m6, m2
mulps m5, m1
addps m7, m0
mova m0, [X_lowq + startq + 16] ; CcDd
addps m7, m0
addps m6, m5
addps m7, m6
mova [X_highq + startq], m7
add startq, 16
jnz .loop2
RET
cglobal sbr_sum64x5, 1,2,4,z
lea r1q, [zq+ 256]
.loop:
mova m0, [zq+ 0]
mova m2, [zq+ 16]
mova m1, [zq+ 256]
mova m3, [zq+ 272]
addps m0, [zq+ 512]
addps m2, [zq+ 528]
addps m1, [zq+ 768]
addps m3, [zq+ 784]
addps m0, [zq+1024]
addps m2, [zq+1040]
addps m0, m1
addps m2, m3
mova [zq], m0
mova [zq+16], m2
add zq, 32
cmp zq, r1q
jne .loop
REP_RET
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
lea r2q, [zq + (64-4)*4]
mova m3, [ps_neg]
.loop:
mova m1, [zq]
xorps m0, m3, [r2q]
shufps m0, m0, m0, q0123
unpcklps m2, m0, m1
unpckhps m0, m0, m1
mova [Wq + 0], m2
mova [Wq + 16], m0
add Wq, 32
sub r2q, 16
add zq, 16
cmp zq, r2q
jl .loop
REP_RET
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
lea r1q, [zq+256]
.loop:
mova m0, [zq+ 0]
mova m1, [zq+16]
mova m2, [zq+32]
mova m3, [zq+48]
xorps m0, [ps_mask2]
xorps m1, [ps_mask2]
xorps m2, [ps_mask2]
xorps m3, [ps_mask2]
mova [zq+ 0], m0
mova [zq+16], m1
mova [zq+32], m2
mova [zq+48], m3
add zq, 64
cmp zq, r1q
jne .loop
REP_RET
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
mova m4, [src0q+cq+mmsize]
mova m5, [src1q+mmsize]
%if cpuflag(sse2)
pshufd m2, m0, q0123
pshufd m3, m1, q0123
pshufd m6, m4, q0123
pshufd m7, m5, q0123
%else
shufps m2, m0, m0, q0123
shufps m3, m1, m1, q0123
shufps m6, m4, m4, q0123
shufps m7, m5, m5, q0123
%endif
addps m5, m2
subps m0, m7
addps m1, m6
subps m4, m3
mova [vrevq], m1
mova [vrevq+mmsize], m5
mova [vq+cq], m0
mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
mov r3q, OFFSET
lea r1q, [zq + (32+1)*4]
lea r2q, [zq + 64*4]
mova m5, [ps_neg]
.loop:
movu m0, [r1q]
movu m2, [r1q + mmsize]
movu m1, [zq + r3q + 4 + mmsize]
movu m3, [zq + r3q + 4]
pxor m2, m5
pxor m0, m5
pshufd m2, m2, q0123
pshufd m0, m0, q0123
SBUTTERFLY dq, 2, 3, 4
SBUTTERFLY dq, 0, 1, 4
mova [r2q + 2*r3q + 0*mmsize], m2
mova [r2q + 2*r3q + 1*mmsize], m3
mova [r2q + 2*r3q + 2*mmsize], m0
mova [r2q + 2*r3q + 3*mmsize], m1
add r1q, 2*mmsize
sub r3q, 2*mmsize
jge .loop
movq m2, [zq]
movq [r2q], m2
REP_RET
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
%macro LOAD_NST 1
%ifdef PIC
lea NOISE_TABLE, [%1]
mova m0, [kxq + NOISE_TABLE]
%else
mova m0, [kxq + %1]
%endif
%endmacro
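; Editorial note (not in the original source): in position-independent builds
; the absolute address of sbr_noise_table cannot be encoded as a displacement,
; so one extra GPR (NOISE_TABLE, hence NREGS=1) is loaded with lea and the
; table is indexed relative to it; non-PIC builds address the symbol directly
; and keep NREGS at 0.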
INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise0]
jmp apply_noise_main
; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13
jmp apply_noise_main
; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise2]
jmp apply_noise_main
; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13+16
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
mov kxd, m_maxm
DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
movsxdifnidn noiseq, noised
dec noiseq
shl countd, 2
%ifdef PIC
lea NOISE_TABLE, [sbr_noise_table]
%endif
lea Yq, [Yq + 2*countq]
add s_mq, countq
add q_filtq, countq
shl noiseq, 3
pxor m5, m5
neg countq
.loop:
mova m1, [q_filtq + countq]
movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
add noiseq, 2*mmsize
and noiseq, 0x1ff<<3
punpckhdq m2, m1, m1
punpckldq m1, m1
mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
mova m3, [s_mq + countq]
; TODO: replace by a vpermd in AVX2
punpckhdq m4, m3, m3
punpckldq m3, m3
pcmpeqd m6, m3, m5 ; m6 == 0
pcmpeqd m7, m4, m5 ; m7 == 0
mulps m3, m0 ; s_m[m] * phi_sign
mulps m4, m0 ; s_m[m] * phi_sign
pand m1, m6
pand m2, m7
movu m6, [Yq + 2*countq]
movu m7, [Yq + 2*countq + mmsize]
addps m3, m1
addps m4, m2
addps m6, m3
addps m7, m4
movu [Yq + 2*countq], m6
movu [Yq + 2*countq + mmsize], m7
add countq, mmsize
jl .loop
RET
INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT 32*4
%define OFFSET 32*4
mov cq, -COUNT
lea vrevq, [vq + OFFSET + COUNT]
add vq, OFFSET-mmsize
add srcq, 2*COUNT
mova m3, [ps_neg]
.loop:
mova m0, [srcq + 2*cq + 0*mmsize]
mova m1, [srcq + 2*cq + 1*mmsize]
shufps m2, m0, m1, q2020
shufps m1, m0, q1313
xorps m2, m3
mova [vq], m1
mova [vrevq + cq], m2
sub vq, mmsize
add cq, mmsize
jl .loop
REP_RET
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
mov cntq, 37*8
add xq, cntq
neg cntq
%if cpuflag(sse3)
%define MOVH movsd
movddup m5, [xq+cntq]
%else
%define MOVH movlps
movlps m5, [xq+cntq]
movlhps m5, m5
%endif
MOVH m7, [xq+cntq+8 ]
MOVH m1, [xq+cntq+16]
shufps m7, m7, q0110
shufps m1, m1, q0110
mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
movaps [rsp ], m3
movaps [rsp+16], m4
add cntq, 8
MOVH m2, [xq+cntq+16]
movlhps m7, m7
shufps m2, m2, q0110
mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
mulps m4, m7, m2
mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
align 16
.loop:
add cntq, 8
MOVH m0, [xq+cntq+16]
movlhps m1, m1
shufps m0, m0, q0110
mulps m3, m1, m2
mulps m4, m1, m0
mulps m1, m1
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
add cntq, 8
MOVH m1, [xq+cntq+16]
movlhps m2, m2
shufps m1, m1, q0110
mulps m3, m2, m0
mulps m4, m2, m1
mulps m2, m2
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
add cntq, 8
MOVH m2, [xq+cntq+16]
movlhps m0, m0
shufps m2, m2, q0110
mulps m3, m0, m1
mulps m4, m0, m2
mulps m0, m0
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
jl .loop
movlhps m1, m1
mulps m2, m1
mulps m1, m1
addps m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
xorps m2, [ps_mask3]
xorps m5, [ps_mask3]
xorps m6, [ps_mask3]
HADDPS m2, m5, m3
HADDPS m7, m6, m4
%if cpuflag(sse3)
movshdup m0, m1
%else
movss m0, m1
shufps m1, m1, q0001
%endif
addss m1, m0
movaps [phiq ], m2
movhps [phiq+0x18], m7
movss [phiq+0x28], m7
movss [phiq+0x10], m1
RET
%endmacro
INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE

View file

@@ -0,0 +1,87 @@
/*
* AAC Spectral Band Replication decoding functions
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbrdsp.h"
float ff_sbr_sum_square_sse(float (*x)[2], int n);
void ff_sbr_sum64x5_sse(float *z);
void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
const float alpha0[2], const float alpha1[2],
float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_pre_shuffle_sse2(float *z);
void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);
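/*
 * Editorial note (not in the original source): the blocks below are ordered
 * from weakest to strongest instruction set, so a later EXTERNAL_SSE2() or
 * EXTERNAL_SSE3() match simply overwrites pointers installed by the earlier
 * EXTERNAL_SSE() block; on an SSE3-capable CPU, for example, autocorrelate
 * ends up pointing at ff_sbr_autocorrelate_sse3.
 */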
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
s->neg_odd_64 = ff_sbr_neg_odd_64_sse;
s->sum_square = ff_sbr_sum_square_sse;
s->sum64x5 = ff_sbr_sum64x5_sse;
s->hf_g_filt = ff_sbr_hf_g_filt_sse;
s->hf_gen = ff_sbr_hf_gen_sse;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
s->qmf_deint_neg = ff_sbr_qmf_deint_neg_sse;
s->autocorrelate = ff_sbr_autocorrelate_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
}
if (EXTERNAL_SSE3(cpu_flags)) {
s->autocorrelate = ff_sbr_autocorrelate_sse3;
}
}

View file

@@ -0,0 +1,53 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_SIMPLE_IDCT_H
#define AVCODEC_X86_SIMPLE_IDCT_H
#include <stddef.h>
#include <stdint.h>
void ff_simple_idct_mmx(int16_t *block);
void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_sse2(int16_t *block);
void ff_simple_idct8_avx(int16_t *block);
void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct10_sse2(int16_t *block);
void ff_simple_idct10_avx(int16_t *block);
void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct12_sse2(int16_t *block);
void ff_simple_idct12_avx(int16_t *block);
void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
#endif /* AVCODEC_X86_SIMPLE_IDCT_H */

View file

@@ -0,0 +1,908 @@
/*
* MMX and SSE2 optimized snow DSP utils
* Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/snow.h"
#include "libavcodec/snow_dwt.h"
#if HAVE_INLINE_ASM
static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
// (the first time erroneously), we allow the SSE2 code to run an extra pass.
// The savings in code and time are well worth having to store this value and
// calculate b[0] correctly afterwards.
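        /*
         * Editorial note (not in the original source): the saved b_0 is
         * consumed right after the vectorized loop below, where
         *     b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
         * recomputes the first coefficient from its pre-loop value.
         */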
i = 0;
__asm__ volatile(
"pcmpeqd %%xmm7, %%xmm7 \n\t"
"pcmpeqd %%xmm3, %%xmm3 \n\t"
"psllw $1, %%xmm3 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"psllw $13, %%xmm3 \n\t"
::);
for(; i<w_l-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm5 \n\t"
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"paddw %%xmm5, %%xmm6 \n\t"
"paddw %%xmm7, %%xmm2 \n\t"
"paddw %%xmm7, %%xmm6 \n\t"
"pmulhw %%xmm3, %%xmm2 \n\t"
"pmulhw %%xmm3, %%xmm6 \n\t"
"paddw (%0), %%xmm2 \n\t"
"paddw 16(%0), %%xmm6 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm6, 16(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
}
{ // Lift 1
IDWTELEM * const dst = b+w2;
i = 0;
for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
dst[i] = dst[i] - (b[i] + b[i + 1]);
}
for(; i<w_r-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm5 \n\t"
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"paddw %%xmm5, %%xmm6 \n\t"
"movdqa (%0), %%xmm0 \n\t"
"movdqa 16(%0), %%xmm4 \n\t"
"psubw %%xmm2, %%xmm0 \n\t"
"psubw %%xmm6, %%xmm4 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm4, 16(%0) \n\t"
:: "r"(&dst[i]), "r"(&b[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
{ // Lift 2
IDWTELEM * const ref = b+w2 - 1;
IDWTELEM b_0 = b[0];
i = 0;
__asm__ volatile(
"psllw $15, %%xmm7 \n\t"
"pcmpeqw %%xmm6, %%xmm6 \n\t"
"psrlw $13, %%xmm6 \n\t"
"paddw %%xmm7, %%xmm6 \n\t"
::);
for(; i<w_l-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm0 \n\t"
"movdqu 16(%1), %%xmm4 \n\t"
"movdqu 2(%1), %%xmm1 \n\t"
"movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
"paddw %%xmm6, %%xmm0 \n\t"
"paddw %%xmm6, %%xmm4 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm5 \n\t"
"pavgw %%xmm1, %%xmm0 \n\t"
"pavgw %%xmm5, %%xmm4 \n\t"
"psubw %%xmm7, %%xmm0 \n\t"
"psubw %%xmm7, %%xmm4 \n\t"
"psraw $1, %%xmm0 \n\t"
"psraw $1, %%xmm4 \n\t"
"movdqa (%0), %%xmm1 \n\t"
"movdqa 16(%0), %%xmm5 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"psraw $2, %%xmm0 \n\t"
"psraw $2, %%xmm4 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm4, 16(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
}
{ // Lift 3
IDWTELEM * const src = b+w2;
i = 0;
for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
}
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw (%1), %%xmm2 \n\t"
"paddw 16(%1), %%xmm6 \n\t"
"movdqu (%0), %%xmm0 \n\t"
"movdqu 16(%0), %%xmm4 \n\t"
"paddw %%xmm2, %%xmm0 \n\t"
"paddw %%xmm6, %%xmm4 \n\t"
"psraw $1, %%xmm2 \n\t"
"psraw $1, %%xmm6 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"paddw %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm2, (%2) \n\t"
"movdqa %%xmm6, 16(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
}
{
snow_interleave_line_header(&i, width, b, temp);
for (; (i & 0x3E) != 0x3E; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=62; i>=0; i-=64){
__asm__ volatile(
"movdqa (%1), %%xmm0 \n\t"
"movdqa 16(%1), %%xmm2 \n\t"
"movdqa 32(%1), %%xmm4 \n\t"
"movdqa 48(%1), %%xmm6 \n\t"
"movdqa (%1), %%xmm1 \n\t"
"movdqa 16(%1), %%xmm3 \n\t"
"movdqa 32(%1), %%xmm5 \n\t"
"movdqa 48(%1), %%xmm7 \n\t"
"punpcklwd (%2), %%xmm0 \n\t"
"punpcklwd 16(%2), %%xmm2 \n\t"
"punpcklwd 32(%2), %%xmm4 \n\t"
"punpcklwd 48(%2), %%xmm6 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm2, 32(%0) \n\t"
"movdqa %%xmm4, 64(%0) \n\t"
"movdqa %%xmm6, 96(%0) \n\t"
"punpckhwd (%2), %%xmm1 \n\t"
"punpckhwd 16(%2), %%xmm3 \n\t"
"punpckhwd 32(%2), %%xmm5 \n\t"
"punpckhwd 48(%2), %%xmm7 \n\t"
"movdqa %%xmm1, 16(%0) \n\t"
"movdqa %%xmm3, 48(%0) \n\t"
"movdqa %%xmm5, 80(%0) \n\t"
"movdqa %%xmm7, 112(%0) \n\t"
:: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
: "memory"
);
}
}
}
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
i = 1;
b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"pcmpeqw %%mm3, %%mm3 \n\t"
"psllw $1, %%mm3 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"psllw $13, %%mm3 \n\t"
::);
for(; i<w_l-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm2 \n\t"
"movq 8(%1), %%mm6 \n\t"
"paddw 2(%1), %%mm2 \n\t"
"paddw 10(%1), %%mm6 \n\t"
"paddw %%mm7, %%mm2 \n\t"
"paddw %%mm7, %%mm6 \n\t"
"pmulhw %%mm3, %%mm2 \n\t"
"pmulhw %%mm3, %%mm6 \n\t"
"paddw (%0), %%mm2 \n\t"
"paddw 8(%0), %%mm6 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm6, 8(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
}
{ // Lift 1
IDWTELEM * const dst = b+w2;
i = 0;
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm2 \n\t"
"movq 8(%1), %%mm6 \n\t"
"paddw 2(%1), %%mm2 \n\t"
"paddw 10(%1), %%mm6 \n\t"
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm6, %%mm4 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm4, 8(%0) \n\t"
:: "r"(&dst[i]), "r"(&b[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
{ // Lift 2
IDWTELEM * const ref = b+w2 - 1;
i = 1;
b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
__asm__ volatile(
"psllw $15, %%mm7 \n\t"
"pcmpeqw %%mm6, %%mm6 \n\t"
"psrlw $13, %%mm6 \n\t"
"paddw %%mm7, %%mm6 \n\t"
::);
for(; i<w_l-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm4 \n\t"
"movq 2(%1), %%mm1 \n\t"
"movq 10(%1), %%mm5 \n\t"
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm6, %%mm4 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm5 \n\t"
"pavgw %%mm1, %%mm0 \n\t"
"pavgw %%mm5, %%mm4 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm4 \n\t"
"psraw $1, %%mm0 \n\t"
"psraw $1, %%mm4 \n\t"
"movq (%0), %%mm1 \n\t"
"movq 8(%0), %%mm5 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"psraw $2, %%mm0 \n\t"
"psraw $2, %%mm4 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm4, 8(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
}
{ // Lift 3
IDWTELEM * const src = b+w2;
i = 0;
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movq 2(%1), %%mm2 \n\t"
"movq 10(%1), %%mm6 \n\t"
"paddw (%1), %%mm2 \n\t"
"paddw 8(%1), %%mm6 \n\t"
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm6, %%mm4 \n\t"
"psraw $1, %%mm2 \n\t"
"psraw $1, %%mm6 \n\t"
"paddw %%mm0, %%mm2 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, (%2) \n\t"
"movq %%mm6, 8(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
}
{
snow_interleave_line_header(&i, width, b, temp);
for (; (i & 0x1E) != 0x1E; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=30; i>=0; i-=32){
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm6 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm5 \n\t"
"movq 24(%1), %%mm7 \n\t"
"punpcklwd (%2), %%mm0 \n\t"
"punpcklwd 8(%2), %%mm2 \n\t"
"punpcklwd 16(%2), %%mm4 \n\t"
"punpcklwd 24(%2), %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, 16(%0) \n\t"
"movq %%mm4, 32(%0) \n\t"
"movq %%mm6, 48(%0) \n\t"
"punpckhwd (%2), %%mm1 \n\t"
"punpckhwd 8(%2), %%mm3 \n\t"
"punpckhwd 16(%2), %%mm5 \n\t"
"punpckhwd 24(%2), %%mm7 \n\t"
"movq %%mm1, 8(%0) \n\t"
"movq %%mm3, 24(%0) \n\t"
"movq %%mm5, 40(%0) \n\t"
"movq %%mm7, 56(%0) \n\t"
:: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
: "memory"
);
}
}
}
#if HAVE_7REGS
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
"psubw %%"s0", %%"t0" \n\t"\
"psubw %%"s1", %%"t1" \n\t"\
"psubw %%"s2", %%"t2" \n\t"\
"psubw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
"movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\
"movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
"movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
"movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"
#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
"psraw $"n", %%"t0" \n\t"\
"psraw $"n", %%"t1" \n\t"\
"psraw $"n", %%"t2" \n\t"\
"psraw $"n", %%"t3" \n\t"
#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
"paddw %%"s0", %%"t0" \n\t"\
"paddw %%"s1", %%"t1" \n\t"\
"paddw %%"s2", %%"t2" \n\t"\
"paddw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
"pmulhw %%"s0", %%"t0" \n\t"\
"pmulhw %%"s1", %%"t1" \n\t"\
"pmulhw %%"s2", %%"t2" \n\t"\
"pmulhw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
"movdqa %%"s0", %%"t0" \n\t"\
"movdqa %%"s1", %%"t1" \n\t"\
"movdqa %%"s2", %%"t2" \n\t"\
"movdqa %%"s3", %%"t3" \n\t"
static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
x86_reg i = width;
while(i & 0x1F)
{
i--;
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
i+=i;
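    /*
     * Editorial note (not in the original source): i now holds the number of
     * elements left for the SIMD loop (a multiple of 32); doubling it turns
     * the element count into a byte offset, because IDWTELEM is 16 bits wide
     * and the inline asm below indexes b0..b5 through FF_REG_d in bytes,
     * stepping 64 bytes (32 elements) per pass.
     */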
__asm__ volatile (
"jmp 2f \n\t"
"1: \n\t"
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
"pcmpeqw %%xmm0, %%xmm0 \n\t"
"pcmpeqw %%xmm2, %%xmm2 \n\t"
"paddw %%xmm2, %%xmm2 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"psllw $13, %%xmm2 \n\t"
snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
"pcmpeqw %%xmm7, %%xmm7 \n\t"
"pcmpeqw %%xmm5, %%xmm5 \n\t"
"psllw $15, %%xmm7 \n\t"
"psrlw $13, %%xmm5 \n\t"
"paddw %%xmm7, %%xmm5 \n\t"
snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
"movq (%2,%%"FF_REG_d"), %%xmm1 \n\t"
"movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"pavgw %%xmm1, %%xmm0 \n\t"
"pavgw %%xmm3, %%xmm2 \n\t"
"movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
"movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"pavgw %%xmm1, %%xmm4 \n\t"
"pavgw %%xmm3, %%xmm6 \n\t"
snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
"2: \n\t"
"sub $64, %%"FF_REG_d" \n\t"
"jge 1b \n\t"
:"+d"(i)
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
"movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
"movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
"movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
"movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
"movq %%"s0", %%"t0" \n\t"\
"movq %%"s1", %%"t1" \n\t"\
"movq %%"s2", %%"t2" \n\t"\
"movq %%"s3", %%"t3" \n\t"
static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
x86_reg i = width;
while(i & 15)
{
i--;
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
i+=i;
__asm__ volatile(
"jmp 2f \n\t"
"1: \n\t"
snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
"pcmpeqw %%mm0, %%mm0 \n\t"
"pcmpeqw %%mm2, %%mm2 \n\t"
"paddw %%mm2, %%mm2 \n\t"
"paddw %%mm0, %%mm2 \n\t"
"psllw $13, %%mm2 \n\t"
snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
"pcmpeqw %%mm7, %%mm7 \n\t"
"pcmpeqw %%mm5, %%mm5 \n\t"
"psllw $15, %%mm7 \n\t"
"psrlw $13, %%mm5 \n\t"
"paddw %%mm7, %%mm5 \n\t"
snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
"movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
"movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"pavgw %%mm1, %%mm0 \n\t"
"pavgw %%mm3, %%mm2 \n\t"
"movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
"movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"pavgw %%mm1, %%mm4 \n\t"
"pavgw %%mm3, %%mm6 \n\t"
snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
"2: \n\t"
"sub $32, %%"FF_REG_d" \n\t"
"jge 1b \n\t"
:"+d"(i)
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#endif //HAVE_7REGS
#if HAVE_6REGS
#define snow_inner_add_yblock_sse2_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"FF_REG_c" \n\t"\
"mov %6, %2 \n\t"\
"mov %4, %%"FF_REG_S" \n\t"\
"pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
"pcmpeqd %%xmm3, %%xmm3 \n\t"\
"psllw $15, %%xmm3 \n\t"\
"psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
"1: \n\t"\
"mov %1, %%"FF_REG_D" \n\t"\
"mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
"add %3, %%"FF_REG_D" \n\t"
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
"movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
"movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\
"movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
"movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
"pmullw %%xmm0, %%"out_reg1" \n\t"\
"pmullw %%xmm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
"movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
"movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\
"movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
"movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
"pmullw %%xmm0, %%"out_reg1" \n\t"\
"pmullw %%xmm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
"paddusw %%xmm2, %%xmm1 \n\t"\
"paddusw %%xmm6, %%xmm5 \n\t"
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
"paddusw %%xmm2, %%xmm1 \n\t"\
"paddusw %%xmm6, %%xmm5 \n\t"
#define snow_inner_add_yblock_sse2_end_common1\
"add $32, %%"FF_REG_S" \n\t"\
"add %%"FF_REG_c", %0 \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", (%%"FF_REG_a") \n\t"
#define snow_inner_add_yblock_sse2_end_common2\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
"%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
#define snow_inner_add_yblock_sse2_end_8\
"sal $1, %%"FF_REG_c" \n\t"\
"add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
snow_inner_add_yblock_sse2_end_common1\
"sar $1, %%"FF_REG_c" \n\t"\
"sub $2, %2 \n\t"\
snow_inner_add_yblock_sse2_end_common2
#define snow_inner_add_yblock_sse2_end_16\
"add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
snow_inner_add_yblock_sse2_end_common1\
"dec %2 \n\t"\
snow_inner_add_yblock_sse2_end_common2
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")
"mov %0, %%"FF_REG_d" \n\t"
"movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
"movdqa %%xmm1, %%xmm2 \n\t"
"punpckhwd %%xmm7, %%xmm1 \n\t"
"punpcklwd %%xmm7, %%xmm2 \n\t"
"paddd %%xmm2, %%xmm0 \n\t"
"movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
"paddd %%xmm1, %%xmm2 \n\t"
"paddd %%xmm3, %%xmm0 \n\t"
"paddd %%xmm3, %%xmm2 \n\t"
"mov %1, %%"FF_REG_D" \n\t"
"mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
"add %3, %%"FF_REG_D" \n\t"
"movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
"movdqa %%xmm5, %%xmm6 \n\t"
"punpckhwd %%xmm7, %%xmm5 \n\t"
"punpcklwd %%xmm7, %%xmm6 \n\t"
"paddd %%xmm6, %%xmm4 \n\t"
"movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
"paddd %%xmm5, %%xmm6 \n\t"
"paddd %%xmm3, %%xmm4 \n\t"
"paddd %%xmm3, %%xmm6 \n\t"
"psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
"psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
"packssdw %%xmm2, %%xmm0 \n\t"
"packuswb %%xmm7, %%xmm0 \n\t"
"movq %%xmm0, (%%"FF_REG_d") \n\t"
"psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
"psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
"packssdw %%xmm6, %%xmm4 \n\t"
"packuswb %%xmm7, %%xmm4 \n\t"
"movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
snow_inner_add_yblock_sse2_end_8
}
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")
"mov %0, %%"FF_REG_d" \n\t"
"psrlw $4, %%xmm1 \n\t"
"psrlw $4, %%xmm5 \n\t"
"paddw (%%"FF_REG_D"), %%xmm1 \n\t"
"paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
"paddw %%xmm3, %%xmm1 \n\t"
"paddw %%xmm3, %%xmm5 \n\t"
"psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
"psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
"packuswb %%xmm5, %%xmm1 \n\t"
"movdqu %%xmm1, (%%"FF_REG_d") \n\t"
snow_inner_add_yblock_sse2_end_16
}
#define snow_inner_add_yblock_mmx_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"FF_REG_c" \n\t"\
"mov %6, %2 \n\t"\
"mov %4, %%"FF_REG_S" \n\t"\
"pxor %%mm7, %%mm7 \n\t" /* 0 */\
"pcmpeqd %%mm3, %%mm3 \n\t"\
"psllw $15, %%mm3 \n\t"\
"psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
"1: \n\t"\
"mov %1, %%"FF_REG_D" \n\t"\
"mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
"add %3, %%"FF_REG_D" \n\t"
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
"movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
"movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
"punpcklbw %%mm7, %%"out_reg1" \n\t"\
"punpcklbw %%mm7, %%"out_reg2" \n\t"\
"movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
"movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"pmullw %%mm0, %%"out_reg1" \n\t"\
"pmullw %%mm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
"paddusw %%mm2, %%mm1 \n\t"\
"paddusw %%mm6, %%mm5 \n\t"
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
"mov %0, %%"FF_REG_d" \n\t"\
"psrlw $4, %%mm1 \n\t"\
"psrlw $4, %%mm5 \n\t"\
"paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
"paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psraw $4, %%mm1 \n\t"\
"psraw $4, %%mm5 \n\t"\
"packuswb %%mm5, %%mm1 \n\t"\
"movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"
#define snow_inner_add_yblock_mmx_end(s_step)\
"add $"s_step", %%"FF_REG_S" \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
"add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
"add %%"FF_REG_c", %0 \n\t"\
"dec %2 \n\t"\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
"%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("16", "8")
snow_inner_add_yblock_mmx_end("32")
}
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else if (b_w == 8 && obmc_stride == 16) {
if (!(b_h & 1))
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
} else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else if (b_w == 8 && obmc_stride == 16)
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
#endif /* HAVE_6REGS */
#endif /* HAVE_INLINE_ASM */
av_cold void ff_dwt_init_x86(SnowDWTContext *c)
{
#if HAVE_INLINE_ASM
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
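        /*
         * Editorial note (not in the original source): the "& 0" above makes
         * this condition always false, so the SSE2 compose and
         * inner_add_yblock paths below are kept compilable but deliberately
         * never selected at runtime; only the MMX/MMXEXT branch is reachable.
         */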
c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
#if HAVE_6REGS
c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
#endif
}
else{
if (mm_flags & AV_CPU_FLAG_MMXEXT) {
c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
}
#if HAVE_6REGS
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
#endif
}
}
#endif /* HAVE_INLINE_ASM */
}

View file

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2007 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/svq1enc.h"
int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
intptr_t size);
int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
intptr_t size);
av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
}
}

View file

@@ -0,0 +1,74 @@
/*
* Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/synth_filter.h"
#define SYNTH_FILTER_FUNC(opt) \
void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
const float window[512], \
float out[32], intptr_t offset, float scale); \
static void synth_filter_##opt(FFTContext *imdct, \
float *synth_buf_ptr, int *synth_buf_offset, \
float synth_buf2[32], const float window[512], \
float out[32], const float in[32], float scale) \
{ \
float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
\
imdct->imdct_half(imdct, synth_buf, in); \
\
ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
out, *synth_buf_offset, scale); \
\
*synth_buf_offset = (*synth_buf_offset - 32) & 511; \
} \
#if HAVE_X86ASM
#if ARCH_X86_32
SYNTH_FILTER_FUNC(sse)
#endif
SYNTH_FILTER_FUNC(sse2)
SYNTH_FILTER_FUNC(avx)
SYNTH_FILTER_FUNC(fma3)
#endif /* HAVE_X86ASM */
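/*
 * Editorial note (not in the original source): each synth_filter_<opt>
 * wrapper generated above does three things: imdct_half() writes 32 new
 * samples into the ring buffer at synth_buf_ptr + *synth_buf_offset,
 * ff_synth_filter_inner_<opt>() windows the 512-sample history to produce
 * the 32 output samples, and the offset is then stepped back by 32 modulo
 * 512 so the buffer acts as a circular history.
 */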
av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags)) {
s->synth_filter_float = synth_filter_sse;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
s->synth_filter_float = synth_filter_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
s->synth_filter_float = synth_filter_avx;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
s->synth_filter_float = synth_filter_fma3;
}
#endif /* HAVE_X86ASM */
}

View file

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2015 Paul B Mahol
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/takdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
c->decorrelate_ls = ff_tak_decorrelate_ls_sse2;
c->decorrelate_sr = ff_tak_decorrelate_sr_sse2;
c->decorrelate_sm = ff_tak_decorrelate_sm_sse2;
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
}
#endif
}

View file

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/ttadsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
void ff_tta_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
int32_t *error, int32_t *in, int32_t shift,
int32_t round);
void ff_tta_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
int32_t *error, int32_t *in, int32_t shift,
int32_t round);
av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSSE3(cpu_flags))
c->filter_process = ff_tta_filter_process_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
c->filter_process = ff_tta_filter_process_sse4;
#endif
}

View file

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2014-2016 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/ttaencdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
void ff_ttaenc_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
int32_t *error, int32_t *in, int32_t shift,
int32_t round);
void ff_ttaenc_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
int32_t *error, int32_t *in, int32_t shift,
int32_t round);
av_cold void ff_ttaencdsp_init_x86(TTAEncDSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSSE3(cpu_flags))
c->filter_process = ff_ttaenc_filter_process_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
c->filter_process = ff_ttaenc_filter_process_sse4;
#endif
}

View file

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2017 Paul B Mahol
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/utvideodsp.h"
void ff_restore_rgb_planes_sse2(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
ptrdiff_t linesize_r, ptrdiff_t linesize_g,
ptrdiff_t linesize_b, int width, int height);
void ff_restore_rgb_planes_avx2(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
ptrdiff_t linesize_r, ptrdiff_t linesize_g,
ptrdiff_t linesize_b, int width, int height);
void ff_restore_rgb_planes10_sse2(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
ptrdiff_t linesize_r, ptrdiff_t linesize_g,
ptrdiff_t linesize_b, int width, int height);
void ff_restore_rgb_planes10_avx2(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
ptrdiff_t linesize_r, ptrdiff_t linesize_g,
ptrdiff_t linesize_b, int width, int height);
av_cold void ff_utvideodsp_init_x86(UTVideoDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
c->restore_rgb_planes = ff_restore_rgb_planes_sse2;
c->restore_rgb_planes10 = ff_restore_rgb_planes10_sse2;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->restore_rgb_planes = ff_restore_rgb_planes_avx2;
c->restore_rgb_planes10 = ff_restore_rgb_planes10_avx2;
}
}

View file

@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavcodec/v210dec.h"
extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
av_cold void ff_v210_x86_init(V210DecContext *s)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (s->aligned_input) {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
}
else {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
}
#endif
}

View file

@ -0,0 +1,54 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/cpu.h"
#include "libavcodec/v210enc.h"
void ff_v210_planar_pack_8_ssse3(const uint8_t *y, const uint8_t *u,
const uint8_t *v, uint8_t *dst,
ptrdiff_t width);
void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u,
const uint8_t *v, uint8_t *dst, ptrdiff_t width);
void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
const uint8_t *v, uint8_t *dst, ptrdiff_t width);
void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
const uint16_t *v, uint8_t *dst,
ptrdiff_t width);
void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
const uint16_t *v, uint8_t *dst,
ptrdiff_t width);
av_cold void ff_v210enc_init_x86(V210EncContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSSE3(cpu_flags)) {
s->pack_line_8 = ff_v210_planar_pack_8_ssse3;
s->pack_line_10 = ff_v210_planar_pack_10_ssse3;
}
if (EXTERNAL_AVX(cpu_flags))
s->pack_line_8 = ff_v210_planar_pack_8_avx;
if (EXTERNAL_AVX2(cpu_flags)) {
s->sample_factor_8 = 2;
s->pack_line_8 = ff_v210_planar_pack_8_avx2;
s->sample_factor_10 = 2;
s->pack_line_10 = ff_v210_planar_pack_10_avx2;
}
}

View file

@ -0,0 +1,29 @@
/*
* VC-1 and WMV3 decoder - X86 DSP init functions
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VC1DSP_H
#define AVCODEC_X86_VC1DSP_H
#include "libavcodec/vc1dsp.h"
void ff_vc1dsp_init_mmx(VC1DSPContext *dsp);
void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp);
#endif /* AVCODEC_X86_VC1DSP_H */

View file

@ -0,0 +1,168 @@
/*
* VC-1 and WMV3 - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/vc1dsp.h"
#include "fpel.h"
#include "vc1dsp.h"
#include "config.h"
#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \
ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \
ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}
#if HAVE_X86ASM
LOOP_FILTER(mmxext)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)
void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);
static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
ff_vc1_h_loop_filter8_sse4(src, stride, pq);
ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
#define DECLARE_FUNCTION(OP, DEPTH, INSN) \
static void OP##vc1_mspel_mc00_##DEPTH##INSN(uint8_t *dst, \
const uint8_t *src, ptrdiff_t stride, int rnd) \
{ \
ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH); \
}
DECLARE_FUNCTION(put_, 8, _mmx)
DECLARE_FUNCTION(put_, 16, _mmx)
DECLARE_FUNCTION(avg_, 8, _mmx)
DECLARE_FUNCTION(avg_, 16, _mmx)
DECLARE_FUNCTION(avg_, 8, _mmxext)
DECLARE_FUNCTION(avg_, 16, _mmxext)
DECLARE_FUNCTION(put_, 16, _sse2)
DECLARE_FUNCTION(avg_, 16, _sse2)
#endif /* HAVE_X86ASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (HAVE_6REGS && INLINE_MMX(cpu_flags))
if (EXTERNAL_MMX(cpu_flags))
ff_vc1dsp_init_mmx(dsp);
if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags))
if (EXTERNAL_MMXEXT(cpu_flags))
ff_vc1dsp_init_mmxext(dsp);
#define ASSIGN_LF(EXT) \
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
#if HAVE_X86ASM
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_mmx;
dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_mmx;
dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmx;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_LF(mmxext);
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext;
dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext;
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext;
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_sse2;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_LF(ssse3);
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
}
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,486 @@
/*
* VC-1 and WMV3 - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "fpel.h"
#include "vc1dsp.h"
#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, x86_reg stride,
int rnd, int64_t shift);
void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
const int16_t *src, int rnd);
void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
const int16_t *src, int rnd);
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
/** Add rounder from mm7 to mm3 and pack result at destination */
#define NORMALIZE_MMX(SHIFT) \
"paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
"paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
"psraw "SHIFT", %%mm3 \n\t" \
"psraw "SHIFT", %%mm4 \n\t"
#define TRANSFER_DO_PACK(OP) \
"packuswb %%mm4, %%mm3 \n\t" \
OP((%2), %%mm3) \
"movq %%mm3, (%2) \n\t"
#define TRANSFER_DONT_PACK(OP) \
OP(0(%2), %%mm3) \
OP(8(%2), %%mm4) \
"movq %%mm3, 0(%2) \n\t" \
"movq %%mm4, 8(%2) \n\t"
/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)
/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
"movd "ROUND", %%mm7 \n\t" \
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t"
/**
* Purely vertical or horizontal 1/2 shift interpolation.
* Sacrifice mm6 for *9 factor.
*/
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
x86_reg stride, int rnd, x86_reg offset)\
{\
rnd = 8-rnd;\
__asm__ volatile(\
"mov $8, %%"FF_REG_c" \n\t"\
LOAD_ROUNDER_MMX("%5")\
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
"1: \n\t"\
"movd 0(%0 ), %%mm3 \n\t"\
"movd 4(%0 ), %%mm4 \n\t"\
"movd 0(%0,%2), %%mm1 \n\t"\
"movd 4(%0,%2), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm0, %%mm3 \n\t"\
"punpcklbw %%mm0, %%mm4 \n\t"\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"paddw %%mm1, %%mm3 \n\t"\
"paddw %%mm2, %%mm4 \n\t"\
"movd 0(%0,%3), %%mm1 \n\t"\
"movd 4(%0,%3), %%mm2 \n\t"\
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
"movd 0(%0,%2), %%mm1 \n\t"\
"movd 4(%0,%2), %%mm2 \n\t"\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
NORMALIZE_MMX("$4")\
"packuswb %%mm4, %%mm3 \n\t"\
OP((%1), %%mm3)\
"movq %%mm3, (%1) \n\t"\
"add %6, %0 \n\t"\
"add %4, %1 \n\t"\
"dec %%"FF_REG_c" \n\t"\
"jnz 1b \n\t"\
: "+r"(src), "+r"(dst)\
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
"g"(stride-offset)\
NAMED_CONSTRAINTS_ADD(ff_pw_9)\
: "%"FF_REG_c, "memory"\
);\
}
VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
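/*
 * For reference, a scalar sketch of what the VC1_SHIFT2 asm above computes
 * (illustration only, not FFmpeg code; the function name is made up). Each
 * output pixel is the VC-1 half-pel filter (-1, 9, 9, -1)/16 applied along
 * one axis: offset == 1 gives the horizontal variant, offset == stride the
 * vertical one. The "+ 8 - rnd" term mirrors the "rnd = 8-rnd" bias set up
 * before the asm loop, and the clamp mirrors the packuswb saturation.
 */
static inline void vc1_shift2_scalar_sketch(uint8_t *dst, const uint8_t *src,
                                            ptrdiff_t stride, int rnd,
                                            ptrdiff_t offset)
{
    int x, y;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            /* the caller guarantees the margin samples exist, as for the asm */
            int p = (-src[x - offset] + 9 * src[x] + 9 * src[x + offset]
                     - src[x + 2 * offset] + 8 - rnd) >> 4;
            dst[x] = p < 0 ? 0 : p > 255 ? 255 : p;
        }
        src += stride;
        dst += stride;
    }
}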
/**
* Core of the 1/4 and 3/4 shift bicubic interpolation.
*
* @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
* @param MOVQ "movd 1" for packed 8-bit input, or "movq 2" when the data read is already unpacked to 16 bits.
* @param A1 Address of 1st tap (beware of unpacked/packed).
* @param A2 Address of 2nd tap
* @param A3 Address of 3rd tap
* @param A4 Address of 4th tap
*/
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
MOVQ "*0+"A1", %%mm1 \n\t" \
MOVQ "*4+"A1", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
"pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
MOVQ "*0+"A2", %%mm3 \n\t" \
MOVQ "*4+"A2", %%mm4 \n\t" \
UNPACK("%%mm3") \
UNPACK("%%mm4") \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
"psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
"psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
MOVQ "*0+"A4", %%mm1 \n\t" \
MOVQ "*4+"A4", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psllw $2, %%mm2 \n\t" /* 4* */ \
"psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
"psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
MOVQ "*0+"A3", %%mm1 \n\t" \
MOVQ "*4+"A3", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
"paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
"paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
/**
* Macro to build the vertical 16-bit version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use
* %3 (src_stride) and %4 (3*src_stride).
*
* @param NAME Either shift1 or shift3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
x86_reg src_stride, \
int rnd, int64_t shift) \
{ \
int h = 8; \
src -= src_stride; \
__asm__ volatile( \
LOAD_ROUNDER_MMX("%5") \
"movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
"movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("%6") \
TRANSFER_DONT_PACK(OP_PUT) \
/* Last 3 (in fact 4) bytes on the line */ \
"movd 8+"A1", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"movq %%mm1, %%mm3 \n\t" \
"paddw %%mm1, %%mm1 \n\t" \
"paddw %%mm3, %%mm1 \n\t" /* 3* */ \
"movd 8+"A2", %%mm3 \n\t" \
DO_UNPACK("%%mm3") \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
"movd 8+"A3", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
"movd 8+"A4", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psubw %%mm1, %%mm3 \n\t" \
"paddw %%mm7, %%mm3 \n\t" \
"psraw %6, %%mm3 \n\t" \
"movq %%mm3, 16(%2) \n\t" \
"add %3, %1 \n\t" \
"add $24, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(src_stride), "r"(3*src_stride), \
"m"(rnd), "m"(shift) \
NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
: "memory" \
); \
}
/**
* Macro to build the horizontal 16-bit version of vc1_put_shift[13].
* Here the input samples are 16 bits wide, so A1 to A4 should be simple constant offsets.
*
* @param NAME Either shift1 or shift3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
const int16_t *src, int rnd) \
{ \
int h = 8; \
src -= 1; \
rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
__asm__ volatile( \
LOAD_ROUNDER_MMX("%4") \
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
NORMALIZE_MMX("$7") \
/* Remove bias */ \
"paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
"paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
TRANSFER_DO_PACK(OP) \
"add $24, %1 \n\t" \
"add %3, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(stride), "m"(rnd) \
NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
: "memory" \
); \
}
/**
* Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use
* %3 (offset) and %4 (3*offset).
*
* @param NAME Either shift1 or shift3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
x86_reg stride, int rnd, x86_reg offset) \
{ \
int h = 8; \
src -= offset; \
rnd = 32-rnd; \
__asm__ volatile ( \
LOAD_ROUNDER_MMX("%6") \
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("$6") \
TRANSFER_DO_PACK(OP) \
"add %5, %1 \n\t" \
"add %5, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
: "memory" \
); \
}
/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
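/*
 * Scalar model of MSPEL_FILTER13_CORE as used by the shift1/shift3 builders
 * above (illustration only, not FFmpeg code): one intermediate sample of the
 * VC-1 quarter-pel bicubic filter, before the rounding, shifting and clipping
 * done by NORMALIZE_MMX/TRANSFER_*. t1..t4 correspond to the taps addressed
 * through A1..A4; the shift1 and shift3 variants differ only in which
 * neighbours feed which tap.
 */
static inline int vc1_bicubic_core_sketch(int t1, int t2, int t3, int t4)
{
    return -3 * t1 + 18 * t2 + 53 * t3 - 4 * t4; /* weights sum to 64 */
}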
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
/**
* Interpolate fractional pel values by applying proper vertical then
* horizontal filter.
*
* @param dst Destination buffer for interpolated pels.
* @param src Source buffer.
* @param stride Stride for both src and dst buffers.
* @param hmode Horizontal filter (expressed in quarter pixels shift).
* @param vmode Vertical filter.
* @param rnd Rounding bias.
*/
#define VC1_MSPEL_MC(OP, INSTR)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
int hmode, int vmode, int rnd)\
{\
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
{ NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
{ NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
__asm__ volatile(\
"pxor %%mm0, %%mm0 \n\t"\
::: "memory"\
);\
\
if (vmode) { /* Vertical filter to apply */\
if (hmode) { /* Horizontal filter to apply, output to tmp */\
static const int shift_value[] = { 0, 5, 1, 5 };\
int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
int r;\
LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\
\
r = (1<<(shift-1)) + rnd-1;\
vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
return;\
}\
else { /* No horizontal filter, output 8 lines to dst */\
vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
return;\
}\
}\
\
/* Horizontal mode with no vertical mode */\
vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
} \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
int stride, int hmode, int vmode, int rnd)\
{ \
OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
dst += 8*stride; src += 8*stride; \
OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}
VC1_MSPEL_MC(put_, mmx)
VC1_MSPEL_MC(avg_, mmxext)
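/*
 * A condensed restatement of what VC1_MSPEL_MC expands to (a readability
 * sketch, not used anywhere; "ver", "hor" and "filt8" stand for the entries
 * the macro picks from its per-shift tables). Mixed fractions are handled as
 * a separable filter: a vertical pass into a 16-bit temporary buffer, then a
 * horizontal pass that packs back to 8 bits; a pure vertical or horizontal
 * fraction needs only a single 8-bit pass.
 */
static inline void vc1_mspel_mc_sketch(uint8_t *dst, const uint8_t *src,
                                       int stride, int hmode, int vmode,
                                       int rnd,
                                       vc1_mspel_mc_filter_ver_16bits ver,
                                       vc1_mspel_mc_filter_hor_16bits hor,
                                       vc1_mspel_mc_filter_8bits filt8)
{
    if (vmode && hmode) {
        static const int shift_value[] = { 0, 5, 1, 5 };
        int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
        LOCAL_ALIGNED(16, int16_t, tmp, [12 * 8]);

        /* vertical pass into 16-bit intermediates, then horizontal pack */
        ver(tmp, src - 1, stride, (1 << (shift - 1)) + rnd - 1, shift);
        hor(dst, stride, tmp + 1, 64 - rnd);
    } else if (vmode) {
        filt8(dst, src, stride, 1 - rnd, stride); /* vertical only */
    } else {
        filt8(dst, src, stride, rnd, 1);          /* horizontal only */
    }
}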
/** Macro to ease declaring the bicubic filter interpolation functions */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
const uint8_t *src,\
ptrdiff_t stride, \
int rnd) \
{ \
avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)
DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)
DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)
DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
#define FN_ASSIGN(OP, X, Y, INSN) \
dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
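/*
 * Usage sketch (not FFmpeg code): how the tables filled via FN_ASSIGN in the
 * init functions below are meant to be indexed by the decoder. hmode and
 * vmode are the quarter-pel fractions of the motion vector (0..3); the index
 * layout is hmode + 4*vmode, with [0] holding the 16x16 functions and [1]
 * the 8x8 ones.
 */
static inline void put_luma_16x16_sketch(const VC1DSPContext *dsp,
                                         uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride,
                                         int hmode, int vmode, int rnd)
{
    dsp->put_vc1_mspel_pixels_tab[0][hmode + 4 * vmode](dst, src, stride, rnd);
}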
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
FN_ASSIGN(put_, 0, 1, _mmx);
FN_ASSIGN(put_, 0, 2, _mmx);
FN_ASSIGN(put_, 0, 3, _mmx);
FN_ASSIGN(put_, 1, 0, _mmx);
FN_ASSIGN(put_, 1, 1, _mmx);
FN_ASSIGN(put_, 1, 2, _mmx);
FN_ASSIGN(put_, 1, 3, _mmx);
FN_ASSIGN(put_, 2, 0, _mmx);
FN_ASSIGN(put_, 2, 1, _mmx);
FN_ASSIGN(put_, 2, 2, _mmx);
FN_ASSIGN(put_, 2, 3, _mmx);
FN_ASSIGN(put_, 3, 0, _mmx);
FN_ASSIGN(put_, 3, 1, _mmx);
FN_ASSIGN(put_, 3, 2, _mmx);
FN_ASSIGN(put_, 3, 3, _mmx);
}
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
FN_ASSIGN(avg_, 0, 1, _mmxext);
FN_ASSIGN(avg_, 0, 2, _mmxext);
FN_ASSIGN(avg_, 0, 3, _mmxext);
FN_ASSIGN(avg_, 1, 0, _mmxext);
FN_ASSIGN(avg_, 1, 1, _mmxext);
FN_ASSIGN(avg_, 1, 2, _mmxext);
FN_ASSIGN(avg_, 1, 3, _mmxext);
FN_ASSIGN(avg_, 2, 0, _mmxext);
FN_ASSIGN(avg_, 2, 1, _mmxext);
FN_ASSIGN(avg_, 2, 2, _mmxext);
FN_ASSIGN(avg_, 2, 3, _mmxext);
FN_ASSIGN(avg_, 3, 0, _mmxext);
FN_ASSIGN(avg_, 3, 1, _mmxext);
FN_ASSIGN(avg_, 3, 2, _mmxext);
FN_ASSIGN(avg_, 3, 3, _mmxext);
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */

View file

@ -0,0 +1,309 @@
/*
* Copyright (C) 2002-2012 Michael Niedermayer
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/videodsp.h"
#if HAVE_X86ASM
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh);
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh,
x86_reg w);
extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
&ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx,
&ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx,
&ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx,
&ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
&ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
&ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
&ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
&ff_emu_edge_vfix22_mmx
};
#endif
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
static emu_edge_vfix_func * const vfixtbl_sse[22] = {
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
ff_emu_edge_vfix22_sse
};
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
x86_reg start_x, x86_reg bh);
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
x86_reg start_x, x86_reg n_words, x86_reg bh);
extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
};
#endif
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
#if HAVE_AVX2_EXTERNAL
extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
#endif
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
ptrdiff_t dst_stride,
ptrdiff_t src_stride,
x86_reg block_w, x86_reg block_h,
x86_reg src_x, x86_reg src_y,
x86_reg w, x86_reg h,
emu_edge_vfix_func * const *vfix_tbl,
emu_edge_vvar_func *v_extend_var,
emu_edge_hfix_func * const *hfix_tbl,
emu_edge_hvar_func *h_extend_var)
{
x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
if (!w || !h)
return;
av_assert2(block_w <= FFABS(dst_stride));
if (src_y >= h) {
src -= src_y*src_stride;
src_y_add = h - 1;
src_y = h - 1;
} else if (src_y <= -block_h) {
src -= src_y*src_stride;
src_y_add = 1 - block_h;
src_y = 1 - block_h;
}
if (src_x >= w) {
src += w - 1 - src_x;
src_x = w - 1;
} else if (src_x <= -block_w) {
src += 1 - block_w - src_x;
src_x = 1 - block_w;
}
start_y = FFMAX(0, -src_y);
start_x = FFMAX(0, -src_x);
end_y = FFMIN(block_h, h-src_y);
end_x = FFMIN(block_w, w-src_x);
av_assert2(start_x < end_x && block_w > 0);
av_assert2(start_y < end_y && block_h > 0);
// fill in the to-be-copied part plus all above/below
src += (src_y_add + start_y) * src_stride + start_x;
w = end_x - start_x;
if (w <= 22) {
vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h);
} else {
v_extend_var(dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h, w);
}
// fill left
if (start_x) {
if (start_x <= 22) {
hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
} else {
h_extend_var(dst, dst_stride,
start_x, (start_x + 1) >> 1, block_h);
}
}
// fill right
p = block_w - end_x;
if (p) {
if (p <= 22) {
hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
-!(p & 1), block_h);
} else {
h_extend_var(dst + end_x - (p & 1), dst_stride,
-!(p & 1), (p + 1) >> 1, block_h);
}
}
}
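/*
 * A rough scalar model of what emulated_edge_mc() above produces (for
 * illustration only; not part of FFmpeg and much slower than the fixed-width
 * asm kernels it dispatches to). Every destination pixel is the source-frame
 * pixel at the clamped coordinate, i.e. samples outside the w x h frame are
 * replaced by the nearest edge sample. As in the function above, "src" points
 * at the (possibly out-of-frame) block origin.
 */
static inline void emulated_edge_mc_scalar_sketch(uint8_t *dst,
                                                  const uint8_t *src,
                                                  ptrdiff_t dst_stride,
                                                  ptrdiff_t src_stride,
                                                  int block_w, int block_h,
                                                  int src_x, int src_y,
                                                  int w, int h)
{
    /* recover the frame origin from the block-relative src pointer */
    const uint8_t *frame = src - src_y * src_stride - src_x;
    int x, y;

    for (y = 0; y < block_h; y++)
        for (x = 0; x < block_w; x++) {
            int sx = av_clip(src_x + x, 0, w - 1);
            int sy = av_clip(src_y + y, 0, h - 1);
            dst[y * dst_stride + x] = frame[sy * src_stride + sx];
        }
}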
#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
#endif
static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w,
int h)
{
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
}
#if HAVE_AVX2_EXTERNAL
static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w,
int h)
{
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
}
#endif /* HAVE_AVX2_EXTERNAL */
#endif /* HAVE_X86ASM */
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
ctx->prefetch = ff_prefetch_3dnow;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags)) {
ctx->prefetch = ff_prefetch_mmxext;
}
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse2;
}
#if HAVE_AVX2_EXTERNAL
if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_avx2;
}
#endif
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,42 @@
/*
* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vorbisdsp.h"
void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
intptr_t blocksize);
void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
intptr_t blocksize);
av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags))
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags))
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
}

View file

@ -0,0 +1,71 @@
/*
* Copyright (c) 2009 David Conrad <lessen42@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
void ff_vp3_idct_put_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
const uint8_t *b, ptrdiff_t stride,
int h);
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
#if ARCH_X86_32
c->idct_put = ff_vp3_idct_put_mmx;
c->idct_add = ff_vp3_idct_add_mmx;
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_sse2;
c->idct_add = ff_vp3_idct_add_sse2;
}
}

View file

@ -0,0 +1,51 @@
/**
* VP5 and VP6 compatible video decoder (arith decoder)
*
* Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2010 Eli Friedman
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VP56_ARITH_H
#define AVCODEC_X86_VP56_ARITH_H
#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
#define vp56_rac_get_prob vp56_rac_get_prob
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
unsigned int code_word = vp56_rac_renorm(c);
unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
unsigned int low_shift = low << 16;
int bit = 0;
c->code_word = code_word;
__asm__(
"subl %4, %1 \n\t"
"subl %3, %2 \n\t"
"setae %b0 \n\t"
"cmovb %4, %1 \n\t"
"cmovb %5, %2 \n\t"
: "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
: "r"(low_shift), "r"(low), "r"(code_word)
);
return bit;
}
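/*
 * Plain-C equivalent of the cmov-based asm above (an illustrative sketch; the
 * _sketch name is not part of FFmpeg, and VP56RangeCoder/vp56_rac_renorm()
 * come from vp56.h, which includes this header). The range coder splits the
 * current range "high" at "low" according to prob; the decoded bit says which
 * half code_word falls into, and range/offset are updated accordingly. The
 * asm only makes that selection branchless with setae/cmovb.
 */
static av_always_inline int vp56_rac_get_prob_sketch(VP56RangeCoder *c, uint8_t prob)
{
    unsigned int code_word = vp56_rac_renorm(c);
    unsigned int low       = 1 + (((c->high - 1) * prob) >> 8);
    unsigned int low_shift = low << 16;
    int bit = code_word >= low_shift;

    c->high      = bit ? c->high - low : low;
    c->code_word = bit ? code_word - low_shift : code_word;

    return bit;
}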
#endif
#endif /* AVCODEC_X86_VP56_ARITH_H */

View file

@ -0,0 +1,45 @@
/*
* VP6 MMX/SSE2 optimizations
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp56dsp.h"
void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
const int16_t *h_weights,const int16_t *v_weights);
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
const int16_t *h_weights,const int16_t *v_weights);
av_cold void ff_vp6dsp_init_x86(VP56DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
}
}

View file

@ -0,0 +1,467 @@
/*
* VP8 DSP functions x86-optimized
* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp8dsp.h"
#if HAVE_X86ASM
/*
* MC functions
*/
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
#if ARCH_X86_32
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
#endif
TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v)
TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
src -= srcstride * (TAPNUMY / 2 - 1); \
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
dst, dststride, tmpptr, SIZE, height, mx, my); \
}
#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16)
HVTAP(mmxext, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8)
#endif
HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2, 16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)
HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
tmp, SIZE, src, srcstride, height + 1, mx, my); \
ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
HVBILIN(mmxext, 8, 4, 8)
#if ARCH_X86_32
HVBILIN(mmxext, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16)
#endif
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16)
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
ptrdiff_t stride);
void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
#define DECLARE_LOOP_FILTER(NAME) \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int flim); \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int flim); \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt);
DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
#endif /* HAVE_X86ASM */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
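/* ff_vp78dsp_init_x86() sets up the MC/bilinear functions shared by the
 * VP7 and VP8 decoders; ff_vp8dsp_init_x86() below sets up the VP8-only
 * IDCT and loop-filter functions. */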
av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
#if ARCH_X86_32
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
c->put_vp8_epel_pixels_tab[1][0][0] =
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
}
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_MMXEXT(cpu_flags)) {
VP8_MC_FUNC(2, 4, mmxext);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
#if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
#endif
}
if (EXTERNAL_SSE(cpu_flags)) {
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, ssse3);
VP8_MC_FUNC(1, 8, ssse3);
VP8_MC_FUNC(2, 4, ssse3);
VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
}
#endif /* HAVE_X86ASM */
}
av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
#endif
}
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
#endif
}
if (EXTERNAL_SSE(cpu_flags)) {
c->vp8_idct_add = ff_vp8_idct_add_sse;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
}
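/* The vertical loop filters work on whole rows and are still a win on
 * CPUs flagged as "SSE2 slow"; the horizontal variants (which need
 * transposes) and the IDCT helpers require full SSE2 below. */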
if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
}
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,416 @@
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp9dsp.h"
#include "libavcodec/x86/vp9dsp_init.h"
#if HAVE_X86ASM
decl_fpel_func(put, 4, , mmx);
decl_fpel_func(put, 8, , mmx);
decl_fpel_func(put, 16, , sse);
decl_fpel_func(put, 32, , sse);
decl_fpel_func(put, 64, , sse);
decl_fpel_func(avg, 4, _8, mmxext);
decl_fpel_func(avg, 8, _8, mmxext);
decl_fpel_func(avg, 16, _8, sse2);
decl_fpel_func(avg, 32, _8, sse2);
decl_fpel_func(avg, 64, _8, sse2);
decl_fpel_func(put, 32, , avx);
decl_fpel_func(put, 64, , avx);
decl_fpel_func(avg, 32, _8, avx2);
decl_fpel_func(avg, 64, _8, avx2);
decl_mc_funcs(4, mmxext, int16_t, 8, 8);
decl_mc_funcs(8, sse2, int16_t, 8, 8);
decl_mc_funcs(4, ssse3, int8_t, 32, 8);
decl_mc_funcs(8, ssse3, int8_t, 32, 8);
#if ARCH_X86_64
decl_mc_funcs(16, ssse3, int8_t, 32, 8);
decl_mc_funcs(32, avx2, int8_t, 32, 8);
#endif
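/* mc_rep_funcs (vp9dsp_init.h) builds each wider MC function from two
 * calls to the half-width one, so only the narrow kernels need
 * hand-written asm per instruction set. */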
mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8)
#if ARCH_X86_32
mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8)
#endif
mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8)
mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8)
mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8)
mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8)
#endif
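/* 8-tap coefficient tables exported by the asm: 3 filter families
 * (regular, sharp, smooth) x 15 sub-pel positions, laid out as the
 * SSSE3 and SSE2 kernels respectively expect. */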
extern const int8_t ff_filters_ssse3[3][15][4][32];
extern const int16_t ff_filters_sse2[3][15][8][8];
filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
#endif
filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
#endif
#define itxfm_func(typea, typeb, size, opt) \
void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
int16_t *block, int eob)
#define itxfm_funcs(size, opt) \
itxfm_func(idct, idct, size, opt); \
itxfm_func(iadst, idct, size, opt); \
itxfm_func(idct, iadst, size, opt); \
itxfm_func(iadst, iadst, size, opt)
itxfm_func(idct, idct, 4, mmxext);
itxfm_func(idct, iadst, 4, sse2);
itxfm_func(iadst, idct, 4, sse2);
itxfm_func(iadst, iadst, 4, sse2);
itxfm_funcs(4, ssse3);
itxfm_funcs(8, sse2);
itxfm_funcs(8, ssse3);
itxfm_funcs(8, avx);
itxfm_funcs(16, sse2);
itxfm_funcs(16, ssse3);
itxfm_funcs(16, avx);
itxfm_func(idct, idct, 32, sse2);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
itxfm_funcs(16, avx2);
itxfm_func(idct, idct, 32, avx2);
#undef itxfm_func
#undef itxfm_funcs
#define lpf_funcs(size1, size2, opt) \
void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
int E, int I, int H); \
void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
int E, int I, int H)
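/* size1/size2 name the filter widths across the edge: 4_8 and 8_8 filter
 * a single 8-pixel edge, while 44/48/84/88_16 filter a 16-pixel edge
 * whose two 8-pixel halves use the given width combination. */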
lpf_funcs(4, 8, mmxext);
lpf_funcs(8, 8, mmxext);
lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx);
lpf_funcs(44, 16, sse2);
lpf_funcs(44, 16, ssse3);
lpf_funcs(44, 16, avx);
lpf_funcs(84, 16, sse2);
lpf_funcs(84, 16, ssse3);
lpf_funcs(84, 16, avx);
lpf_funcs(48, 16, sse2);
lpf_funcs(48, 16, ssse3);
lpf_funcs(48, 16, avx);
lpf_funcs(88, 16, sse2);
lpf_funcs(88, 16, ssse3);
lpf_funcs(88, 16, avx);
#undef lpf_funcs
#define ipred_func(size, type, opt) \
void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
const uint8_t *l, const uint8_t *a)
ipred_func(8, v, mmx);
#define ipred_dc_funcs(size, opt) \
ipred_func(size, dc, opt); \
ipred_func(size, dc_left, opt); \
ipred_func(size, dc_top, opt)
ipred_dc_funcs(4, mmxext);
ipred_dc_funcs(8, mmxext);
#define ipred_dir_tm_funcs(size, opt) \
ipred_func(size, tm, opt); \
ipred_func(size, dl, opt); \
ipred_func(size, dr, opt); \
ipred_func(size, hd, opt); \
ipred_func(size, hu, opt); \
ipred_func(size, vl, opt); \
ipred_func(size, vr, opt)
ipred_dir_tm_funcs(4, mmxext);
ipred_func(16, v, sse);
ipred_func(32, v, sse);
ipred_dc_funcs(16, sse2);
ipred_dc_funcs(32, sse2);
#define ipred_dir_tm_h_funcs(size, opt) \
ipred_dir_tm_funcs(size, opt); \
ipred_func(size, h, opt)
ipred_dir_tm_h_funcs(8, sse2);
ipred_dir_tm_h_funcs(16, sse2);
ipred_dir_tm_h_funcs(32, sse2);
ipred_func(4, h, sse2);
#define ipred_all_funcs(size, opt) \
ipred_dc_funcs(size, opt); \
ipred_dir_tm_h_funcs(size, opt)
// FIXME hd/vl_4x4_ssse3 does not exist
ipred_all_funcs(4, ssse3);
ipred_all_funcs(8, ssse3);
ipred_all_funcs(16, ssse3);
ipred_all_funcs(32, ssse3);
ipred_dir_tm_h_funcs(8, avx);
ipred_dir_tm_h_funcs(16, avx);
ipred_dir_tm_h_funcs(32, avx);
ipred_func(32, v, avx);
ipred_dc_funcs(32, avx2);
ipred_func(32, h, avx2);
ipred_func(32, tm, avx2);
#undef ipred_func
#undef ipred_dir_tm_h_funcs
#undef ipred_dir_tm_funcs
#undef ipred_dc_funcs
#endif /* HAVE_X86ASM */
av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
{
#if HAVE_X86ASM
int cpu_flags;
if (bpp == 10) {
ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
return;
} else if (bpp == 12) {
ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
return;
}
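/* Everything below is the 8 bpp path; 10 and 12 bpp are handled entirely
 * by the 16 bpp template inits above. */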
cpu_flags = av_get_cpu_flags();
#define init_lpf(opt) do { \
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
} while (0)
#define init_ipred(sz, opt, t, e) \
dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
#define init_dir_tm_ipred(sz, opt) do { \
init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
init_ipred(sz, opt, hd, HOR_DOWN); \
init_ipred(sz, opt, vl, VERT_LEFT); \
init_ipred(sz, opt, hu, HOR_UP); \
init_ipred(sz, opt, tm, TM_VP8); \
init_ipred(sz, opt, vr, VERT_RIGHT); \
} while (0)
#define init_dir_tm_h_ipred(sz, opt) do { \
init_dir_tm_ipred(sz, opt); \
init_ipred(sz, opt, h, HOR); \
} while (0)
#define init_dc_ipred(sz, opt) do { \
init_ipred(sz, opt, dc, DC); \
init_ipred(sz, opt, dc_left, LEFT_DC); \
init_ipred(sz, opt, dc_top, TOP_DC); \
} while (0)
#define init_all_ipred(sz, opt) do { \
init_dc_ipred(sz, opt); \
init_dir_tm_h_ipred(sz, opt); \
} while (0)
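/* The ipred/lpf init macros above are local; init_fpel_func and the
 * init_subpel* helpers used below come from vp9dsp_init.h. */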
if (EXTERNAL_MMX(cpu_flags)) {
init_fpel_func(4, 0, 4, put, , mmx);
init_fpel_func(3, 0, 8, put, , mmx);
if (!bitexact) {
dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
}
init_ipred(8, mmx, v, VERT);
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext;
dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext;
dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext;
dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
init_subpel2(4, 0, 4, put, 8, mmxext);
init_subpel2(4, 1, 4, avg, 8, mmxext);
init_fpel_func(4, 1, 4, avg, _8, mmxext);
init_fpel_func(3, 1, 8, avg, _8, mmxext);
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
init_dc_ipred(4, mmxext);
init_dc_ipred(8, mmxext);
init_dir_tm_ipred(4, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
init_fpel_func(2, 0, 16, put, , sse);
init_fpel_func(1, 0, 32, put, , sse);
init_fpel_func(0, 0, 64, put, , sse);
init_ipred(16, sse, v, VERT);
init_ipred(32, sse, v, VERT);
}
if (EXTERNAL_SSE2(cpu_flags)) {
init_subpel3_8to64(0, put, 8, sse2);
init_subpel3_8to64(1, avg, 8, sse2);
init_fpel_func(2, 1, 16, avg, _8, sse2);
init_fpel_func(1, 1, 32, avg, _8, sse2);
init_fpel_func(0, 1, 64, avg, _8, sse2);
init_lpf(sse2);
dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2;
dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2;
dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
init_dc_ipred(16, sse2);
init_dc_ipred(32, sse2);
init_dir_tm_h_ipred(8, sse2);
init_dir_tm_h_ipred(16, sse2);
init_dir_tm_h_ipred(32, sse2);
init_ipred(4, sse2, h, HOR);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
init_subpel3(0, put, 8, ssse3);
init_subpel3(1, avg, 8, ssse3);
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3;
dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3;
dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
init_lpf(ssse3);
init_all_ipred(4, ssse3);
init_all_ipred(8, ssse3);
init_all_ipred(16, ssse3);
init_all_ipred(32, ssse3);
}
if (EXTERNAL_AVX(cpu_flags)) {
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx;
dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx;
dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
init_lpf(avx);
init_dir_tm_h_ipred(8, avx);
init_dir_tm_h_ipred(16, avx);
init_dir_tm_h_ipred(32, avx);
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
init_fpel_func(1, 0, 32, put, , avx);
init_fpel_func(0, 0, 64, put, , avx);
init_ipred(32, avx, v, VERT);
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
init_fpel_func(1, 1, 32, avg, _8, avx2);
init_fpel_func(0, 1, 64, avg, _8, avx2);
if (ARCH_X86_64) {
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
init_subpel3_32_64(0, put, 8, avx2);
init_subpel3_32_64(1, avg, 8, avx2);
#endif
}
init_dc_ipred(32, avx2);
init_ipred(32, avx2, h, HOR);
init_ipred(32, avx2, tm, TM_VP8);
}
#undef init_fpel
#undef init_subpel1
#undef init_subpel2
#undef init_subpel3
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,189 @@
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VP9DSP_INIT_H
#define AVCODEC_X86_VP9DSP_INIT_H
#include "libavcodec/vp9dsp.h"
// hack to force-expand BPC
#define cat(a, bpp, b) a##bpp##b
#define decl_fpel_func(avg, sz, bpp, opt) \
void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, const type (*filter)[f_sz])
#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
#define decl_ipred_fn(type, sz, bpp, opt) \
void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *l, \
const uint8_t *a)
#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
decl_ipred_fn(type, 4, bpp, opt4); \
decl_ipred_fn(type, 8, bpp, opt8_16_32); \
decl_ipred_fn(type, 16, bpp, opt8_16_32); \
decl_ipred_fn(type, 32, bpp, opt8_16_32)
#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
ptrdiff_t stride, \
int16_t *block, \
int eob)
#define decl_itxfm_funcs(size, bpp, opt) \
decl_itxfm_func(idct, idct, size, bpp, opt); \
decl_itxfm_func(iadst, idct, size, bpp, opt); \
decl_itxfm_func(idct, iadst, size, bpp, opt); \
decl_itxfm_func(iadst, iadst, size, bpp, opt)
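/* mc_rep_func stitches a sz-wide MC function out of two hsz-wide calls;
 * hszb is the half width in bytes (hsz scaled by the pixel size). */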
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, const type (*filter)[f_sz]) \
{ \
ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \
src_stride, h, filter); \
ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
src_stride, h, filter); \
}
#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \
mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \
mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \
mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
h, ff_filters_##f_opt[f][dvar - 1]); \
}
#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, bpp, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, opt)
#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
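/* 2D 8-tap MC: run the horizontal pass into a 64-column temp buffer over
 * h + 7 rows starting 3 rows above the block (the vertical tap
 * footprint), then run the vertical pass from row 3 of the temp. */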
#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
src_stride, h + 7, \
ff_filters_##f_opt[f][mx - 1]); \
ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
64 * bytes, h, \
ff_filters_##f_opt[f][my - 1]); \
}
#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt)
#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
type##_8tap_sharp_##sz##dir##_##bpp##_##opt
#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
init_subpel1(idx1, idx2, 0, 1, sz, v, type, bpp, opt); \
init_subpel1(idx1, idx2, 1, 0, sz, h, type, bpp, opt)
#define init_subpel3_32_64(idx, type, bpp, opt) \
init_subpel2(0, idx, 64, type, bpp, opt); \
init_subpel2(1, idx, 32, type, bpp, opt)
#define init_subpel3_8to64(idx, type, bpp, opt) \
init_subpel3_32_64(idx, type, bpp, opt); \
init_subpel2(2, idx, 16, type, bpp, opt); \
init_subpel2(3, idx, 8, type, bpp, opt)
#define init_subpel3(idx, type, bpp, opt) \
init_subpel3_8to64(idx, type, bpp, opt); \
init_subpel2(4, idx, 4, type, bpp, opt)
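/* init_subpel3 fills the sub-pel part of dsp->mc[]: idx1 selects the
 * block size (0 = 64x64 down to 4 = 4x4), idx selects put (0) or avg (1),
 * and the h/v/hv slots are filled per 8-tap filter family. */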
#define init_ipred_func(type, enum, sz, bpp, opt) \
dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
init_ipred_func(type, enum, 8, bpp, opt); \
init_ipred_func(type, enum, 16, bpp, opt); \
init_ipred_func(type, enum, 32, bpp, opt)
#define init_ipred_funcs(type, enum, bpp, opt) \
init_ipred_func(type, enum, 4, bpp, opt); \
init_8_16_32_ipred_funcs(type, enum, bpp, opt)
void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact);
void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact);
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
#endif /* AVCODEC_X86_VP9DSP_INIT_H */

View file

@ -0,0 +1,25 @@
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPC 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
#include "vp9dsp_init_16bpp_template.c"

View file

@ -0,0 +1,25 @@
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPC 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
#include "vp9dsp_init_16bpp_template.c"

Some files were not shown because too many files have changed in this diff.