mirror of https://github.com/ossrs/srs.git synced 2025-03-09 15:49:59 +00:00

Rename ffmpeg-4.2-fit to ffmpeg-4-fit

winlin 2021-03-02 17:48:40 +08:00
parent b19074721c
commit 27712fdda7
720 changed files with 14 additions and 14 deletions


@@ -0,0 +1,199 @@
OBJS += x86/constants.o \

# subsystems
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o
OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
x86/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
x86/mpegvideoencdsp_init.o
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \
x86/sbrdsp_init.o
OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o
OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp_init.o
OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_init.o
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp_init.o
OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
x86/vp9dsp_init_10bpp.o \
x86/vp9dsp_init_12bpp.o \
x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
MMX-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_mmx.o
# decoders/encoders
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
# subsystems
X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
x86/ac3dsp_downmix.o
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o
X86ASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
x86/h264_deblock_10bit.o \
x86/h264_idct.o \
x86/h264_idct_10bit.o \
x86/h264_weight.o \
x86/h264_weight_10bit.o
X86ASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
X86ASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o \
x86/fpel.o \
x86/qpel.o
X86ASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
X86ASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_DECODER) += x86/opusdsp.o
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/celt_pvq_search.o
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
x86/fpel.o \
x86/qpel.o
X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \
x86/vc1dsp_mc.o
X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o \
x86/simple_idct.o
X86ASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
X86ASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
X86ASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
x86/vp8dsp_loopfilter.o
# decoders/encoders
X86ASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \
x86/sbrdsp.o
X86ASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
x86/dirac_dwt.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
ifdef CONFIG_GPL
X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
endif
X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
x86/hevc_mc.o \
x86/hevc_sao.o \
x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o
X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
X86ASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
X86ASM-OBJS-$(CONFIG_TTA_ENCODER) += x86/ttaencdsp.o
X86ASM-OBJS-$(CONFIG_UTVIDEO_DECODER) += x86/utvideodsp.o
X86ASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
X86ASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
X86ASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
X86ASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o
X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
x86/vp9itxfm_16bpp.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
x86/vp9mc.o \
x86/vp9mc_16bpp.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o


@@ -0,0 +1,86 @@
;******************************************************************************
;* SIMD optimized AAC encoder DSP functions
;*
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
float_abs_mask: times 4 dd 0x7fffffff
SECTION .text
;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
mova m2, [float_abs_mask]
shl sizeq, 2
add inq, sizeq
add outq, sizeq
neg sizeq
.loop:
andps m0, m2, [inq+sizeq]
sqrtps m1, m0
mulps m0, m1
sqrtps m0, m0
mova [outq+sizeq], m0
add sizeq, mmsize
jl .loop
RET
;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
; int size, int is_signed, int maxval, const float Q34,
; const float rounding)
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
%if UNIX64 == 0
movss m0, Q34m
movss m1, roundingm
cvtsi2ss m3, dword maxvalm
%else
cvtsi2ss m3, maxvald
%endif
shufps m0, m0, 0
shufps m1, m1, 0
shufps m3, m3, 0
shl is_signedd, 31
movd m4, is_signedd
shufps m4, m4, 0
shl sized, 2
add inq, sizeq
add outq, sizeq
add scaledq, sizeq
neg sizeq
.loop:
mulps m2, m0, [scaledq+sizeq]
addps m2, m1
minps m2, m3
andps m5, m4, [inq+sizeq]
orps m2, m5
cvttps2dq m2, m2
mova [outq+sizeq], m2
add sizeq, mmsize
jl .loop
RET
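The two routines above have simple scalar equivalents; the following is a minimal C sketch of what each SIMD loop computes (the *_ref names and this code are illustrative only and are not part of the diff):

#include <math.h>

/* abs_pow34: out[i] = |in[i]|^(3/4), computed as sqrt(|x| * sqrt(|x|)),
 * mirroring the andps/sqrtps/mulps/sqrtps sequence above. */
static void abs_pow34_ref(float *out, const float *in, int size)
{
    for (int i = 0; i < size; i++) {
        float a = fabsf(in[i]);
        out[i] = sqrtf(a * sqrtf(a));
    }
}

/* aac_quantize_bands: scale, round, clamp to maxval, then truncate to int;
 * when is_signed is set, the sign of in[i] is carried over to the result. */
static void quantize_bands_ref(int *out, const float *in, const float *scaled,
                               int size, int is_signed, int maxval,
                               float Q34, float rounding)
{
    for (int i = 0; i < size; i++) {
        float q = scaled[i] * Q34 + rounding;
        if (q > maxval)
            q = maxval;
        if (is_signed && in[i] < 0.0f)
            q = -q;
        out[i] = (int)q;  /* truncation toward zero, like cvttps2dq */
    }
}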


@@ -0,0 +1,43 @@
/*
* AAC encoder assembly optimizations
* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/float_dsp.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/aacenc.h"
void ff_abs_pow34_sse(float *out, const float *in, const int size);
void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
int size, int is_signed, int maxval, const float Q34,
const float rounding);
av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags))
s->abs_pow34 = ff_abs_pow34_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->quant_bands = ff_aac_quantize_bands_sse2;
}


@@ -0,0 +1,487 @@
;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
SECTION .text
;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
shl nd, 3
add srcq, nq
neg nq
align 16
.loop:
movaps m0, [srcq+nq]
movaps m1, [srcq+nq+mmsize]
mulps m0, m0
mulps m1, m1
HADDPS m0, m1, m2
addps m0, [dstq]
movaps [dstq], m0
add dstq, mmsize
add nq, mmsize*2
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3
;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
; float *src1, int n);
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
shl nd, 3
add src1q, nq
add dstq, nq
neg nq
align 16
.loop:
movu m0, [src1q+nq]
movu m1, [src1q+nq+mmsize]
mova m2, [src2q]
mova m3, m2
unpcklps m2, m2
unpckhps m3, m3
mulps m0, m2
mulps m1, m3
mova [dstq+nq], m0
mova [dstq+nq+mmsize], m1
add src2q, mmsize
add nq, mmsize*2
jl .loop
REP_RET
;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [h_stepq]
unpcklps m4, m0, m0
unpckhps m0, m0
unpcklps m5, m1, m1
unpckhps m1, m1
shl nd, 3
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m4, m5
addps m0, m1
movddup m2, [lq+nq]
movddup m3, [rq+nq]
mulps m2, m4
mulps m3, m0
addps m2, m3
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
REP_RET
;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
movaps m0, [hq]
movaps m1, [hq+mmsize]
%if ARCH_X86_64
movaps m8, [h_stepq]
movaps m9, [h_stepq+mmsize]
%define H_STEP0 m8
%define H_STEP1 m9
%else
%define H_STEP0 [h_stepq]
%define H_STEP1 [h_stepq+mmsize]
%endif
shl nd, 3
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m0, H_STEP0
addps m1, H_STEP1
movddup m2, [lq+nq]
movddup m3, [rq+nq]
shufps m4, m2, m2, q2301
shufps m5, m3, m3, q2301
unpcklps m6, m0, m0
unpckhps m7, m0, m0
mulps m2, m6
mulps m3, m7
unpcklps m6, m1, m1
unpckhps m7, m1, m1
mulps m4, m6
mulps m5, m7
addps m2, m3
addsubps m2, m4
addsubps m2, m5
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
REP_RET
;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;**********************************************************
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
movsxdifnidn iq, id
mov lend, 32 << 3
lea inq, [inq+iq*4]
mov tmpd, id
shl tmpd, 8
add outq, tmpq
mov tmpd, 64
sub tmpd, id
mov id, tmpd
test id, 1
jne .loop4
test id, 2
jne .loop8
align 16
.loop16:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop16:
movaps m0, [in0q]
movaps m1, [in1q]
movaps m2, [in0q+lenq]
movaps m3, [in1q+lenq]
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [outq], m0
movaps [outq+lenq], m1
movaps [outq+lenq*2], m2
movaps [outq+3*32*2*4], m3
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add inq, 16
add outq, 3*32*2*4
sub id, 4
jg .loop16
RET
align 16
.loop8:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop8:
movlps m0, [in0q]
movlps m1, [in1q]
movhps m0, [in0q+lenq]
movhps m1, [in1q+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
movaps [outq], m0
movaps [outq+lenq], m1
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add inq, 8
add outq, lenq
sub id, 2
jg .loop16
RET
align 16
.loop4:
mov in0q, inq
mov in1q, 38*64*4
add in1q, in0q
mov tmpd, lend
.inner_loop4:
movss m0, [in0q]
movss m1, [in1q]
movss m2, [in0q+lenq]
movss m3, [in1q+lenq]
movlhps m0, m1
movlhps m2, m3
shufps m0, m2, q2020
movaps [outq], m0
lea in0q, [in0q+lenq*2]
lea in1q, [in1q+lenq*2]
add outq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add inq, 4
sub id, 1
test id, 2
jne .loop8
cmp id, 4
jge .loop16
RET
;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
; float (*in)[32][2],
; int i, int len)
;***********************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
movsxdifnidn iq, id
mov lend, 32 << 3
lea outq, [outq+iq*4]
mov tmpd, id
shl tmpd, 8
add inq, tmpq
mov tmpd, 64
sub tmpd, id
mov id, tmpd
test id, 1
jne .loop4
test id, 2
jne .loop8
align 16
.loop16:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop16:
movaps m0, [inq]
movaps m1, [inq+lenq]
movaps m2, [inq+lenq*2]
movaps m3, [inq+3*32*2*4]
TRANSPOSE4x4PS 0, 1, 2, 3, 4
movaps [out0q], m0
movaps [out1q], m1
movaps [out0q+lenq], m2
movaps [out1q+lenq], m3
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop16
add outq, 16
add inq, 3*32*2*4
sub id, 4
jg .loop16
RET
align 16
.loop8:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop8:
movaps m0, [inq]
movaps m1, [inq+lenq]
SBUTTERFLYPS 0, 1, 2
SBUTTERFLYPD 0, 1, 2
MOVH [out0q], m0
MOVH [out1q], m1
movhps [out0q+lenq], m0
movhps [out1q+lenq], m1
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop8
add outq, 8
add inq, lenq
sub id, 2
jg .loop16
RET
align 16
.loop4:
mov out0q, outq
mov out1q, 38*64*4
add out1q, out0q
mov tmpd, lend
.inner_loop4:
movaps m0, [inq]
movss [out0q], m0
%if cpuflag(sse4)
extractps [out1q], m0, 1
extractps [out0q+lenq], m0, 2
extractps [out1q+lenq], m0, 3
%else
movhlps m1, m0
movss [out0q+lenq], m1
shufps m0, m0, 0xb1
movss [out1q], m0
movhlps m1, m0
movss [out1q+lenq], m1
%endif
lea out0q, [out0q+lenq*2]
lea out1q, [out1q+lenq*2]
add inq, mmsize
sub tmpd, mmsize
jg .inner_loop4
add outq, 4
sub id, 1
test id, 2
jne .loop8
cmp id, 4
jge .loop16
RET
%endmacro
INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
; ptrdiff_t stride, int n);
;*******************************************************************
%macro PS_HYBRID_ANALYSIS_LOOP 3
movu %1, [inq+mmsize*%3]
movu m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
pshufd %2, %1, q2301
pshufd m4, m1, q0123
pshufd m1, m1, q1032
pshufd m2, [filterq+nq+mmsize*%3], q2301
addsubps %2, m4
addsubps %1, m1
%else
mova m2, [filterq+nq+mmsize*%3]
mova %2, %1
mova m4, m1
shufps %2, %2, q2301
shufps m4, m4, q0123
shufps m1, m1, q1032
shufps m2, m2, q2301
xorps m4, m7
xorps m1, m7
subps %2, m4
subps %1, m1
%endif
mulps %2, m2
mulps %1, m2
%if %3
addps m3, %2
addps m0, %1
%endif
%endmacro
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
shl strideq, 3
shl nd, 6
add filterq, nq
neg nq
mova m7, [ps_p1m1p1m1]
align 16
.loop:
PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
%if cpuflag(sse3)
pshufd m3, m3, q2301
xorps m0, m7
hsubps m3, m0
pshufd m1, m3, q0020
pshufd m3, m3, q0031
addps m1, m3
movsd m2, [inq+6*8]
%else
mova m1, m3
mova m2, m0
shufps m1, m1, q2301
shufps m2, m2, q2301
subps m1, m3
addps m2, m0
unpcklps m3, m1, m2
unpckhps m1, m2
addps m1, m3
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
movss m3, [filterq+nq+8*6]
SPLATD m3
mulps m2, m3
addps m1, m2
MOVH [outq], m1
add outq, strideq
add nq, 64
jl .loop
REP_RET
%endmacro
INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS
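As a reading aid for the assembly above, the first two parametric-stereo kernels boil down to the following scalar loops (a sketch; the *_ref names are made up here and do not appear in this commit):

/* ps_add_squares: accumulate the squared magnitude of each complex sample. */
static void ps_add_squares_ref(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}

/* ps_mul_pair_single: scale each complex pair in src0 by a real gain from src1. */
static void ps_mul_pair_single_ref(float (*dst)[2], float (*src0)[2],
                                   float *src1, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src0[i][0] * src1[i];
        dst[i][1] = src0[i][1] * src1[i];
    }
}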


@@ -0,0 +1,72 @@
/*
* SIMD optimized MPEG-4 Parametric Stereo decoding functions
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/aacpsdsp.h"
void ff_ps_add_squares_sse (float *dst, const float (*src)[2], int n);
void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
float *src1, int n);
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
ptrdiff_t stride, int n);
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
int i, int len);
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
int i, int len);
void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
int i, int len);
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
s->add_squares = ff_ps_add_squares_sse;
s->mul_pair_single = ff_ps_mul_pair_single_sse;
s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
}
if (EXTERNAL_SSE3(cpu_flags)) {
s->add_squares = ff_ps_add_squares_sse3;
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
}
}


@@ -0,0 +1,164 @@
/*
* x86-optimized AC-3 DSP functions
* Copyright (c) 2011 Justin Ruggles
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"
void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!bit_exact) {
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
if (bit_exact) {
c->apply_window_int16 = ff_apply_window_int16_mmxext;
} else {
c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
}
}
if (EXTERNAL_SSE(cpu_flags)) {
c->float_to_fixed24 = ff_float_to_fixed24_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
c->extract_exponents = ff_ac3_extract_exponents_sse2;
if (bit_exact) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
}
}
if (EXTERNAL_SSE2_FAST(cpu_flags)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
if (!bit_exact) {
c->apply_window_int16 = ff_apply_window_int16_round_sse2;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
if (cpu_flags & AV_CPU_FLAG_ATOM) {
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
} else {
c->extract_exponents = ff_ac3_extract_exponents_ssse3;
c->apply_window_int16 = ff_apply_window_int16_ssse3;
}
}
}
#define DOWNMIX_FUNC_OPT(ch, opt) \
void ff_ac3_downmix_ ## ch ## _to_1_ ## opt(float **samples, \
float **matrix, int len); \
void ff_ac3_downmix_ ## ch ## _to_2_ ## opt(float **samples, \
float **matrix, int len);
#define DOWNMIX_FUNCS(opt) \
DOWNMIX_FUNC_OPT(3, opt) \
DOWNMIX_FUNC_OPT(4, opt) \
DOWNMIX_FUNC_OPT(5, opt) \
DOWNMIX_FUNC_OPT(6, opt)
DOWNMIX_FUNCS(sse)
DOWNMIX_FUNCS(avx)
DOWNMIX_FUNCS(fma3)
void ff_ac3dsp_set_downmix_x86(AC3DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#define SET_DOWNMIX(ch, suf, SUF) \
if (ch == c->in_channels) { \
if (EXTERNAL_ ## SUF (cpu_flags)) { \
if (c->out_channels == 1) \
c->downmix = ff_ac3_downmix_ ## ch ## _to_1_ ## suf; \
else \
c->downmix = ff_ac3_downmix_ ## ch ## _to_2_ ## suf; \
} \
}
#define SET_DOWNMIX_ALL(suf, SUF) \
SET_DOWNMIX(3, suf, SUF) \
SET_DOWNMIX(4, suf, SUF) \
SET_DOWNMIX(5, suf, SUF) \
SET_DOWNMIX(6, suf, SUF)
SET_DOWNMIX_ALL(sse, SSE)
if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
SET_DOWNMIX_ALL(avx, AVX)
SET_DOWNMIX_ALL(fma3, FMA3)
}
}
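To make the SET_DOWNMIX machinery above easier to follow, expanding SET_DOWNMIX(3, sse, SSE) by hand gives roughly the code below (derived from the macro text in this file, not from generated output):

if (3 == c->in_channels) {
    if (EXTERNAL_SSE(cpu_flags)) {
        if (c->out_channels == 1)
            c->downmix = ff_ac3_downmix_3_to_1_sse;
        else
            c->downmix = ff_ac3_downmix_3_to_2_sse;
    }
}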


@@ -0,0 +1,44 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/alacdsp.h"
#include "config.h"
void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples,
int decorr_shift, int decorr_left_weight);
void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
int extra_bits, int channels, int nb_samples);
void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
int extra_bits, int channels, int nb_samples);
av_cold void ff_alacdsp_init_x86(ALACDSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;
c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2;
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_stereo = ff_alac_decorrelate_stereo_sse4;
}
#endif /* HAVE_X86ASM */
}


@@ -0,0 +1,66 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/audiodsp.h"
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clipf_sse(float *dst, const float *src,
int len, float min, float max);
av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags))
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
if (EXTERNAL_MMXEXT(cpu_flags))
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
if (EXTERNAL_SSE(cpu_flags))
c->vector_clipf = ff_vector_clipf_sse;
if (EXTERNAL_SSE2(cpu_flags)) {
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
if (cpu_flags & AV_CPU_FLAG_ATOM)
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
else
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (EXTERNAL_SSE4(cpu_flags))
c->vector_clip_int32 = ff_vector_clip_int32_sse4;
}


@@ -0,0 +1,60 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/blockdsp.h"
#include "libavcodec/version.h"
void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);
void ff_clear_block_avx(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
void ff_clear_blocks_avx(int16_t *blocks);
av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
AVCodecContext *avctx)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->clear_block = ff_clear_block_mmx;
c->clear_blocks = ff_clear_blocks_mmx;
}
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
return;
if (EXTERNAL_SSE(cpu_flags)) {
c->clear_block = ff_clear_block_sse;
c->clear_blocks = ff_clear_blocks_sse;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
c->clear_block = ff_clear_block_avx;
c->clear_blocks = ff_clear_blocks_avx;
}
#endif /* HAVE_X86ASM */
}


@@ -0,0 +1,40 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/bswapdsp.h"
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags))
c->bswap_buf = ff_bswap32_buf_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
c->bswap_buf = ff_bswap32_buf_ssse3;
if (EXTERNAL_AVX2_FAST(cpu_flags))
c->bswap_buf = ff_bswap32_buf_avx2;
}


@@ -0,0 +1,301 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CABAC_H
#define AVCODEC_X86_CABAC_H
#include "libavcodec/cabac.h"
#include "libavutil/attributes.h"
#include "libavutil/macros.h"
#include "libavutil/x86/asm.h"
#include "config.h"
#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
|| ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
|| (defined(__INTEL_COMPILER) && defined(_MSC_VER))
# define BROKEN_COMPILER 1
#else
# define BROKEN_COMPILER 0
#endif
#if HAVE_INLINE_ASM
#ifndef UNCHECKED_BITSTREAM_READER
#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
#endif
#if UNCHECKED_BITSTREAM_READER
#define END_CHECK(end) ""
#else
#define END_CHECK(end) \
"cmp "end" , %%"FF_REG_c" \n\t"\
"jge 1f \n\t"
#endif
#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%rcx , %%rcx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%rcx , "retq" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
/* P4 Prescott has crappy cmov,sbb,64-bit shift so avoid them */ \
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t"\
"sub %%ecx , "range" \n\t"\
"and "tmp" , "range" \n\t"\
"add %%ecx , "range" \n\t"\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"\
"movslq "ret" , "retq" \n\t"
#endif /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"lea ("ret", "range", 2), %%ecx \n\t"\
"movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
"jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
#else /* BROKEN_RELOCATIONS */
#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
#define RIP_ARG
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%ecx , %%ecx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t" /*lps_mask*/\
"sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
"and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
"add %%ecx , "range" \n\t" /*new range*/\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
" jnz 2f \n\t"\
"mov "byte" , %%"FF_REG_c" \n\t"\
END_CHECK(end)\
"add"FF_OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
#endif /* BROKEN_RELOCATIONS */
#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
{
int bit, tmp;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
"%2", "%q2", "%3", "%b3",
"%c6(%5)", "%c7(%5)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%8")
: "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
,"1"(c->low), "2"(c->range)
: "%"FF_REG_c, "memory"
);
return bit & 1;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"xor %%edx, %%ecx \n\t"
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
#if UNCHECKED_BITSTREAM_READER
"add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
"mov %1, %c4(%2) \n\t"
#else
"addl %%edx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
#endif
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
: "+c"(val), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
return val;
}
#define get_cabac_bypass get_cabac_bypass_x86
static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
{
x86_reg tmp;
int res;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cdq \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"inc %%edx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%ecx \n\t"
"bswap %%ecx \n\t"
"shrl $15, %%ecx \n\t"
"addl %%ecx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"FF_OPSIZE" $2, %c4(%2) \n\t"
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
: "=&d"(res), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%ecx", "memory"
);
return res;
}
#endif /* !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */
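For orientation, the bypass paths above implement the same decode step as the plain-C bypass reader; a condensed sketch follows (get_cabac_bypass_ref is an illustrative name, CABAC_BITS == 16 and the unchecked reader are assumed, and the refill step is condensed from the asm's own subl/movzwl/bswap/shr sequence):

static int get_cabac_bypass_ref(CABACContext *c)
{
    c->low += c->low;                      /* shift one bypass bit into the coder window */
    if (!(c->low & 0xFFFF)) {              /* low 16 bits consumed: refill from the bytestream */
        c->low += (c->bytestream[0] << 9) + (c->bytestream[1] << 1) - 0xFFFF;
        c->bytestream += 2;
    }
    if (c->low < (c->range << 17))
        return 0;
    c->low -= c->range << 17;
    return 1;
}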


@@ -0,0 +1,463 @@
/*
* Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
*
* MMX-optimized DSP functions, based on H.264 optimizations by
* Michael Niedermayer and Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/cavsdsp.h"
#include "libavcodec/idctdsp.h"
#include "constants.h"
#include "fpel.h"
#include "idctdsp.h"
#include "config.h"
#if HAVE_MMX_EXTERNAL
void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
LOCAL_ALIGNED(16, int16_t, b2, [64]);
ff_cavs_idct8_mmx(b2, block);
ff_add_pixels_clamped_mmx(b2, dst, stride);
}
void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);
static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
LOCAL_ALIGNED(16, int16_t, b2, [64]);
ff_cavs_idct8_sse2(b2, block);
ff_add_pixels_clamped_sse2(b2, dst, stride);
}
#endif /* HAVE_MMX_EXTERNAL */
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
/*****************************************************************************
*
* motion compensation
*
****************************************************************************/
/* vertical filter [-1 -2 96 42 -7 0] */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\
"psllw $3, "#E" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $3, "#E" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#E", %%mm6 \n\t"\
"paddw "#B", "#B" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $1, "#B" \n\t"\
"psubw "#A", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -1 5 5 -1 0] */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm6\n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -7 42 96 -2 -1] */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL1)", %%mm7\n\t"\
"psllw $3, "#B" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $3, "#B" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#B", %%mm6 \n\t"\
"paddw "#E", "#E" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $1, "#E" \n\t"\
"psubw "#F", %%mm6 \n\t"\
"paddw "MANGLE(ADD)", %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
: "memory"\
);\
}\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}
#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm5, %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q) \
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
: "memory"\
);\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride, int h)\
{ \
QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t srcStride)\
{ \
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define CAVS_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMXEXT_OP(a, b, temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
#if HAVE_MMX_EXTERNAL
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels8_mmx(dst, src, stride, 8);
}
static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels8_mmx(dst, src, stride, 8);
}
static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels8_mmxext(dst, src, stride, 8);
}
static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_mmx(dst, src, stride, 16);
}
static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_mmx(dst, src, stride, 16);
}
static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_mmxext(dst, src, stride, 16);
}
static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#endif
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
AVCodecContext *avctx)
{
#if HAVE_MMX_EXTERNAL
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
c->cavs_idct8_add = cavs_idct8_add_mmx;
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
#endif /* HAVE_MMX_EXTERNAL */
}
#define DSPFUNC(PFX, IDX, NUM, EXT) \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT;
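/* Note (added): DSPFUNC fills slots 2, 4, 8 and 12 of the 16-entry qpel table;
 * assuming the usual x + 4*y quarter-pel indexing, these are the mc20 half-pel
 * horizontal case and the mc01/mc02/mc03 vertical cases that CAVS_MC generates
 * above. */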
#if HAVE_MMXEXT_INLINE
QPEL_CAVS(put_, PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)
CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
AVCodecContext *avctx)
{
DSPFUNC(put, 0, 16, 3dnow);
DSPFUNC(put, 1, 8, 3dnow);
DSPFUNC(avg, 0, 16, 3dnow);
DSPFUNC(avg, 1, 8, 3dnow);
}
#endif /* HAVE_AMD3DNOW_INLINE */
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
av_unused int cpu_flags = av_get_cpu_flags();
if (X86_MMX(cpu_flags))
cavsdsp_init_mmx(c, avctx);
#if HAVE_AMD3DNOW_INLINE
if (INLINE_AMD3DNOW(cpu_flags))
cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags)) {
DSPFUNC(put, 0, 16, mmxext);
DSPFUNC(put, 1, 8, mmxext);
DSPFUNC(avg, 0, 16, mmxext);
DSPFUNC(avg, 1, 8, mmxext);
}
#endif
#if HAVE_MMX_EXTERNAL
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
}
#endif
#if HAVE_SSE2_EXTERNAL
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
c->cavs_idct8_add = cavs_idct8_add_sse2;
c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
}
#endif
}


@ -0,0 +1,43 @@
/*
* Opus encoder assembly optimizations
* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/opus_pvq.h"
extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N);
av_cold void ff_celt_pvq_init_x86(CeltPVQ *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags))
s->pvq_search = ff_pvq_search_approx_sse2;
if (EXTERNAL_SSE4(cpu_flags))
s->pvq_search = ff_pvq_search_approx_sse4;
if (EXTERNAL_AVX_FAST(cpu_flags))
s->pvq_search = ff_pvq_search_exact_avx;
}
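/* Illustrative sketch (not part of the original file): after this init runs,
 * callers use the selected implementation through the function pointer, e.g.
 *
 *     CeltPVQ *pvq = ...;                      // hypothetical context
 *     float syy = pvq->pvq_search(X, y, K, N); // SSE2/SSE4/AVX chosen above
 *
 * so all three variants stay interchangeable behind one signature. */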


@ -0,0 +1,385 @@
;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "config.asm"
%include "libavutil/x86/x86util.asm"
%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif
SECTION_RODATA 64
const_float_abs_mask: times 8 dd 0x7fffffff
const_align_abs_edge: times 8 dd 0
const_float_0_5: times 8 dd 0.5
const_float_1: times 8 dd 1.0
const_float_sign_mask: times 8 dd 0x80000000
const_int32_offsets:
%rep 8
dd $-const_int32_offsets
%endrep
SECTION .text
;
; Set up a high register to be used
; for holding memory constants
;
; %1 - mov opcode to be used
; %2 - the register to be used; assumes it is >= mm8
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32-bit arch or
; "addps m0, m8" on 64-bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
%define mm_%3 %2
%{1} %2, [%3] ; movaps m8, [const_name]
%else
%define mm_%3 [%3]
%endif
%endmacro
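; Example (added for illustration, mirroring the use further below):
;   SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
; on builds with more than 8 mm registers defines mm_const_float_0_5 as m8 and
; emits "movaps m8, [const_float_0_5]"; otherwise it only defines
; mm_const_float_0_5 as the memory operand [const_float_0_5] and emits nothing.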
;
; Set up the Position Independent Code
; base address of a constant
;
; %1 - the opcode to be used (e.g. lea)
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name + r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled or
; "movaps m0, [constant_name + r4]" if text relocations are used
%macro SET_PIC_BASE 3; reg, const_label
%ifdef PIC
%{1} %2, [%3] ; lea r5, [rip+const]
%define pic_base_%3 %2
%else
%define pic_base_%3 %3
%endif
%endmacro
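; Example (added for illustration, mirroring the use in PVQ_FAST_SEARCH below):
;   SET_PIC_BASE lea, r5, const_align_abs_edge
; with PIC defined emits the lea that loads the constant's address into r5 and
; defines pic_base_const_align_abs_edge as r5; without PIC it only aliases
; pic_base_const_align_abs_edge to the label itself and emits nothing.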
%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
pxor m1, m1 ; max_idx
xorps m3, m3 ; p_max
xor r4d, r4d
align 16
%%distortion_search:
movd xm2, dword r4d ; movd zero extends
%ifidn %1,add
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
%if USE_APPROXIMATION == 1
xorps m0, m0
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
%endif
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
%if USE_APPROXIMATION == 1
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
%endif
%else
movaps m5, [tmpY + r4] ; m5 = y[i]
xorps m0, m0 ; m0 = 0;
cmpps m0, m0, m5, 1 ; m0 = (0<y)
subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
subps m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
andps m5, m0 ; (0<y)?m5:0
%endif
%if USE_APPROXIMATION == 1
rsqrtps m4, m4
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
mulps m5, m5
divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
VPBROADCASTD m2, xm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later)
cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
maxps m3, m5 ; m3=max(p_max,p)
; maxps here is faster than blendvps, despite blend having lower latency.
pand m2, m0 ; This version seems faster than sse41 pblendvb
pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4
add r4d, mmsize
cmp r4d, Nd
jb %%distortion_search
por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing
%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)
vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[1x128] : p[0x128]
%endif
; Merge parallel maximums round 4 (2 vs 2)
; m3=p[3210]
movhlps xm5, xm3 ; m5=p[xx32]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
pshufd xm2, xm1, q3232
BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[3,2] : p[1,0]
; Merge parallel maximums final round (1 vs 1)
shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )
pshufd xm2, xm1, q1111
PBLENDVB xm1, xm2, xm0
movd dword r4d, xm1 ; zero extends to the rest of r4q
VBROADCASTSS m3, [tmpX + r4]
%{1}ps m7, m3 ; Sxy += X[max_idx]
VBROADCASTSS m5, [tmpY + r4]
%{1}ps m6, m5 ; Syy += Y[max_idx]
; We have to update a single element in Y[i]
; However, writing 4 bytes and then doing a 16-byte load in the inner loop
; could cause a stall because it breaks store-to-load forwarding.
VPBROADCASTD m1, xm1
pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it
and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load
movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0]
andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
%{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0;
%endmacro
;
; We need one more register for
; PIC-relative addressing; it is
; counted via num_pic_regs in cglobal
;
%ifdef PIC
%define num_pic_regs 1
%else
%define num_pic_regs 0
%endif
;
; Pyramid Vector Quantization Search implementation
;
; float * inX - Unaligned (SIMD) access; it will be over-read,
; but the extra data is masked away.
; int32 * outY - Should be an aligned and padded buffer.
; It is also used as a temp buffer.
; uint32 K - Number of pulses to have after quantization.
; uint32 N - Number of vector elements. Must be 0 < N < 256
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq
movaps m0, [const_float_abs_mask]
shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
mov r4d, Nd
neg r4d
and r4d, mmsize-1
SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
movups m2, [pic_base_const_align_abs_edge + r4 - mmsize]
add Nd, r4d ; N = align(N, mmsize)
lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
movups m1, [inXq + r4]
andps m1, m2
movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] )
align 16
%%loop_abs_sum:
sub r4d, mmsize
jc %%end_loop_abs_sum
movups m2, [inXq + r4]
andps m2, m0
movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
addps m1, m2 ; Sx += abs(X[i])
jmp %%loop_abs_sum
align 16
%%end_loop_abs_sum:
HSUMPS m1, m2 ; m1 = Sx
xorps m0, m0
comiss xm0, xm1 ;
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
%if USE_APPROXIMATION == 1
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
%else
divss xm0, xm1 ; b = K/Sx
; b = K/max_x
%endif
VBROADCASTSS m0, xm0
lea r4d, [Nd - mmsize]
pxor m5, m5 ; Sy ( Sum of abs( y[i]) )
xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
align 16
%%loop_guess:
movaps m1, [tmpX + r4] ; m1 = X[i]
mulps m2, m0, m1 ; m2 = res*X[i]
cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
paddd m5, m2 ; Sy += yt
cvtdq2ps m2, m2 ; yt = (float)yt
mulps m1, m2 ; m1 = X[i]*yt
movaps [tmpY + r4], m2 ; y[i] = m2
addps m7, m1 ; Sxy += m1;
mulps m2, m2 ; m2 = yt*yt
addps m6, m2 ; Syy += m2
sub r4d, mmsize
jnc %%loop_guess
HSUMPS m6, m1 ; Syy_norm
HADDD m5, m4 ; pulses
movd dword r4d, xm5 ; zero extends to the rest of r4q
sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
jz %%finish ; K - pulses == 0
SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1
SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in the distortion parameter calculations.
; This saves pre- and post-calculation corrections to the Y[] values.
; Precision is the same, since the float mantissa is normalized.
; The SQRT approximation does differ, though.
HSUMPS m7, m0 ; Sxy_norm
mulps m6, mm_const_float_0_5
jc %%remove_pulses_loop ; K - pulses < 0
align 16 ; K - pulses > 0
%%add_pulses_loop:
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
sub Kd, 1
jnz %%add_pulses_loop
addps m6, m6 ; Syy*=2
jmp %%finish
align 16
%%remove_pulses_loop:
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
add Kd, 1
jnz %%remove_pulses_loop
addps m6, m6 ; Syy*=2
align 16
%%finish:
lea r4d, [Nd - mmsize]
movaps m2, [const_float_sign_mask]
align 16
%%restore_sign_loop:
movaps m0, [tmpY + r4] ; m0 = Y[i]
movups m1, [inXq + r4] ; m1 = X[i]
andps m1, m2 ; m1 = sign(X[i])
orps m0, m1 ; m0 = Y[i]*sign
cvtps2dq m3, m0 ; m3 = (int)m0
movaps [outYq + r4], m3
sub r4d, mmsize
jnc %%restore_sign_loop
%%return:
%if ARCH_X86_64 == 0 ; sbrdsp
movss r0m, xm6 ; return (float)Syy_norm
fld dword r0m
%else
movaps m0, m6 ; return (float)Syy_norm
%endif
RET
align 16
%%zero_input:
lea r4d, [Nd - mmsize]
xorps m0, m0
%%zero_loop:
movaps [outYq + r4], m0
sub r4d, mmsize
jnc %%zero_loop
movaps m6, [const_float_1]
jmp %%return
%endmacro
; If 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full-precision code about 2% slower.
; Opus also uses the rsqrt approximation in its intrinsics code.
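; Added note: inside PULSES_SEARCH this selects between
;   p = Sxy_new * rsqrtps(Syy_new)      (USE_APPROXIMATION == 1)
;   p = Sxy_new * Sxy_new / Syy_new     (USE_APPROXIMATION == 0)
; For non-negative Sxy_new both orderings agree, so the chosen max_idx is
; normally the same and only the rsqrt rounding differs.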
%define USE_APPROXIMATION 1
INIT_XMM sse2
PVQ_FAST_SEARCH _approx
INIT_XMM sse4
PVQ_FAST_SEARCH _approx
%define USE_APPROXIMATION 0
INIT_XMM avx
PVQ_FAST_SEARCH _exact


@ -0,0 +1,94 @@
/*
* MMX/SSE/AVX constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL,
0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
0x0100010001000100ULL, 0x0100010001000100ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL,
0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
0x0400040004000400ULL, 0x0400040004000400ULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
0x0800080008000800ULL, 0x0800080008000800ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
0x1000100010001000ULL, 0x1000100010001000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
0x2000200020002000ULL, 0x2000200020002000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL,
0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL,
0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x0202020202020202ULL,
0x0202020202020202ULL, 0x0202020202020202ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL,
0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
0x0000000100000001ULL, 0x0000000100000001ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
0x0000001000000010ULL, 0x0000001000000010ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
0x0000002000000020ULL, 0x0000002000000020ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
0x0000200000002000ULL, 0x0000200000002000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };


@ -0,0 +1,72 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CONSTANTS_H
#define AVCODEC_X86_CONSTANTS_H
#include <stdint.h>
#include "libavutil/x86/asm.h"
extern const ymm_reg ff_pw_1;
extern const ymm_reg ff_pw_2;
extern const xmm_reg ff_pw_3;
extern const ymm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
extern const xmm_reg ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
extern const xmm_reg ff_pw_20;
extern const xmm_reg ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const ymm_reg ff_pw_255;
extern const ymm_reg ff_pw_256;
extern const ymm_reg ff_pw_512;
extern const ymm_reg ff_pw_1023;
extern const ymm_reg ff_pw_1024;
extern const ymm_reg ff_pw_2048;
extern const ymm_reg ff_pw_4095;
extern const ymm_reg ff_pw_4096;
extern const ymm_reg ff_pw_8192;
extern const ymm_reg ff_pw_m1;
extern const ymm_reg ff_pb_0;
extern const ymm_reg ff_pb_1;
extern const ymm_reg ff_pb_2;
extern const ymm_reg ff_pb_3;
extern const ymm_reg ff_pb_80;
extern const ymm_reg ff_pb_FE;
extern const uint64_t ff_pb_FC;
extern const xmm_reg ff_ps_neg;
extern const ymm_reg ff_pd_1;
extern const ymm_reg ff_pd_16;
extern const ymm_reg ff_pd_32;
extern const ymm_reg ff_pd_8192;
extern const ymm_reg ff_pd_65535;
#endif /* AVCODEC_X86_CONSTANTS_H */


@ -0,0 +1,52 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"
#define LFE_FIR_FLOAT_FUNC(opt) \
void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
const float *filter_coeff, ptrdiff_t npcmblocks); \
void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
const float *filter_coeff, ptrdiff_t npcmblocks);
LFE_FIR_FLOAT_FUNC(sse)
LFE_FIR_FLOAT_FUNC(sse2)
LFE_FIR_FLOAT_FUNC(sse3)
LFE_FIR_FLOAT_FUNC(avx)
LFE_FIR_FLOAT_FUNC(fma3)
av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
if (EXTERNAL_SSE3(cpu_flags))
s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
if (EXTERNAL_AVX(cpu_flags)) {
s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
}
if (EXTERNAL_FMA3(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}
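/* Note (added): the checks run from oldest to newest extension, so the last
 * matching assignment wins; e.g. on a CPU with SSE2, AVX and FMA3 the
 * lfe_fir_float[0] pointer ends up at ff_lfe_fir0_float_fma3, while
 * lfe_fir_float[1] keeps the AVX version. */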


@ -0,0 +1,41 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
av_cold void ff_dct_init_x86(DCTContext *s)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags))
s->dct32 = ff_dct32_float_sse;
#endif
if (EXTERNAL_SSE2(cpu_flags))
s->dct32 = ff_dct32_float_sse2;
if (EXTERNAL_AVX_FAST(cpu_flags))
s->dct32 = ff_dct32_float_avx;
}


@ -0,0 +1,229 @@
/*
* x86 optimized discrete wavelet transform
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dirac_dwt.h"
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
\
static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
int16_t *b2 = (int16_t *)_b2; \
\
for(i=width_align; i<width; i++) \
b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
int16_t *b2 = (int16_t *)_b2; \
\
for(i=width_align; i<width; i++) \
b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
uint8_t *_b3, uint8_t *_b4, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
int16_t *b2 = (int16_t *)_b2; \
int16_t *b3 = (int16_t *)_b3; \
int16_t *b4 = (int16_t *)_b4; \
\
for(i=width_align; i<width; i++) \
b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
uint8_t *_b3, uint8_t *_b4, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
int16_t *b2 = (int16_t *)_b2; \
int16_t *b3 = (int16_t *)_b3; \
int16_t *b4 = (int16_t *)_b4; \
\
for(i=width_align; i<width; i++) \
b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
{ \
int i, width_align = width&~(align-1); \
int16_t *b0 = (int16_t *)_b0; \
int16_t *b1 = (int16_t *)_b1; \
\
for(i=width_align; i<width; i++) { \
b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
} \
\
ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
int w2= w>>1;\
int x= w2 - (w2&(align-1));\
int16_t *b = (int16_t *)_b; \
int16_t *tmp = (int16_t *)_tmp; \
\
ff_horizontal_compose_haar0i##ext(b, tmp, w);\
\
for (; x < w2; x++) {\
b[2*x ] = tmp[x];\
b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
}\
}\
static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
{\
int w2= w>>1;\
int x= w2 - (w2&(align-1));\
int16_t *b = (int16_t *)_b; \
int16_t *tmp = (int16_t *)_tmp; \
\
ff_horizontal_compose_haar1i##ext(b, tmp, w);\
\
for (; x < w2; x++) {\
b[2*x ] = (tmp[x] + 1)>>1;\
b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
}\
}
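/* Note (added): the vertical wrappers above hand only the aligned prefix
 * (width_align) to the corresponding ff_ assembly routine and finish the
 * remaining columns in scalar C, while the horizontal haar wrappers call the
 * assembly on the full width and then compute the last (w2 & (align-1))
 * output pairs in C. */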
#if HAVE_X86ASM
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)
void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);
static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
{
int w2= w>>1;
int x= w2 - (w2&7);
int16_t *b = (int16_t *)_b;
int16_t *tmp = (int16_t *)_tmp;
ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
for (; x < w2; x++) {
b[2*x ] = (tmp[x] + 1)>>1;
b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
}
}
#endif
void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
{
#if HAVE_X86ASM
int mm_flags = av_get_cpu_flags();
#if !ARCH_X86_64
if (!(mm_flags & AV_CPU_FLAG_MMX))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
break;
case DWT_DIRAC_DD13_7:
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_mmx;
d->horizontal_compose = horizontal_compose_haar0i_mmx;
break;
case DWT_DIRAC_HAAR1:
d->vertical_compose = (void*)vertical_compose_haar_mmx;
d->horizontal_compose = horizontal_compose_haar1i_mmx;
break;
}
#endif
if (!(mm_flags & AV_CPU_FLAG_SSE2))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
break;
case DWT_DIRAC_DD13_7:
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
d->horizontal_compose = horizontal_compose_haar0i_sse2;
break;
case DWT_DIRAC_HAAR1:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
d->horizontal_compose = horizontal_compose_haar1i_sse2;
break;
}
if (!(mm_flags & AV_CPU_FLAG_SSSE3))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->horizontal_compose = horizontal_compose_dd97i_ssse3;
break;
}
#endif // HAVE_X86ASM
}


@ -0,0 +1,195 @@
/*
* Copyright (C) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/cpu.h"
#include "libavcodec/diracdsp.h"
#include "fpel.h"
DECL_DIRAC_PIXOP(put, mmx);
DECL_DIRAC_PIXOP(avg, mmx);
DECL_DIRAC_PIXOP(avg, mmxext);
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
#if HAVE_X86ASM
#define HPEL_FILTER(MMSIZE, EXT) \
void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
\
static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
const uint8_t *src, int stride, int width, int height) \
{ \
while( height-- ) \
{ \
ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
\
dsth += stride; \
dstv += stride; \
dstc += stride; \
src += stride; \
} \
}
#define PIXFUNC(PFX, IDX, EXT) \
/*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
else\
OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
else\
OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3) {\
ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
} else {\
OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}\
}
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
DIRAC_PIXOP(avg, ff_avg, mmxext)
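/* Note (added): like the SSE2 wrappers below, the functions generated above
 * fall back to the C versions whenever (h & 3) != 0, so the SIMD paths only
 * handle heights that are a multiple of 4 rows. */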
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3)
ff_put_dirac_pixels16_c(dst, src, stride, h);
else
ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3)
ff_avg_dirac_pixels16_c(dst, src, stride, h);
else
ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3) {
ff_put_dirac_pixels32_c(dst, src, stride, h);
} else {
ff_put_pixels16_sse2(dst , src[0] , stride, h);
ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3) {
ff_avg_dirac_pixels32_c(dst, src, stride, h);
} else {
ff_avg_pixels16_sse2(dst , src[0] , stride, h);
ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
}
#else // HAVE_X86ASM
#define HPEL_FILTER(MMSIZE, EXT) \
void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
const uint8_t *src, int stride, int width, int height);
#define PIXFUNC(PFX, IDX, EXT) do {} while (0)
#endif // HAVE_X86ASM
#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)
void ff_diracdsp_init_x86(DiracDSPContext* c)
{
int mm_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(mm_flags)) {
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
#if !ARCH_X86_64
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
c->dirac_hpel_filter = dirac_hpel_filter_mmx;
c->add_rect_clamped = ff_add_rect_clamped_mmx;
c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
#endif
PIXFUNC(put, 0, mmx);
PIXFUNC(avg, 0, mmx);
}
if (EXTERNAL_MMXEXT(mm_flags)) {
PIXFUNC(avg, 0, mmxext);
}
if (EXTERNAL_SSE2(mm_flags)) {
c->dirac_hpel_filter = dirac_hpel_filter_sse2;
c->add_rect_clamped = ff_add_rect_clamped_sse2;
c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
}
if (EXTERNAL_SSE4(mm_flags)) {
c->dequant_subband[1] = ff_dequant_subband_32_sse4;
c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
}
}


@ -0,0 +1,37 @@
/*
* VC3/DNxHD SIMD functions
* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dnxhdenc.h"
void ff_get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
if (EXTERNAL_SSE2(av_get_cpu_flags())) {
if (ctx->cid_table->bit_depth == 8)
ctx->get_pixels_8x4_sym = ff_get_pixels_8x4_sym_sse2;
}
}


@ -0,0 +1,52 @@
/*
* OpenEXR (.exr) image decoder
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/exrdsp.h"
void ff_reorder_pixels_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
void ff_reorder_pixels_avx2(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
void ff_predictor_ssse3(uint8_t *src, ptrdiff_t size);
void ff_predictor_avx(uint8_t *src, ptrdiff_t size);
void ff_predictor_avx2(uint8_t *src, ptrdiff_t size);
av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->reorder_pixels = ff_reorder_pixels_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
dsp->predictor = ff_predictor_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
dsp->predictor = ff_predictor_avx;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
dsp->reorder_pixels = ff_reorder_pixels_avx2;
dsp->predictor = ff_predictor_avx2;
}
}


@ -0,0 +1,594 @@
/*
* SIMD-optimized forward DCT
* The gcc porting is Copyright (c) 2001 Fabrice Bellard.
* cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
*
* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
*
* Intel Application Note AP-922 - fast, precise implementation of DCT
* http://developer.intel.com/vtune/cbts/appnotes.htm
*
* Also of inspiration:
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
* Skal's fdct at http://skal.planet-d.net/coding/dct.html
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "fdct.h"
#if HAVE_MMX_INLINE
//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries! Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
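/* Worked values (added for reference): with BITS_FRW_ACC == 3 the shifts are
 * SHIFT_FRW_COL == 3 and SHIFT_FRW_ROW == 3 + 17 - 3 == 17, so
 * RND_FRW_ROW == 1 << 16 == 65536, i.e. row results are rounded to nearest
 * before the final 17-bit arithmetic right shift. */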
#define X8(x) x,x,x,x,x,x,x,x
//concatenated table, for forward DCT transformation
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
X8(13036), // tg * (2<<16) + 0.5
X8(27146), // tg * (2<<16) + 0.5
X8(-21746) // tg * (2<<16) + 0.5
};
DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
X8(23170) //cos * (2<<15) + 0.5
};
DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
static const struct
{
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
{{
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
};
static const struct
{
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
C4, C4, C5, C7, C2, C6, C3, -C7, \
-C4, C4, C7, C3, C6, -C2, C7, -C5, \
C4, -C4, C5, -C1, C2, -C6, C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};
#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
__asm__ volatile (\
#mov" 16(%0), %%"#mm"0 \n\t" \
#mov" 96(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"0, %%"#mm"2 \n\t" \
#mov" 32(%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" 80(%0), %%"#mm"4 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
#mov" (%0), %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
"paddsw 112(%0), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
#mov" %%"#mm"0, %%"#mm"6 \n\t" \
"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
#mov" 16(%1), %%"#mm"1 \n\t" \
"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
#mov" 48(%0), %%"#mm"7 \n\t" \
"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
"paddsw 64(%0), %%"#mm"7 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
#mov" %%"#mm"5, %%"#mm"4 \n\t" \
"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
"por (%2), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
"pmulhw 16(%1), %%"#mm"5 \n\t" \
#mov" %%"#mm"4, %%"#mm"7 \n\t" \
"psubsw 80(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" %%"#mm"1, 32(%3) \n\t" \
"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" 48(%0), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
"psubsw 64(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"2, %%"#mm"6 \n\t" \
#mov" %%"#mm"4, 64(%3) \n\t" \
"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
"pmulhw (%4), %%"#mm"2 \n\t" \
"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
"pmulhw (%4), %%"#mm"6 \n\t" \
"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
"por (%2), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
"por (%2), %%"#mm"2 \n\t" \
#mov" %%"#mm"1, %%"#mm"4 \n\t" \
#mov" (%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
"psubsw 112(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" (%1), %%"#mm"0 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
#mov" 32(%1), %%"#mm"6 \n\t" \
"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" %%"#mm"7, (%3) \n\t" \
"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
#mov" %%"#mm"5, 96(%3) \n\t" \
#mov" %%"#mm"3, %%"#mm"7 \n\t" \
#mov" 32(%1), %%"#mm"5 \n\t" \
"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"pmulhw (%1), %%"#mm"3 \n\t" \
"por (%2), %%"#mm"0 \n\t" \
"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" %%"#mm"0, 16(%3) \n\t" \
"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
#mov" %%"#mm"7, 48(%3) \n\t" \
"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
#mov" %%"#mm"5, 80(%3) \n\t" \
#mov" %%"#mm"3, 112(%3) \n\t" \
: \
: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
"r" (out + offset), "r" (ocos_4_16)); \
}
FDCT_COL(mmx, mm, movq)
FDCT_COL(sse2, xmm, movdqa)
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
__asm__ volatile(
#define FDCT_ROW_SSE2_H1(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t" \
"movdqa " #t "(%1), %%xmm4 \n\t" \
"movdqa " #t "+16(%1), %%xmm5 \n\t"
#define FDCT_ROW_SSE2_H2(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t"
#define FDCT_ROW_SSE2(i) \
"movq %%xmm2, %%xmm1 \n\t" \
"pshuflw $27, %%xmm0, %%xmm0 \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"psubsw %%xmm0, %%xmm2 \n\t" \
"punpckldq %%xmm2, %%xmm1 \n\t" \
"pshufd $78, %%xmm1, %%xmm2 \n\t" \
"pmaddwd %%xmm2, %%xmm3 \n\t" \
"pmaddwd %%xmm1, %%xmm7 \n\t" \
"pmaddwd %%xmm5, %%xmm2 \n\t" \
"pmaddwd %%xmm4, %%xmm1 \n\t" \
"paddd %%xmm7, %%xmm3 \n\t" \
"paddd %%xmm2, %%xmm1 \n\t" \
"paddd %%xmm6, %%xmm3 \n\t" \
"paddd %%xmm6, %%xmm1 \n\t" \
"psrad %3, %%xmm3 \n\t" \
"psrad %3, %%xmm1 \n\t" \
"packssdw %%xmm3, %%xmm1 \n\t" \
"movdqa %%xmm1, " #i "(%4) \n\t"
"movdqa (%2), %%xmm6 \n\t"
FDCT_ROW_SSE2_H1(0,0)
FDCT_ROW_SSE2(0)
FDCT_ROW_SSE2_H2(64,0)
FDCT_ROW_SSE2(64)
FDCT_ROW_SSE2_H1(16,64)
FDCT_ROW_SSE2(16)
FDCT_ROW_SSE2_H2(112,64)
FDCT_ROW_SSE2(112)
FDCT_ROW_SSE2_H1(32,128)
FDCT_ROW_SSE2(32)
FDCT_ROW_SSE2_H2(96,128)
FDCT_ROW_SSE2(96)
FDCT_ROW_SSE2_H1(48,192)
FDCT_ROW_SSE2(48)
FDCT_ROW_SSE2_H2(80,192)
FDCT_ROW_SSE2(80)
:
: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
"r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
const int16_t *table)
{
__asm__ volatile (
"pshufw $0x1B, 8(%0), %%mm5 \n\t"
"movq (%0), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"psubsw %%mm5, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm1, %%mm0 \n\t"
"punpckhdq %%mm1, %%mm2 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, (%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
__asm__ volatile(
"movd 12(%0), %%mm1 \n\t"
"punpcklwd 8(%0), %%mm1 \n\t"
"movq %%mm1, %%mm2 \n\t"
"psrlq $0x20, %%mm1 \n\t"
"movq 0(%0), %%mm0 \n\t"
"punpcklwd %%mm2, %%mm1 \n\t"
"movq %%mm0, %%mm5 \n\t"
"paddsw %%mm1, %%mm0 \n\t"
"psubsw %%mm1, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm5, %%mm0 \n\t"
"punpckhdq %%mm5, %%mm2 \n\t"
"movq 0(%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, 0(%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
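/* Full 8x8 forward DCT: two column passes (left and right halves of the
 * block) into a temporary buffer, then one row pass per line, with the
 * coefficient table advancing by 32 int16_t entries per row. */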
void ff_fdct_mmx(int16_t *block)
{
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
int16_t * block1= (int16_t*)align_tmp;
const int16_t *table= tab_frw_01234567;
int i;
fdct_col_mmx(block, block1, 0);
fdct_col_mmx(block, block1, 4);
for(i=8;i>0;i--) {
fdct_row_mmx(block1, block, table);
block1 += 8;
table += 32;
block += 8;
}
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
void ff_fdct_mmxext(int16_t *block)
{
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
int16_t *block1= (int16_t*)align_tmp;
const int16_t *table= tab_frw_01234567;
int i;
fdct_col_mmx(block, block1, 0);
fdct_col_mmx(block, block1, 4);
for(i=8;i>0;i--) {
fdct_row_mmxext(block1, block, table);
block1 += 8;
table += 32;
block += 8;
}
}
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_SSE2_INLINE
void ff_fdct_sse2(int16_t *block)
{
DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
int16_t * const block1= (int16_t*)align_tmp;
fdct_col_sse2(block, block1, 0);
fdct_row_sse2(block1, block);
}
#endif /* HAVE_SSE2_INLINE */

View file

@ -0,0 +1,28 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FDCT_H
#define AVCODEC_X86_FDCT_H
#include <stdint.h>
void ff_fdct_mmx(int16_t *block);
void ff_fdct_mmxext(int16_t *block);
void ff_fdct_sse2(int16_t *block);
#endif /* AVCODEC_X86_FDCT_H */

View file

@ -0,0 +1,44 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fdctdsp.h"
#include "fdct.h"
av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
const int dct_algo = avctx->dct_algo;
if (!high_bit_depth) {
if ((dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) {
if (INLINE_MMX(cpu_flags))
c->fdct = ff_fdct_mmx;
if (INLINE_MMXEXT(cpu_flags))
c->fdct = ff_fdct_mmxext;
if (INLINE_SSE2(cpu_flags))
c->fdct = ff_fdct_sse2;
}
}
}

File diff suppressed because it is too large

View file

@ -0,0 +1,38 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H
#include "libavcodec/fft.h"
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
#endif /* AVCODEC_X86_FFT_H */

View file

@ -0,0 +1,61 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "fft.h"
av_cold void ff_fft_init_x86(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (s->nbits > 16)
return;
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
s->imdct_calc = ff_imdct_calc_3dnow;
s->imdct_half = ff_imdct_half_3dnow;
s->fft_calc = ff_fft_calc_3dnow;
}
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
s->imdct_calc = ff_imdct_calc_3dnowext;
s->imdct_half = ff_imdct_half_3dnowext;
s->fft_calc = ff_fft_calc_3dnowext;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_sse;
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
}
if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
s->imdct_half = ff_imdct_half_avx;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
}
}

View file

@ -0,0 +1,115 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/flacdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
#define DECORRELATE_FUNCS(fmt, opt) \
void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift); \
void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
int len, int shift)
DECORRELATE_FUNCS(16, sse2);
DECORRELATE_FUNCS(16, avx);
DECORRELATE_FUNCS(32, sse2);
DECORRELATE_FUNCS(32, avx);
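/* Decorrelation dispatch: the independent-channel kernels are picked by
 * sample format and channel count, with the 8-channel variants restricted to
 * x86-64; the LPC routines are selected separately (SSE4, then XOP). */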
av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
int bps)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if CONFIG_FLAC_DECODER
if (EXTERNAL_SSE2(cpu_flags)) {
if (fmt == AV_SAMPLE_FMT_S16) {
if (channels == 2)
c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
else if (channels == 4)
c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
else if (channels == 6)
c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
else if (ARCH_X86_64 && channels == 8)
c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
} else if (fmt == AV_SAMPLE_FMT_S32) {
if (channels == 2)
c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
else if (channels == 4)
c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
else if (channels == 6)
c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
else if (ARCH_X86_64 && channels == 8)
c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
}
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->lpc32 = ff_flac_lpc_32_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
if (fmt == AV_SAMPLE_FMT_S16) {
if (ARCH_X86_64 && channels == 8)
c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
} else if (fmt == AV_SAMPLE_FMT_S32) {
if (channels == 4)
c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
else if (channels == 6)
c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
else if (ARCH_X86_64 && channels == 8)
c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
}
}
if (EXTERNAL_XOP(cpu_flags)) {
c->lpc32 = ff_flac_lpc_32_xop;
}
#endif
#if CONFIG_FLAC_ENCODER
if (EXTERNAL_SSE4(cpu_flags)) {
if (CONFIG_GPL)
c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
}
#endif
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,55 @@
/*
* Format Conversion Utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/fmtconvert.h"
#if HAVE_X86ASM
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
const float *mul, int len);
void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
const float *mul, int len);
#endif /* HAVE_X86ASM */
av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
}
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,49 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FPEL_H
#define AVCODEC_X86_FPEL_H
#include <stddef.h>
#include <stdint.h>
void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#endif /* AVCODEC_X86_FPEL_H */

View file

@ -0,0 +1,35 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/g722dsp.h"
void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags))
dsp->apply_qmf = ff_g722_apply_qmf_sse2;
}

View file

@ -0,0 +1,39 @@
/*
* Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h263dsp.h"
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
av_cold void ff_h263dsp_init_x86(H263DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
}
}

View file

@ -0,0 +1,208 @@
/*
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* H.264 / AVC / MPEG-4 part10 codec.
* non-SIMD x86-specific optimizations for H.264
* @author Michael Niedermayer <michaelni@gmx.at>
*/
#include <stddef.h>
#include "libavcodec/cabac.h"
#include "cabac.h"
#if HAVE_INLINE_ASM
#if ARCH_X86_64
#define REG64 "r"
#else
#define REG64 "m"
#endif
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
#if HAVE_7REGS && !BROKEN_COMPILER
#define decode_significance decode_significance_x86
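/* Decode the significant_coeff_flag run of a residual block with the
 * branchless CABAC reader kept in registers; the position of each significant
 * coefficient is stored into index[] and the number of stored entries is
 * returned. */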
static int decode_significance_x86(CABACContext *c, int max_coeff,
uint8_t *significant_coeff_ctx_base,
int *index, x86_reg last_off){
void *end= significant_coeff_ctx_base + max_coeff - 1;
int minusstart= -(intptr_t)significant_coeff_ctx_base;
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
__asm__ volatile(
"3: \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"FF_REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
"test $1, %4 \n\t"
" jnz 5f \n\t"
"add"FF_OPSIZE" $4, %2 \n\t"
"4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"FF_REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
"5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
: "%"FF_REG_c, "memory"
);
return coeff_count;
}
#define decode_significance_8x8 decode_significance_8x8_x86
static int decode_significance_8x8_x86(CABACContext *c,
uint8_t *significant_coeff_ctx_base,
int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
x86_reg last=0;
x86_reg state;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
: NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
);
#endif
__asm__ volatile(
"mov %1, %6 \n\t"
"3: \n\t"
"mov %10, %0 \n\t"
"movzb (%0, %6), %6 \n\t"
"add %9, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
"mov %1, %6 \n\t"
"test $1, %4 \n\t"
" jz 4f \n\t"
#ifdef BROKEN_RELOCATIONS
"movzb %c14(%15, %q6), %6\n\t"
#else
"movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
#endif
"add %11, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
"mov %2, %0 \n\t"
"mov %1, %6 \n\t"
"mov %k6, (%0) \n\t"
"test $1, %4 \n\t"
" jnz 5f \n\t"
"add"FF_OPSIZE" $4, %2 \n\t"
"4: \n\t"
"add $1, %6 \n\t"
"mov %6, %1 \n\t"
"cmp $63, %6 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"mov %k6, (%0) \n\t"
"5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
REG64(sig_off), REG64(last_coeff_ctx_base),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
: "%"FF_REG_c, "memory"
);
return coeff_count;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */

View file

@ -0,0 +1,410 @@
/*
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"
#define PRED4x4(TYPE, DEPTH, OPT) \
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
const uint8_t *topright, \
ptrdiff_t stride);
PRED4x4(dc, 10, mmxext)
PRED4x4(down_left, 10, sse2)
PRED4x4(down_left, 10, avx)
PRED4x4(down_right, 10, sse2)
PRED4x4(down_right, 10, ssse3)
PRED4x4(down_right, 10, avx)
PRED4x4(vertical_left, 10, sse2)
PRED4x4(vertical_left, 10, avx)
PRED4x4(vertical_right, 10, sse2)
PRED4x4(vertical_right, 10, ssse3)
PRED4x4(vertical_right, 10, avx)
PRED4x4(horizontal_up, 10, mmxext)
PRED4x4(horizontal_down, 10, sse2)
PRED4x4(horizontal_down, 10, ssse3)
PRED4x4(horizontal_down, 10, avx)
#define PRED8x8(TYPE, DEPTH, OPT) \
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
PRED8x8(dc, 10, mmxext)
PRED8x8(dc, 10, sse2)
PRED8x8(top_dc, 10, sse2)
PRED8x8(plane, 10, sse2)
PRED8x8(vertical, 10, sse2)
PRED8x8(horizontal, 10, sse2)
#define PRED8x8L(TYPE, DEPTH, OPT)\
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
int has_topleft, \
int has_topright, \
ptrdiff_t stride);
PRED8x8L(dc, 10, sse2)
PRED8x8L(dc, 10, avx)
PRED8x8L(128_dc, 10, mmxext)
PRED8x8L(128_dc, 10, sse2)
PRED8x8L(top_dc, 10, sse2)
PRED8x8L(top_dc, 10, avx)
PRED8x8L(vertical, 10, sse2)
PRED8x8L(vertical, 10, avx)
PRED8x8L(horizontal, 10, sse2)
PRED8x8L(horizontal, 10, ssse3)
PRED8x8L(horizontal, 10, avx)
PRED8x8L(down_left, 10, sse2)
PRED8x8L(down_left, 10, ssse3)
PRED8x8L(down_left, 10, avx)
PRED8x8L(down_right, 10, sse2)
PRED8x8L(down_right, 10, ssse3)
PRED8x8L(down_right, 10, avx)
PRED8x8L(vertical_right, 10, sse2)
PRED8x8L(vertical_right, 10, ssse3)
PRED8x8L(vertical_right, 10, avx)
PRED8x8L(horizontal_up, 10, sse2)
PRED8x8L(horizontal_up, 10, ssse3)
PRED8x8L(horizontal_up, 10, avx)
#define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
PRED16x16(dc, 10, mmxext)
PRED16x16(dc, 10, sse2)
PRED16x16(top_dc, 10, mmxext)
PRED16x16(top_dc, 10, sse2)
PRED16x16(128_dc, 10, mmxext)
PRED16x16(128_dc, 10, sse2)
PRED16x16(left_dc, 10, mmxext)
PRED16x16(left_dc, 10, sse2)
PRED16x16(vertical, 10, mmxext)
PRED16x16(vertical, 10, sse2)
PRED16x16(horizontal, 10, mmxext)
PRED16x16(horizontal, 10, sse2)
/* 8-bit versions */
PRED16x16(vertical, 8, mmx)
PRED16x16(vertical, 8, sse)
PRED16x16(horizontal, 8, mmx)
PRED16x16(horizontal, 8, mmxext)
PRED16x16(horizontal, 8, ssse3)
PRED16x16(dc, 8, mmxext)
PRED16x16(dc, 8, sse2)
PRED16x16(dc, 8, ssse3)
PRED16x16(plane_h264, 8, mmx)
PRED16x16(plane_h264, 8, mmxext)
PRED16x16(plane_h264, 8, sse2)
PRED16x16(plane_h264, 8, ssse3)
PRED16x16(plane_rv40, 8, mmx)
PRED16x16(plane_rv40, 8, mmxext)
PRED16x16(plane_rv40, 8, sse2)
PRED16x16(plane_rv40, 8, ssse3)
PRED16x16(plane_svq3, 8, mmx)
PRED16x16(plane_svq3, 8, mmxext)
PRED16x16(plane_svq3, 8, sse2)
PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
PRED8x8(dc, 8, mmxext)
PRED8x8(vertical, 8, mmx)
PRED8x8(horizontal, 8, mmx)
PRED8x8(horizontal, 8, mmxext)
PRED8x8(horizontal, 8, ssse3)
PRED8x8(plane, 8, mmx)
PRED8x8(plane, 8, mmxext)
PRED8x8(plane, 8, sse2)
PRED8x8(plane, 8, ssse3)
PRED8x8(tm_vp8, 8, mmx)
PRED8x8(tm_vp8, 8, mmxext)
PRED8x8(tm_vp8, 8, sse2)
PRED8x8(tm_vp8, 8, ssse3)
PRED8x8L(top_dc, 8, mmxext)
PRED8x8L(top_dc, 8, ssse3)
PRED8x8L(dc, 8, mmxext)
PRED8x8L(dc, 8, ssse3)
PRED8x8L(horizontal, 8, mmxext)
PRED8x8L(horizontal, 8, ssse3)
PRED8x8L(vertical, 8, mmxext)
PRED8x8L(vertical, 8, ssse3)
PRED8x8L(down_left, 8, mmxext)
PRED8x8L(down_left, 8, sse2)
PRED8x8L(down_left, 8, ssse3)
PRED8x8L(down_right, 8, mmxext)
PRED8x8L(down_right, 8, sse2)
PRED8x8L(down_right, 8, ssse3)
PRED8x8L(vertical_right, 8, mmxext)
PRED8x8L(vertical_right, 8, sse2)
PRED8x8L(vertical_right, 8, ssse3)
PRED8x8L(vertical_left, 8, sse2)
PRED8x8L(vertical_left, 8, ssse3)
PRED8x8L(horizontal_up, 8, mmxext)
PRED8x8L(horizontal_up, 8, ssse3)
PRED8x8L(horizontal_down, 8, mmxext)
PRED8x8L(horizontal_down, 8, sse2)
PRED8x8L(horizontal_down, 8, ssse3)
PRED4x4(dc, 8, mmxext)
PRED4x4(down_left, 8, mmxext)
PRED4x4(down_right, 8, mmxext)
PRED4x4(vertical_left, 8, mmxext)
PRED4x4(vertical_right, 8, mmxext)
PRED4x4(horizontal_up, 8, mmxext)
PRED4x4(horizontal_down, 8, mmxext)
PRED4x4(tm_vp8, 8, mmx)
PRED4x4(tm_vp8, 8, mmxext)
PRED4x4(tm_vp8, 8, ssse3)
PRED4x4(vertical_vp8, 8, mmxext)
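/* Intra-prediction dispatch: the 8-bit and 10-bit tables are filled in
 * separate branches; within each branch, later (wider) instruction-set blocks
 * overwrite the pointers installed by the earlier ones. */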
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx;
if (chroma_format_idc <= 1) {
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
}
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
if (codec_id == AV_CODEC_ID_SVQ3) {
if (cpu_flags & AV_CPU_FLAG_CMOV)
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
}
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
if (chroma_format_idc <= 1)
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext;
h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext;
h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext;
h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext;
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 ||
codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
}
if (codec_id != AV_CODEC_ID_RV40) {
h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
if (chroma_format_idc <= 1) {
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
}
}
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
} else {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
}
}
}
if (EXTERNAL_SSE(cpu_flags)) {
h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
}
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3;
if (chroma_format_idc <= 1)
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3;
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3;
} else {
if (chroma_format_idc <= 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
}
}
}
if(EXTERNAL_AVX2(cpu_flags)){
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
}
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
if (chroma_format_idc <= 1)
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2;
if (chroma_format_idc <= 1) {
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
}
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
}
}
}

View file

@ -0,0 +1,634 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
* Copyright (c) 2011 Daniel Kang
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dec.h"
#include "libavcodec/h264qpel.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#if HAVE_X86ASM
void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
#define ff_put_pixels4_mmxext ff_put_pixels4_mmx
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(const uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(const uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h);
DEF_QPEL(avg)
DEF_QPEL(put)
static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_mmxext(int16_t *tmp, const uint8_t *src, int tmpStride, int srcStride, int size)
{
int w = (size + 8) >> 2;
src -= 2 * srcStride + 2;
while (w--) {
ff_put_h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);
tmp += 4;
src += 4;
}
}
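/* The QPEL_H264 family of macros composes the C-visible lowpass helpers from
 * the assembly primitives declared above; the 16-wide variants are built from
 * two or four calls to the corresponding 8-wide routines. */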
#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
tmp += 4;\
src += 4;\
}\
tmp -= 3*4;\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
src += 4;\
dst += 4;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
tmp += 8;\
dst += 8;\
}while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\

#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\

void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
const uint8_t *src,
int tmpStride,
int srcStride,
int size)
{
int w = (size+8)>>3;
src -= 2*srcStride+2;
while(w--){
ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
tmp += 8;
src += 8;
}
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

QPEL_H264(put_, PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
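/* The mcXY suffix encodes the quarter-pel offset of the motion vector:
 * X is the horizontal and Y the vertical fraction in 1/4-pel units, so
 * mc00 is the full-pel copy, mc20 the horizontal half-pel case, mc22 the
 * centre (both half-pel) case, and so on. The H264_MC_* macros above
 * instantiate all 16 combinations per block size and CPU flavour. */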
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
#define LUMA_MC_816(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)
LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)
#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride){\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
src += 8*stride;\
dst += 8*stride;\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}
#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)
#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)
#if ARCH_X86_32 // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif
#endif /* HAVE_X86ASM */
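/* SET_QPEL_FUNCS fills all 16 entries of one pixels_tab row; the table is
 * indexed by x + 4*y, where (x, y) is the quarter-pel offset, matching the
 * mc00..mc33 ordering used above. */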
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
#define H264_QPEL_FUNCS(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
} while (0)
#define H264_QPEL_FUNCS_10(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
} while (0)
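/* Note on the init function below: the EXTERNAL_* blocks are checked in
 * ascending order of capability, so a later block (e.g. SSSE3) overwrites
 * pointers already set by an earlier one (e.g. MMXEXT/SSE2) and the best
 * available implementation wins. */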
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
{
#if HAVE_X86ASM
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (!high_bit_depth) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
} else if (bit_depth == 10) {
#if ARCH_X86_32
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
#endif
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
H264_QPEL_FUNCS(0, 3, sse2);
H264_QPEL_FUNCS(1, 1, sse2);
H264_QPEL_FUNCS(1, 2, sse2);
H264_QPEL_FUNCS(1, 3, sse2);
H264_QPEL_FUNCS(2, 1, sse2);
H264_QPEL_FUNCS(2, 2, sse2);
H264_QPEL_FUNCS(2, 3, sse2);
H264_QPEL_FUNCS(3, 1, sse2);
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
if (bit_depth == 10) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
}
}
if (EXTERNAL_SSE2_FAST(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 0, sse2);
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
H264_QPEL_FUNCS(1, 2, ssse3);
H264_QPEL_FUNCS(1, 3, ssse3);
H264_QPEL_FUNCS(2, 0, ssse3);
H264_QPEL_FUNCS(2, 1, ssse3);
H264_QPEL_FUNCS(2, 2, ssse3);
H264_QPEL_FUNCS(2, 3, ssse3);
H264_QPEL_FUNCS(3, 0, ssse3);
H264_QPEL_FUNCS(3, 1, ssse3);
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
}
}
if (EXTERNAL_AVX(cpu_flags)) {
/* AVX implies 64 byte cache lines without the need to avoid unaligned
* memory accesses that cross the boundary between two cache lines.
* TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
* having to treat SSE2 functions with such properties as AVX. */
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, sse2);
H264_QPEL_FUNCS_10(2, 0, sse2);
H264_QPEL_FUNCS_10(3, 0, sse2);
}
}
#endif
}

@ -0,0 +1,117 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264chroma.h"
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride, int h, int x, int y);
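/* x and y are the 1/8-pel chroma fractional offsets (0..7) and h the block
 * height; the *_pixels_tab arrays below are indexed by block width:
 * [0] = 8-, [1] = 4-, [2] = 2-pixel-wide blocks. */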
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
{
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
}
if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
}
if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
}
if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
// AVX implies !cache64.
// TODO: Port cache(32|64) detection from x264.
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
}
}

@ -0,0 +1,448 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"
/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int16_t *block, \
int stride);
IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 8, sse2)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 8, sse2)
IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
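/* The add16/add8/add16intra variants walk a group of 4x4 (or 8x8) sub-blocks:
 * block_offset gives the destination offset of each sub-block and the nnzc
 * (non-zero coefficient count) table lets sub-blocks with nothing coded be
 * skipped. */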
IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t **dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
/***********************************/
/* deblocking */
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
int8_t ref[2][40],
int16_t mv[2][40][2],
int bidir, int edges, int step,
int mask_mv0, int mask_mv1, int field);
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
ptrdiff_t stride, \
int alpha, \
int beta, \
int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
ptrdiff_t stride, \
int alpha, \
int beta);
#define LF_FUNCS(type, depth) \
LF_FUNC(h, chroma, depth, mmxext) \
LF_IFUNC(h, chroma_intra, depth, mmxext) \
LF_FUNC(h, chroma422, depth, mmxext) \
LF_IFUNC(h, chroma422_intra, depth, mmxext) \
LF_FUNC(v, chroma, depth, mmxext) \
LF_IFUNC(v, chroma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, mmxext) \
LF_IFUNC(h, luma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, sse2) \
LF_IFUNC(h, luma_intra, depth, sse2) \
LF_FUNC(v, luma, depth, sse2) \
LF_IFUNC(v, luma_intra, depth, sse2) \
LF_FUNC(h, chroma, depth, sse2) \
LF_IFUNC(h, chroma_intra, depth, sse2) \
LF_FUNC(h, chroma422, depth, sse2) \
LF_IFUNC(h, chroma422_intra, depth, sse2) \
LF_FUNC(v, chroma, depth, sse2) \
LF_IFUNC(v, chroma_intra, depth, sse2) \
LF_FUNC(h, luma, depth, avx) \
LF_IFUNC(h, luma_intra, depth, avx) \
LF_FUNC(v, luma, depth, avx) \
LF_IFUNC(v, luma_intra, depth, avx) \
LF_FUNC(h, chroma, depth, avx) \
LF_IFUNC(h, chroma_intra, depth, avx) \
LF_FUNC(h, chroma422, depth, avx) \
LF_IFUNC(h, chroma422_intra, depth, avx) \
LF_FUNC(v, chroma, depth, avx) \
LF_IFUNC(v, chroma_intra, depth, avx)
LF_FUNC(h, luma_mbaff, 8, sse2)
LF_FUNC(h, luma_mbaff, 8, avx)
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
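/* The mmxext kernel filters 8 pixels per call, so this wrapper splits the
 * 16-pixel luma edge into two halves; a half is skipped only when both of
 * its tc0 values are negative (the bitwise AND keeps the sign bit set only
 * if both are < 0). */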
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0)
{
if ((tc0[0] & tc0[1]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
if ((tc0[2] & tc0[3]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
int alpha, int beta)
{
ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
/***********************************/
/* weighted prediction */
#define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride, \
int height, int log2_denom, \
int weight, int offset);
#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride, int height, \
int log2_denom, int weightd, \
int weights, int offset);
#define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, mmxext) \
H264_BIWEIGHT(W, mmxext)
#define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, sse2) \
H264_BIWEIGHT(W, sse2) \
H264_BIWEIGHT(W, ssse3)
H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)
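/* weight_h264_pixels_tab[i] / biweight_h264_pixels_tab[i] are indexed by
 * block width: i = 0 -> 16, 1 -> 8, 2 -> 4 pixels wide (see the init
 * function below). */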
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
ptrdiff_t stride, \
int height, \
int log2_denom, \
int weight, \
int offset);
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
uint8_t *src, \
ptrdiff_t stride, \
int height, \
int log2_denom, \
int weightd, \
int weights, \
int offset);
#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10(W, DEPTH, sse2) \
H264_WEIGHT_10(W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, DEPTH, sse4)
H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8, 10)
H264_BIWEIGHT_10_SSE(4, 10)
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_8_mmx;
c->h264_idct8_dc_add =
c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#if ARCH_X86_64
c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
#endif
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_sse2;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
}
c->h264_idct_add = ff_h264_idct_add_8_sse2;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#if ARCH_X86_64
c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
#endif
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
}
c->h264_idct_add = ff_h264_idct_add_8_avx;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
}
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct_add = ff_h264_idct_add_10_sse2;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_10_sse2;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_10_avx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_10_avx;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_avx;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
}
}
#endif
}

@ -0,0 +1,259 @@
/*
* HEVC video decoder
*
* Copyright (C) 2012 - 2013 Guillaume Martres
* Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
*
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_HEVCDSP_H
#define AVCODEC_X86_HEVCDSP_H
#include <stddef.h>
#include <stdint.h>
#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
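/* PEL_LINK wires one SIMD kernel into the five HEVC MC dispatch tables at
 * once: plain (dst), bidirectional (_bi), unidirectional (_uni) and the
 * weighted variants (_uni_w, _bi_w). */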
#define PEL_PROTOTYPE(name, D, opt) \
void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
///////////////////////////////////////////////////////////////////////////////
// MC functions
///////////////////////////////////////////////////////////////////////////////
#define EPEL_PROTOTYPES(fname, bitd, opt) \
PEL_PROTOTYPE(fname##4, bitd, opt); \
PEL_PROTOTYPE(fname##6, bitd, opt); \
PEL_PROTOTYPE(fname##8, bitd, opt); \
PEL_PROTOTYPE(fname##12, bitd, opt); \
PEL_PROTOTYPE(fname##16, bitd, opt); \
PEL_PROTOTYPE(fname##24, bitd, opt); \
PEL_PROTOTYPE(fname##32, bitd, opt); \
PEL_PROTOTYPE(fname##48, bitd, opt); \
PEL_PROTOTYPE(fname##64, bitd, opt)
#define QPEL_PROTOTYPES(fname, bitd, opt) \
PEL_PROTOTYPE(fname##4, bitd, opt); \
PEL_PROTOTYPE(fname##8, bitd, opt); \
PEL_PROTOTYPE(fname##12, bitd, opt); \
PEL_PROTOTYPE(fname##16, bitd, opt); \
PEL_PROTOTYPE(fname##24, bitd, opt); \
PEL_PROTOTYPE(fname##32, bitd, opt); \
PEL_PROTOTYPE(fname##48, bitd, opt); \
PEL_PROTOTYPE(fname##64, bitd, opt)
#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)
#define WEIGHTING_PROTOTYPES(bitd, opt) \
WEIGHTING_PROTOTYPE(2, bitd, opt); \
WEIGHTING_PROTOTYPE(4, bitd, opt); \
WEIGHTING_PROTOTYPE(6, bitd, opt); \
WEIGHTING_PROTOTYPE(8, bitd, opt); \
WEIGHTING_PROTOTYPE(12, bitd, opt); \
WEIGHTING_PROTOTYPE(16, bitd, opt); \
WEIGHTING_PROTOTYPE(24, bitd, opt); \
WEIGHTING_PROTOTYPE(32, bitd, opt); \
WEIGHTING_PROTOTYPE(48, bitd, opt); \
WEIGHTING_PROTOTYPE(64, bitd, opt)
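/* The uni_w/bi_w helpers read the intermediate int16_t samples produced by
 * the MC kernels, apply the weight/offset (scaled by denom) and store the
 * clipped result; one prototype is emitted per supported block width. */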
///////////////////////////////////////////////////////////////////////////////
// QPEL_PIXELS EPEL_PIXELS
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(pel_pixels , 8, sse4);
EPEL_PROTOTYPES(pel_pixels , 10, sse4);
EPEL_PROTOTYPES(pel_pixels , 12, sse4);
void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
///////////////////////////////////////////////////////////////////////////////
// EPEL
///////////////////////////////////////////////////////////////////////////////
EPEL_PROTOTYPES(epel_h , 8, sse4);
EPEL_PROTOTYPES(epel_h , 10, sse4);
EPEL_PROTOTYPES(epel_h , 12, sse4);
EPEL_PROTOTYPES(epel_v , 8, sse4);
EPEL_PROTOTYPES(epel_v , 10, sse4);
EPEL_PROTOTYPES(epel_v , 12, sse4);
EPEL_PROTOTYPES(epel_hv , 8, sse4);
EPEL_PROTOTYPES(epel_hv , 10, sse4);
EPEL_PROTOTYPES(epel_hv , 12, sse4);
PEL_PROTOTYPE(epel_h16, 8, avx2);
PEL_PROTOTYPE(epel_h24, 8, avx2);
PEL_PROTOTYPE(epel_h32, 8, avx2);
PEL_PROTOTYPE(epel_h48, 8, avx2);
PEL_PROTOTYPE(epel_h64, 8, avx2);
PEL_PROTOTYPE(epel_h16,10, avx2);
PEL_PROTOTYPE(epel_h24,10, avx2);
PEL_PROTOTYPE(epel_h32,10, avx2);
PEL_PROTOTYPE(epel_h48,10, avx2);
PEL_PROTOTYPE(epel_h64,10, avx2);
PEL_PROTOTYPE(epel_v16, 8, avx2);
PEL_PROTOTYPE(epel_v24, 8, avx2);
PEL_PROTOTYPE(epel_v32, 8, avx2);
PEL_PROTOTYPE(epel_v48, 8, avx2);
PEL_PROTOTYPE(epel_v64, 8, avx2);
PEL_PROTOTYPE(epel_v16,10, avx2);
PEL_PROTOTYPE(epel_v24,10, avx2);
PEL_PROTOTYPE(epel_v32,10, avx2);
PEL_PROTOTYPE(epel_v48,10, avx2);
PEL_PROTOTYPE(epel_v64,10, avx2);
PEL_PROTOTYPE(epel_hv16, 8, avx2);
PEL_PROTOTYPE(epel_hv24, 8, avx2);
PEL_PROTOTYPE(epel_hv32, 8, avx2);
PEL_PROTOTYPE(epel_hv48, 8, avx2);
PEL_PROTOTYPE(epel_hv64, 8, avx2);
PEL_PROTOTYPE(epel_hv16,10, avx2);
PEL_PROTOTYPE(epel_hv24,10, avx2);
PEL_PROTOTYPE(epel_hv32,10, avx2);
PEL_PROTOTYPE(epel_hv48,10, avx2);
PEL_PROTOTYPE(epel_hv64,10, avx2);
///////////////////////////////////////////////////////////////////////////////
// QPEL
///////////////////////////////////////////////////////////////////////////////
QPEL_PROTOTYPES(qpel_h , 8, sse4);
QPEL_PROTOTYPES(qpel_h , 10, sse4);
QPEL_PROTOTYPES(qpel_h , 12, sse4);
QPEL_PROTOTYPES(qpel_v, 8, sse4);
QPEL_PROTOTYPES(qpel_v, 10, sse4);
QPEL_PROTOTYPES(qpel_v, 12, sse4);
QPEL_PROTOTYPES(qpel_hv, 8, sse4);
QPEL_PROTOTYPES(qpel_hv, 10, sse4);
QPEL_PROTOTYPES(qpel_hv, 12, sse4);
PEL_PROTOTYPE(qpel_h16, 8, avx2);
PEL_PROTOTYPE(qpel_h24, 8, avx2);
PEL_PROTOTYPE(qpel_h32, 8, avx2);
PEL_PROTOTYPE(qpel_h48, 8, avx2);
PEL_PROTOTYPE(qpel_h64, 8, avx2);
PEL_PROTOTYPE(qpel_h16,10, avx2);
PEL_PROTOTYPE(qpel_h24,10, avx2);
PEL_PROTOTYPE(qpel_h32,10, avx2);
PEL_PROTOTYPE(qpel_h48,10, avx2);
PEL_PROTOTYPE(qpel_h64,10, avx2);
PEL_PROTOTYPE(qpel_v16, 8, avx2);
PEL_PROTOTYPE(qpel_v24, 8, avx2);
PEL_PROTOTYPE(qpel_v32, 8, avx2);
PEL_PROTOTYPE(qpel_v48, 8, avx2);
PEL_PROTOTYPE(qpel_v64, 8, avx2);
PEL_PROTOTYPE(qpel_v16,10, avx2);
PEL_PROTOTYPE(qpel_v24,10, avx2);
PEL_PROTOTYPE(qpel_v32,10, avx2);
PEL_PROTOTYPE(qpel_v48,10, avx2);
PEL_PROTOTYPE(qpel_v64,10, avx2);
PEL_PROTOTYPE(qpel_hv16, 8, avx2);
PEL_PROTOTYPE(qpel_hv24, 8, avx2);
PEL_PROTOTYPE(qpel_hv32, 8, avx2);
PEL_PROTOTYPE(qpel_hv48, 8, avx2);
PEL_PROTOTYPE(qpel_hv64, 8, avx2);
PEL_PROTOTYPE(qpel_hv16,10, avx2);
PEL_PROTOTYPE(qpel_hv24,10, avx2);
PEL_PROTOTYPE(qpel_hv32,10, avx2);
PEL_PROTOTYPE(qpel_hv48,10, avx2);
PEL_PROTOTYPE(qpel_hv64,10, avx2);
WEIGHTING_PROTOTYPES(8, sse4);
WEIGHTING_PROTOTYPES(10, sse4);
WEIGHTING_PROTOTYPES(12, sse4);
///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////
void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
#endif // AVCODEC_X86_HEVCDSP_H

File diff suppressed because it is too large

@ -0,0 +1,57 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_HPELDSP_H
#define AVCODEC_X86_HPELDSP_H
#include <stddef.h>
#include <stdint.h>
#include "libavcodec/hpeldsp.h"
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);
#endif /* AVCODEC_X86_HPELDSP_H */

@ -0,0 +1,313 @@
/*
* SIMD-optimized halfpel functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "libavcodec/pixels.h"
#include "fpel.h"
#include "hpeldsp.h"
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx ff_avg_pixels16_mmx
#define avg_pixels8_xy2_mmx ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx ff_avg_pixels16_xy2_mmx
#define put_pixels8_mmx ff_put_pixels8_mmx
#define put_pixels16_mmx ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx ff_put_pixels16_xy2_mmx
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
#if HAVE_INLINE_ASM
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define STATIC static
#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef STATIC
#if HAVE_MMX
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#endif
/***********************************/
/* MMX rounding */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "hpeldsp_rnd_template.c"
#undef DEF
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define STATIC
#include "rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#if HAVE_MMX
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
#endif
#endif /* HAVE_INLINE_ASM */
#if HAVE_X86ASM
#define HPELDSP_AVG_PIXELS16(CPUEXT) \
CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)
#endif /* HAVE_X86ASM */
#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
if (HAVE_MMX_EXTERNAL) \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU;
#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
#else
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
} while (0)
#endif
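/* Half-pel dispatch layout: pixels_tab[size][dxy], where size 0 selects
 * 16x16 and size 1 8x8 blocks, and dxy 0..3 selects full-pel, horizontal
 * half-pel (x2), vertical half-pel (y2) and diagonal half-pel (xy2). */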
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
SET_HPEL_FUNCS(put, [0], 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
SET_HPEL_FUNCS(avg, [0], 16, mmx);
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
SET_HPEL_FUNCS(put, [1], 8, mmx);
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
if (HAVE_MMX_EXTERNAL) {
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
}
#if HAVE_MMX_INLINE
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
#endif
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
if (!(flags & AV_CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
}
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
{
#if HAVE_SSE2_EXTERNAL
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
{
#if HAVE_SSSE3_EXTERNAL
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3;
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3;
#endif
}
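/* As in the other DSP init functions, the checks below run from least to
 * most capable instruction set, so later assignments override earlier ones. */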
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
hpeldsp_init_mmx(c, flags);
if (EXTERNAL_AMD3DNOW(cpu_flags))
hpeldsp_init_3dnow(c, flags);
if (EXTERNAL_MMXEXT(cpu_flags))
hpeldsp_init_mmxext(c, flags);
if (EXTERNAL_SSE2_FAST(cpu_flags))
hpeldsp_init_sse2_fast(c, flags);
if (EXTERNAL_SSSE3(cpu_flags))
hpeldsp_init_ssse3(c, flags);
if (CONFIG_VP3_DECODER)
ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
}

@ -0,0 +1,202 @@
/*
* SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
// put_pixels
av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"),%%mm2\n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"),%%mm0\n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, (%2) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%2), %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, 8(%2) \n\t"
"add %3, %1 \n\t"
"add %3, %2 \n\t"
"subl $1, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:"memory");
}
av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"FF_REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm2, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"FF_REG_a", %1 \n\t"
"add %%"FF_REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
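/*
 * Editorial reference (not part of the original file): the MMX loops above
 * average each byte with its right neighbour (x2) or with the byte one line
 * below (y2), with rounding, via the PAVGBP macro. A scalar equivalent of the
 * put/x2 case is roughly the following; the function name is illustrative.
 */
static void put_pixels8_x2_rnd_sketch(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = (pixels[j] + pixels[j + 1] + 1) >> 1; /* rounded average */
        pixels += line_size;
        block  += line_size;
    }
}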

View file

@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp.h"
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags)
{
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (flags & AV_CODEC_FLAG_BITEXACT) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (flags & AV_CODEC_FLAG_BITEXACT) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
}
}
}

View file

@ -0,0 +1,61 @@
/*
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"
void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_int16_avx2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
intptr_t w, uint8_t *left);
void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
intptr_t w, uint8_t *left);
void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c, enum AVPixelFormat pix_fmt)
{
int cpu_flags = av_get_cpu_flags();
const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
c->add_int16 = ff_add_int16_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->add_int16 = ff_add_int16_sse2;
c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->add_int16 = ff_add_int16_avx2;
}
}
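/*
 * Editorial reference (not part of the original file): the add_int16 kernels
 * chosen above are understood to perform a masked element-wise addition,
 * roughly equivalent to this scalar loop; the name is illustrative.
 */
static void add_int16_sketch(uint16_t *dst, const uint16_t *src,
                             unsigned mask, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = (dst[i] + src[i]) & mask; /* mask limits values to the bit depth */
}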

View file

@ -0,0 +1,60 @@
/*
* SIMD-optimized HuffYUV encoding functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvencdsp.h"
void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w);
void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w);
void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w);
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w, int *left, int *left_top);
av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
{
av_unused int cpu_flags = av_get_cpu_flags();
const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->diff_int16 = ff_diff_int16_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->diff_int16 = ff_diff_int16_sse2;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->diff_int16 = ff_diff_int16_avx2;
}
}
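/*
 * Editorial reference (not part of the original file): diff_int16 is the
 * encoder-side counterpart of add_int16, a masked element-wise subtraction,
 * roughly as below; the name is illustrative.
 */
static void diff_int16_sketch(uint16_t *dst, const uint16_t *src1,
                              const uint16_t *src2, unsigned mask, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = (src1[i] - src2[i]) & mask;
}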

View file

@ -0,0 +1,39 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_IDCTDSP_H
#define AVCODEC_X86_IDCTDSP_H
#include <stddef.h>
#include <stdint.h>
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
#endif /* AVCODEC_X86_IDCTDSP_H */

View file

@ -0,0 +1,162 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idctdsp.h"
#include "simple_idct.h"
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
enum idct_permutation_type perm_type)
{
int i;
switch (perm_type) {
case FF_IDCT_PERM_SIMPLE:
for (i = 0; i < 64; i++)
idct_permutation[i] = simple_mmx_permutation[i];
return 1;
case FF_IDCT_PERM_SSE2:
for (i = 0; i < 64; i++)
idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
return 1;
}
return 0;
}
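/*
 * Editorial sketch (not part of the original file): the permutation filled in
 * above remaps coefficient positions to the order the SIMD IDCT expects. It is
 * presumably combined with a zigzag scan table roughly as below; the names
 * src_scantable and permutated are illustrative.
 */
static void permute_scantable_sketch(uint8_t *permutated,
                                     const uint8_t *src_scantable,
                                     const uint8_t *idct_permutation)
{
    for (int i = 0; i < 64; i++)
        permutated[i] = idct_permutation[src_scantable[i]];
}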
av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth &&
avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct_put = ff_simple_idct_put_mmx;
c->idct_add = ff_simple_idct_add_mmx;
c->idct = ff_simple_idct_mmx;
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
c->put_pixels_clamped = ff_put_pixels_clamped_sse2;
c->add_pixels_clamped = ff_add_pixels_clamped_sse2;
if (!high_bit_depth &&
avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
if (ARCH_X86_64 &&
!high_bit_depth &&
avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
avctx->idct_algo == FF_IDCT_SIMPLE)) {
c->idct = ff_simple_idct8_sse2;
c->idct_put = ff_simple_idct8_put_sse2;
c->idct_add = ff_simple_idct8_add_sse2;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
}
if (ARCH_X86_64 && avctx->lowres == 0) {
if (EXTERNAL_AVX(cpu_flags) &&
!high_bit_depth &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX ||
avctx->idct_algo == FF_IDCT_SIMPLE)) {
c->idct = ff_simple_idct8_avx;
c->idct_put = ff_simple_idct8_put_avx;
c->idct_add = ff_simple_idct8_add_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
if (avctx->bits_per_raw_sample == 10 &&
avctx->codec_id != AV_CODEC_ID_MPEG4 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLE)) {
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_simple_idct10_put_sse2;
c->idct_add = NULL;
c->idct = ff_simple_idct10_sse2;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->idct_put = ff_simple_idct10_put_avx;
c->idct_add = NULL;
c->idct = ff_simple_idct10_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
}
if (avctx->bits_per_raw_sample == 12 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_simple_idct12_put_sse2;
c->idct_add = NULL;
c->idct = ff_simple_idct12_sse2;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->idct_put = ff_simple_idct12_put_avx;
c->idct_add = NULL;
c->idct = ff_simple_idct12_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
}
}
}

View file

@ -0,0 +1,100 @@
/*
* inline assembly helper macros
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_INLINE_ASM_H
#define AVCODEC_X86_INLINE_ASM_H
#include "constants.h"
#define MOVQ_WONE(regd) \
__asm__ volatile ( \
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
"psrlw $15, %%" #regd ::)
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
#ifndef PIC
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
#else
// for shared libraries it is better to build this constant in registers than
// to reference it from memory (PIC access); pcmpeqd sets every bit, i.e. -1
#define MOVQ_WTWO(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"psllw $1, %%"#regd" \n\t"::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"por "#regb", "#regr" \n\t" \
"por "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
#endif /* AVCODEC_X86_INLINE_ASM_H */
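/*
 * Editorial reference (not part of the original header): the PAVGB and PAVGBP
 * macros above use the classic SWAR averaging identities, with regfe / mm6
 * holding 0xfefefefefefefefe so the carry bits shifted out of each byte lane
 * are discarded:
 *
 *   no rounding: (a + b) >> 1     == (a & b) + (((a ^ b) & 0xfe) >> 1)
 *   rounding:    (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1)
 *
 * A byte-wise C check of both identities:
 */
static inline int pavgb_identities_hold(void)
{
    for (unsigned a = 0; a < 256; a++)
        for (unsigned b = 0; b < 256; b++) {
            unsigned no_rnd = (a & b) + (((a ^ b) & 0xfe) >> 1);
            unsigned rnd    = (a | b) - (((a ^ b) & 0xfe) >> 1);
            if (no_rnd != ((a + b) >> 1) || rnd != ((a + b + 1) >> 1))
                return 0;
        }
    return 1;
}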

View file

@ -0,0 +1,60 @@
/*
* SIMD optimized JPEG 2000 DSP functions
* Copyright (c) 2015 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/jpeg2000dsp.h"
void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma3(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_fma4(void *src0, void *src1, void *src2, int csize);
void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
c->mct_decode[FF_DWT97] = ff_ict_float_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
c->mct_decode[FF_DWT97] = ff_ict_float_avx;
}
if (EXTERNAL_FMA4(cpu_flags)) {
c->mct_decode[FF_DWT97] = ff_ict_float_fma4;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
c->mct_decode[FF_DWT97] = ff_ict_float_fma3;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
}
}
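/*
 * Editorial sketch (not part of the original file): mct_decode[FF_DWT53] above
 * is the reversible (integer) colour transform. Assuming src0 carries the
 * luma-like component and src1/src2 the two chroma-like components, the
 * inverse RCT is roughly the following; the channel ordering is an assumption.
 */
static void rct_int_sketch(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
{
    for (int i = 0; i < csize; i++) {
        int32_t y = src0[i], u = src1[i], v = src2[i];
        int32_t g = y - ((u + v) >> 2);  /* green-like component */
        src0[i] = v + g;                 /* red-like             */
        src1[i] = g;
        src2[i] = u + g;                 /* blue-like            */
    }
}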

View file

@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lossless_audiodsp.h"
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
const int16_t *v3,
int order, int mul);
av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags))
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
if (EXTERNAL_SSE2(cpu_flags))
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (EXTERNAL_SSSE3(cpu_flags) &&
!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
#endif /* HAVE_X86ASM */
}
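/*
 * Editorial reference (not part of the original file): the kernels selected
 * above fuse a dot product with an in-place multiply-accumulate, roughly as in
 * this scalar loop; the name is illustrative.
 */
static int32_t scalarproduct_and_madd_int16_sketch(int16_t *v1, const int16_t *v2,
                                                   const int16_t *v3,
                                                   int order, int mul)
{
    int32_t res = 0;
    for (int i = 0; i < order; i++) {
        res   += v1[i] * v2[i]; /* scalar product term         */
        v1[i] += mul * v3[i];   /* in-place madd applied to v1 */
    }
    return res;
}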

View file

@ -0,0 +1,128 @@
/*
* Lossless video DSP utils
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/asm.h"
#include "../lossless_videodsp.h"
#include "libavutil/x86/cpu.h"
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_bytes_avx2(uint8_t *dst, uint8_t *src, ptrdiff_t w);
void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top);
void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top);
int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, ptrdiff_t w,
int *left, int *left_top)
{
x86_reg w2 = -w;
x86_reg x;
int l = *left & 0xff;
int tl = *left_top & 0xff;
int t;
__asm__ volatile (
"mov %7, %3 \n"
"1: \n"
"movzbl (%3, %4), %2 \n"
"mov %2, %k3 \n"
"sub %b1, %b3 \n"
"add %b0, %b3 \n"
"mov %2, %1 \n"
"cmp %0, %2 \n"
"cmovg %0, %2 \n"
"cmovg %1, %0 \n"
"cmp %k3, %0 \n"
"cmovg %k3, %0 \n"
"mov %7, %3 \n"
"cmp %2, %0 \n"
"cmovl %2, %0 \n"
"add (%6, %4), %b0 \n"
"mov %b0, (%5, %4) \n"
"inc %4 \n"
"jl 1b \n"
: "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
: "r"(dst + w), "r"(diff + w), "rm"(top + w)
);
*left = l;
*left_top = tl;
}
#endif
void ff_llviddsp_init_x86(LLVidDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->add_median_pred = add_median_pred_cmov;
#endif
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->add_bytes = ff_add_bytes_mmx;
}
if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
/* slower than cmov version on AMD */
if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
c->add_median_pred = ff_add_median_pred_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->add_bytes = ff_add_bytes_sse2;
c->add_median_pred = ff_add_median_pred_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->add_left_pred = ff_add_left_pred_ssse3;
c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
c->add_gradient_pred = ff_add_gradient_pred_ssse3;
}
if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->add_bytes = ff_add_bytes_avx2;
c->add_left_pred = ff_add_left_pred_unaligned_avx2;
c->add_gradient_pred = ff_add_gradient_pred_avx2;
}
}
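/*
 * Editorial reference (not part of the original file): both the cmov path and
 * the SIMD add_median_pred kernels above implement the HuffYUV-style median
 * predictor. The scalar logic is roughly the following; the name is
 * illustrative.
 */
static void add_median_pred_sketch(uint8_t *dst, const uint8_t *top,
                                   const uint8_t *diff, ptrdiff_t w,
                                   int *left, int *left_top)
{
    int l = *left, tl = *left_top;
    for (ptrdiff_t i = 0; i < w; i++) {
        int t  = top[i];
        int g  = (l + t - tl) & 0xFF;               /* gradient prediction */
        int lo = l < t ? l : t, hi = l < t ? t : l;
        int pred = g < lo ? lo : (g > hi ? hi : g); /* median of l, t, g   */
        l  = dst[i] = (diff[i] + pred) & 0xFF;
        tl = t;
    }
    *left     = l;
    *left_top = tl;
}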

View file

@ -0,0 +1,111 @@
/*
* SIMD-optimized lossless video encoding functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lossless_videoencdsp.h"
#include "libavcodec/mathops.h"
void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
intptr_t w);
void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
intptr_t w);
void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
intptr_t w);
void ff_sub_left_predict_avx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, ptrdiff_t width, int height);
#if HAVE_INLINE_ASM
static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
const uint8_t *src2, intptr_t w,
int *left, int *left_top)
{
x86_reg i = 0;
uint8_t l, lt;
__asm__ volatile (
"movq (%1, %0), %%mm0 \n\t" // LT
"psllq $8, %%mm0 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm1 \n\t" // T
"movq -1(%2, %0), %%mm2 \n\t" // L
"movq (%2, %0), %%mm3 \n\t" // X
"movq %%mm2, %%mm4 \n\t" // L
"psubb %%mm0, %%mm2 \n\t"
"paddb %%mm1, %%mm2 \n\t" // L + T - LT
"movq %%mm4, %%mm5 \n\t" // L
"pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
"pminub %%mm5, %%mm1 \n\t" // min(T, L)
"pminub %%mm2, %%mm4 \n\t"
"pmaxub %%mm1, %%mm4 \n\t"
"psubb %%mm4, %%mm3 \n\t" // dst - pred
"movq %%mm3, (%3, %0) \n\t"
"add $8, %0 \n\t"
"movq -1(%1, %0), %%mm0 \n\t" // LT
"cmp %4, %0 \n\t"
" jb 1b \n\t"
: "+r" (i)
: "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));
l = *left;
lt = *left_top;
dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);
*left_top = src1[w - 1];
*left = src2[w - 1];
}
#endif /* HAVE_INLINE_ASM */
av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->diff_bytes = ff_diff_bytes_mmx;
}
#if HAVE_INLINE_ASM
if (INLINE_MMXEXT(cpu_flags)) {
c->sub_median_pred = sub_median_pred_mmxext;
}
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_SSE2(cpu_flags)) {
c->diff_bytes = ff_diff_bytes_sse2;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->sub_left_predict = ff_sub_left_predict_avx;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->diff_bytes = ff_diff_bytes_avx2;
}
}

View file

@ -0,0 +1,162 @@
/*
* SIMD-optimized LPC functions
* Copyright (c) 2007 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lpc.h"
DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 };
#if HAVE_SSE2_INLINE
static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
double *w_data)
{
double c = 2.0 / (len-1.0);
int n2 = len>>1;
x86_reg i = -n2*sizeof(int32_t);
x86_reg j = n2*sizeof(int32_t);
__asm__ volatile(
"movsd %4, %%xmm7 \n\t"
"movapd "MANGLE(pd_1)", %%xmm6 \n\t"
"movapd "MANGLE(pd_2)", %%xmm5 \n\t"
"movlhps %%xmm7, %%xmm7 \n\t"
"subpd %%xmm5, %%xmm7 \n\t"
"addsd %%xmm6, %%xmm7 \n\t"
"test $1, %5 \n\t"
"jz 2f \n\t"
#define WELCH(MOVPD, offset)\
"1: \n\t"\
"movapd %%xmm7, %%xmm1 \n\t"\
"mulpd %%xmm1, %%xmm1 \n\t"\
"movapd %%xmm6, %%xmm0 \n\t"\
"subpd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
"cvtpi2pd (%3,%0), %%xmm2 \n\t"\
"cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
"mulpd %%xmm0, %%xmm2 \n\t"\
"mulpd %%xmm1, %%xmm3 \n\t"\
"movapd %%xmm2, (%2,%0,2) \n\t"\
MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
"subpd %%xmm5, %%xmm7 \n\t"\
"sub $8, %1 \n\t"\
"add $8, %0 \n\t"\
"jl 1b \n\t"\

WELCH("movupd", -1)
"jmp 3f \n\t"
"2: \n\t"
WELCH("movapd", -2)
"3: \n\t"
:"+&r"(i), "+&r"(j)
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm5", "%xmm6", "%xmm7")
);
#undef WELCH
}
static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
double *autoc)
{
int j;
if((x86_reg)data & 15)
data++;
for(j=0; j<lag; j+=2){
x86_reg i = -len*sizeof(double);
if(j == lag-2) {
__asm__ volatile(
"movsd "MANGLE(pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(pd_1)", %%xmm1 \n\t"
"movsd "MANGLE(pd_1)", %%xmm2 \n\t"
"1: \n\t"
"movapd (%2,%0), %%xmm3 \n\t"
"movupd -8(%3,%0), %%xmm4 \n\t"
"movapd (%3,%0), %%xmm5 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm5 \n\t"
"mulpd -16(%3,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm5, %%xmm0 \n\t"
"addpd %%xmm3, %%xmm2 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"movhlps %%xmm2, %%xmm5 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"addsd %%xmm5, %%xmm2 \n\t"
"movsd %%xmm0, (%1) \n\t"
"movsd %%xmm1, 8(%1) \n\t"
"movsd %%xmm2, 16(%1) \n\t"
:"+&r"(i)
:"r"(autoc+j), "r"(data+len), "r"(data+len-j)
NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
:"memory"
);
} else {
__asm__ volatile(
"movsd "MANGLE(pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(pd_1)", %%xmm1 \n\t"
"1: \n\t"
"movapd (%3,%0), %%xmm3 \n\t"
"movupd -8(%4,%0), %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd (%4,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm3, %%xmm0 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"movsd %%xmm0, %1 \n\t"
"movsd %%xmm1, %2 \n\t"
:"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
:"r"(data+len), "r"(data+len-j)
NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
);
}
}
}
#endif /* HAVE_SSE2_INLINE */
av_cold void ff_lpc_init_x86(LPCContext *c)
{
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_SSE2(cpu_flags) || INLINE_SSE2_SLOW(cpu_flags)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
#endif /* HAVE_SSE2_INLINE */
}
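/*
 * Editorial sketch (not part of the original file): the SSE2 code above first
 * weights the samples with a Welch window, w(n) = 1 - ((n - (len-1)/2) *
 * 2/(len-1))^2, and then accumulates autocorrelation sums two lags at a time.
 * A scalar view of the autocorrelation step is roughly the following; the
 * accumulator starting at 1.0 mirrors the pd_1 constant loaded by the asm.
 */
static void lpc_compute_autocorr_sketch(const double *data, int len, int lag,
                                        double *autoc)
{
    for (int j = 0; j < lag; j++) {
        double sum = 1.0;                 /* small bias, as in the asm */
        for (int i = j; i < len; i++)
            sum += data[i] * data[i - j]; /* product at lag j          */
        autoc[j] = sum;
    }
}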

View file

@ -0,0 +1,133 @@
/*
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_MATHOPS_H
#define AVCODEC_X86_MATHOPS_H
#include "config.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#if HAVE_INLINE_ASM
#if ARCH_X86_32
#define MULL MULL
static av_always_inline av_const int MULL(int a, int b, unsigned shift)
{
int rt, dummy;
__asm__ (
"imull %3 \n\t"
"shrdl %4, %%edx, %%eax \n\t"
:"=a"(rt), "=d"(dummy)
:"a"(a), "rm"(b), "ci"((uint8_t)shift)
);
return rt;
}
#define MULH MULH
static av_always_inline av_const int MULH(int a, int b)
{
int rt, dummy;
__asm__ (
"imull %3"
:"=d"(rt), "=a"(dummy)
:"a"(a), "rm"(b)
);
return rt;
}
#define MUL64 MUL64
static av_always_inline av_const int64_t MUL64(int a, int b)
{
int64_t rt;
__asm__ (
"imull %2"
:"=A"(rt)
:"a"(a), "rm"(b)
);
return rt;
}
#endif /* ARCH_X86_32 */
#if HAVE_I686
/* median of 3 */
#define mid_pred mid_pred
static inline av_const int mid_pred(int a, int b, int c)
{
int i=b;
__asm__ (
"cmp %2, %1 \n\t"
"cmovg %1, %0 \n\t"
"cmovg %2, %1 \n\t"
"cmp %3, %1 \n\t"
"cmovl %3, %1 \n\t"
"cmp %1, %0 \n\t"
"cmovg %1, %0 \n\t"
:"+&r"(i), "+&r"(a)
:"r"(b), "r"(c)
);
return i;
}
#if HAVE_6REGS
#define COPY3_IF_LT(x, y, a, b, c, d)\
__asm__ volatile(\
"cmpl %0, %3 \n\t"\
"cmovl %3, %0 \n\t"\
"cmovl %4, %1 \n\t"\
"cmovl %5, %2 \n\t"\
: "+&r" (x), "+&r" (a), "+r" (c)\
: "r" (y), "r" (b), "r" (d)\
);
#endif /* HAVE_6REGS */
#endif /* HAVE_I686 */
#define MASK_ABS(mask, level) \
__asm__ ("cdq \n\t" \
"xorl %1, %0 \n\t" \
"subl %1, %0 \n\t" \
: "+a"(level), "=&d"(mask))
// x86 shifts mask the count to 5 bits, so -s behaves like 32-s and the
// explicit "32 - s" addition can be avoided (gcc should do that ...)
#define NEG_SSR32 NEG_SSR32
static inline int32_t NEG_SSR32( int32_t a, int8_t s){
__asm__ ("sarl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}
#define NEG_USR32 NEG_USR32
static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
__asm__ ("shrl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_MATHOPS_H */
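/*
 * Editorial note (not part of the original header): the x86-32 inline-asm
 * MULL/MULH/MUL64 above are specializations of the portable forms, which take
 * a full 32x32->64-bit product and keep the requested part, roughly:
 *
 *   MULL(a, b, s)  ==  (int)(((int64_t)(a) * (b)) >> (s))
 *   MULH(a, b)     ==  (int)(((int64_t)(a) * (b)) >> 32)
 *   MUL64(a, b)    ==  (int64_t)(a) * (b)
 */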

View file

@ -0,0 +1,221 @@
;******************************************************************************
;* SIMD optimized non-power-of-two MDCT functions
;*
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
sign_adjust_r: times 4 dd 0x80000000, 0x00000000
sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
SECTION .text
%if ARCH_X86_64
;*****************************************************************************************
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
;*****************************************************************************************
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
VBROADCASTSD m0, [inq + %1] ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
movsd xm1, [inq + 1*16 + 8 + %1] ; in[ 3].re, in[ 3].im, 0, 0
movsd xm4, [inq + 6*16 + 0 + %1] ; in[12].re, in[12].im, 0, 0
movhps xm1, [inq + 3*16 + 0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
movhps xm4, [inq + 4*16 + 8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
subps xm2, xm1, xm4 ; t[2].im, t[2].re, t[3].im, t[3].re
addps xm1, xm4 ; t[0].re, t[0].im, t[1].re, t[1].im
movhlps %2, xm1 ; t[0].re, t[1].re, t[0].im, t[1].im
addps %2, xm1
addps %2, xm0 ; DC[0].re, DC[0].im, junk...
movlhps %2, %2 ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
shufps xm3, xm1, xm2, q0110 ; t[0].re, t[0].im, t[2].re, t[2].im
shufps xm1, xm2, q2332 ; t[1].re, t[1].im, t[3].re, t[3].im
mulps xm%3, xm1, xm5
mulps xm4, xm3, xm6
mulps xm1, xm6
xorps xm1, xm7
mulps xm3, xm5
addsubps xm3, xm1 ; t[0].re, t[0].im, t[2].re, t[2].im
subps xm%3, xm4 ; t[4].re, t[4].im, t[5].re, t[5].im
movhlps xm2, xm%3, xm3 ; t[2].re, t[2].im, t[5].re, t[5].im
movlhps xm3, xm%3 ; t[0].re, t[0].im, t[4].re, t[4].im
xorps xm2, xm7
addps xm%3, xm2, xm3
subps xm3, xm2
shufps xm3, xm3, q1032
vinsertf128 m%3, m%3, xm3, 1 ; All ACs (tmp[1] through to tmp[4])
addps m%3, m%3, m0 ; Finally offset with DCs
%endmacro
%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
mulps xm0, xm9, [exptabq + %1 + 16*0]
mulps xm1, xm10, [exptabq + %1 + 16*1]
haddps xm0, xm1
movhlps xm1, xm0 ; t[0].re, t[1].re, t[0].im, t[1].im
addps xm0, xm1
addps xm0, xm8
movsd [outq], xm0
%endmacro
%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
mulps m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
addps m0, m0, m2
addps m1, m1, m3
addps m0, m0, m11
shufps m1, m1, m1, q2301
addps m0, m0, m1
vextractf128 xm1, m0, 1
movlps [outq + strideq*1], xm0
movhps [outq + strideq*2], xm0
movlps [outq + stride3q], xm1
movhps [outq + strideq*4], xm1
%endmacro
INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
shl strideq, 3
movaps xm5, [exptabq + 480 + 16*0]
movaps xm6, [exptabq + 480 + 16*1]
movaps xm7, [sign_adjust_5]
FFT5 0, xm8, 11
FFT5 8, xm9, 12
FFT5 16, xm10, 13
%define stride3q inq
lea stride3q, [strideq + strideq*2]
lea stride5q, [strideq + strideq*4]
BUTTERFLIES_DC (8*6 + 4*0)*2*4
BUTTERFLIES_AC (8*0 + 0*0)*2*4
add outq, stride5q
BUTTERFLIES_DC (8*6 + 4*1)*2*4
BUTTERFLIES_AC (8*2 + 0*0)*2*4
add outq, stride5q
BUTTERFLIES_DC (8*6 + 4*2)*2*4
BUTTERFLIES_AC (8*4 + 0*0)*2*4
RET
%endif ; ARCH_X86_64
;*******************************************************************************************************
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
;*******************************************************************************************************
%macro LUT_LOAD_4D 3
mov r4d, [lutq + %3q*4 + 0]
movsd xmm%1, [inq + r4q*8]
mov r4d, [lutq + %3q*4 + 4]
movhps xmm%1, [inq + r4q*8]
%if cpuflag(avx2)
mov r4d, [lutq + %3q*4 + 8]
movsd %2, [inq + r4q*8]
mov r4d, [lutq + %3q*4 + 12]
movhps %2, [inq + r4q*8]
vinsertf128 %1, %1, %2, 1
%endif
%endmacro
%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n
xor offset_nq, offset_nq
lea offset_pq, [len8q*2 - %1]
movaps m7, [sign_adjust_r]
%if cpuflag(avx2)
movaps m8, [perm_pos]
movaps m9, [perm_neg]
%endif
.loop:
movups m0, [expq + offset_pq*8] ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
movups m1, [expq + offset_nq*8] ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im
LUT_LOAD_4D m3, xm4, offset_p ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
LUT_LOAD_4D m4, xm5, offset_n ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im
mulps m5, m3, m0 ; in[p].reim * exp[p].reim
mulps m6, m4, m1 ; in[n].reim * exp[n].reim
xorps m5, m7 ; in[p].re *= -1, in[p].im *= 1
xorps m6, m7 ; in[n].re *= -1, in[n].im *= 1
shufps m3, m3, m3, q2301 ; in[p].imre
shufps m4, m4, m4, q2301 ; in[n].imre
mulps m3, m0 ; in[p].imre * exp[p].reim
mulps m4, m1 ; in[n].imre * exp[n].reim
haddps m3, m6 ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
haddps m5, m4 ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im
%if cpuflag(avx2)
vpermps m3, m9, m3 ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
vpermps m5, m8, m5 ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
%else
shufps m3, m3, m3, q0312
shufps m5, m5, m5, q2130
%endif
movups [outq + offset_nq*8], m3
movups [outq + offset_pq*8], m5
sub offset_pq, %1
add offset_nq, %1
cmp offset_nq, offset_pq
jle .loop
REP_RET
%endmacro
INIT_XMM sse3
POSTROTATE_FN 2
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif

View file

@ -0,0 +1,99 @@
/*
* SIMD optimized non-power-of-two MDCT functions
*
* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mdct15.h"
void ff_mdct15_postreindex_sse3(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
void ff_mdct15_postreindex_avx2(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
static void perm_twiddles(MDCT15Context *s)
{
int k;
FFTComplex tmp[30];
/* 5-point FFT twiddles */
s->exptab[60].re = s->exptab[60].im = s->exptab[19].re;
s->exptab[61].re = s->exptab[61].im = s->exptab[19].im;
s->exptab[62].re = s->exptab[62].im = s->exptab[20].re;
s->exptab[63].re = s->exptab[63].im = s->exptab[20].im;
/* 15-point FFT twiddles */
for (k = 0; k < 5; k++) {
tmp[6*k + 0] = s->exptab[k + 0];
tmp[6*k + 2] = s->exptab[k + 5];
tmp[6*k + 4] = s->exptab[k + 10];
tmp[6*k + 1] = s->exptab[2 * (k + 0)];
tmp[6*k + 3] = s->exptab[2 * (k + 5)];
tmp[6*k + 5] = s->exptab[2 * k + 5 ];
}
for (k = 0; k < 6; k++) {
FFTComplex ac_exp[] = {
{ tmp[6*1 + k].re, tmp[6*1 + k].re },
{ tmp[6*2 + k].re, tmp[6*2 + k].re },
{ tmp[6*3 + k].re, tmp[6*3 + k].re },
{ tmp[6*4 + k].re, tmp[6*4 + k].re },
{ tmp[6*1 + k].im, -tmp[6*1 + k].im },
{ tmp[6*2 + k].im, -tmp[6*2 + k].im },
{ tmp[6*3 + k].im, -tmp[6*3 + k].im },
{ tmp[6*4 + k].im, -tmp[6*4 + k].im },
};
memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex));
}
/* Special case when k = 0 */
for (k = 0; k < 3; k++) {
FFTComplex dc_exp[] = {
{ tmp[2*k + 0].re, -tmp[2*k + 0].im },
{ tmp[2*k + 0].im, tmp[2*k + 0].re },
{ tmp[2*k + 1].re, -tmp[2*k + 1].im },
{ tmp[2*k + 1].im, tmp[2*k + 1].re },
};
memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex));
}
}
av_cold void ff_mdct15_init_x86(MDCT15Context *s)
{
int adjust_twiddles = 0;
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE3(cpu_flags))
s->postreindex = ff_mdct15_postreindex_sse3;
if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
s->fft15 = ff_fft15_avx;
adjust_twiddles = 1;
}
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))
s->postreindex = ff_mdct15_postreindex_avx2;
if (adjust_twiddles)
perm_twiddles(s);
}

View file

@ -0,0 +1,651 @@
/*
* SIMD-optimized motion estimation
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
uint8_t *src2, ptrdiff_t stride, int h); \
int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
uint8_t *src2, ptrdiff_t stride, int h);
hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
#if HAVE_X86ASM
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int score1, score2;
if (c)
score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
else
score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
- ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
else
return score1 + FFABS(score2) * 8;
}
static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
ff_hf_noise8_mmx(pix2, stride, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
else
return score1 + FFABS(score2) * 8;
}
#endif /* HAVE_X86ASM */
#if HAVE_INLINE_ASM
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
ptrdiff_t stride, int h)
{
int tmp;
av_assert2((((int) pix) & 7) == 0);
av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
"movq 8(%0), %%mm3\n" \
"add %2,%0\n" \
"movq %%mm2, " #out0 "\n" \
"movq %%mm3, " #out1 "\n" \
"psubusb " #in0 ", %%mm2\n" \
"psubusb " #in1 ", %%mm3\n" \
"psubusb " #out0 ", " #in0 "\n" \
"psubusb " #out1 ", " #in1 "\n" \
"por %%mm2, " #in0 "\n" \
"por %%mm3, " #in1 "\n" \
"movq " #in0 ", %%mm2\n" \
"movq " #in1 ", %%mm3\n" \
"punpcklbw %%mm7, " #in0 "\n" \
"punpcklbw %%mm7, " #in1 "\n" \
"punpckhbw %%mm7, %%mm2\n" \
"punpckhbw %%mm7, %%mm3\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw %%mm3, %%mm2\n" \
"paddw %%mm2, " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
__asm__ volatile (
"movl %3, %%ecx\n"
"pxor %%mm6, %%mm6\n"
"pxor %%mm7, %%mm7\n"
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"add %2, %0\n"
"jmp 2f\n"
"1:\n"
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"subl $2, %%ecx\n"
"jnz 1b\n"
"movq %%mm6, %%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6, %%mm0\n"
"movq %%mm0, %%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6, %%mm0\n"
"movd %%mm0, %1\n"
: "+r" (pix), "=r" (tmp)
: "r" (stride), "m" (h)
: "%ecx");
return tmp & 0xFFFF;
}
#undef SUM
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int tmp;
av_assert2((((int) pix1) & 7) == 0);
av_assert2((((int) pix2) & 7) == 0);
av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
"movq (%1), " #out0 "\n" \
"movq 8(%0), %%mm3\n" \
"movq 8(%1), " #out1 "\n" \
"add %3, %0\n" \
"add %3, %1\n" \
"psubb " #out0 ", %%mm2\n" \
"psubb " #out1 ", %%mm3\n" \
"pxor %%mm7, %%mm2\n" \
"pxor %%mm7, %%mm3\n" \
"movq %%mm2, " #out0 "\n" \
"movq %%mm3, " #out1 "\n" \
"psubusb " #in0 ", %%mm2\n" \
"psubusb " #in1 ", %%mm3\n" \
"psubusb " #out0 ", " #in0 "\n" \
"psubusb " #out1 ", " #in1 "\n" \
"por %%mm2, " #in0 "\n" \
"por %%mm3, " #in1 "\n" \
"movq " #in0 ", %%mm2\n" \
"movq " #in1 ", %%mm3\n" \
"punpcklbw %%mm7, " #in0 "\n" \
"punpcklbw %%mm7, " #in1 "\n" \
"punpckhbw %%mm7, %%mm2\n" \
"punpckhbw %%mm7, %%mm3\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw %%mm3, %%mm2\n" \
"paddw %%mm2, " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
__asm__ volatile (
"movl %4, %%ecx\n"
"pxor %%mm6, %%mm6\n"
"pcmpeqw %%mm7, %%mm7\n"
"psllw $15, %%mm7\n"
"packsswb %%mm7, %%mm7\n"
"movq (%0), %%mm0\n"
"movq (%1), %%mm2\n"
"movq 8(%0), %%mm1\n"
"movq 8(%1), %%mm3\n"
"add %3, %0\n"
"add %3, %1\n"
"psubb %%mm2, %%mm0\n"
"psubb %%mm3, %%mm1\n"
"pxor %%mm7, %%mm0\n"
"pxor %%mm7, %%mm1\n"
"jmp 2f\n"
"1:\n"
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"subl $2, %%ecx\n"
"jnz 1b\n"
"movq %%mm6, %%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6, %%mm0\n"
"movq %%mm0, %%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6, %%mm0\n"
"movd %%mm0, %2\n"
: "+r" (pix1), "+r" (pix2), "=r" (tmp)
: "r" (stride), "m" (h)
: "%ecx");
return tmp & 0x7FFF;
}
#undef SUM
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
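/* Plain MMX SAD helpers. Each sad8_* routine accumulates 16-bit partial sums
 * in mm6 (mm7 must be zero on entry); round_tab supplies the rounding term
 * for the averaging variants, and sum_mmx() folds mm6 into the scalar
 * result. */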
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
"add %3, %%"FF_REG_a" \n\t"
"psubusb %%mm0, %%mm2 \n\t"
"psubusb %%mm4, %%mm0 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %3, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
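/* Half-pel SAD: averages blk1a and blk1b (the two neighbouring reference
 * positions) with the rounding constant preloaded into mm5, then takes the
 * SAD of that average against blk2. Used for the x2 and y2 half-pixel
 * positions. */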
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"psrlw $1, %%mm1 \n\t"
"psrlw $1, %%mm3 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm2, %%mm1 \n\t"
"por %%mm4, %%mm1 \n\t"
"movq %%mm1, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %4, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
"r" (stride));
}
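/* Quarter-position (xy half-pel) SAD: averages four neighbouring reference
 * pixels (blk1, blk1+1 and the same pair one row below) with round_tab[2],
 * shifts by 2, and takes the SAD against blk2. The previous row's partial
 * sums are carried in mm0/mm1 so each row is only loaded once. */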
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
"movq (%1, %%"FF_REG_a"), %%mm0\n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%2, %%"FF_REG_a"), %%mm2\n\t"
"movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"movq %5, %%mm5 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm5, %%mm0 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"psubusb %%mm0, %%mm4 \n\t"
"psubusb %%mm5, %%mm0 \n\t"
"por %%mm4, %%mm0 \n\t"
"movq %%mm0, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm3, %%mm1 \n\t"
"add %4, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
"r" (stride), "m" (round_tab[2]));
}
static inline int sum_mmx(void)
{
int ret;
__asm__ volatile (
"movq %%mm6, %%mm0 \n\t"
"psrlq $32, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psrlq $16, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
: "=r" (ret));
return ret & 0xFFFF;
}
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
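/* PIX_SAD expands to the full set of me_cmp entry points (sad8/sad16 at the
 * integer, x2, y2 and xy2 positions) built from the helpers above: zero the
 * mm6/mm7 accumulator registers, optionally load the rounding constant into
 * mm5, run the 8-pixel helper once or twice, then read back the sum. */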
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
:); \
\
sad8_1_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
::); \
\
sad8_4_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
:); \
\
sad8_1_ ## suf(blk1, blk2, stride, h); \
sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_x2a_ ## suf(blk1, blk2, stride, h); \
sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_y2a_ ## suf(blk1, blk2, stride, h); \
sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
::); \
\
sad8_4_ ## suf(blk1, blk2, stride, h); \
sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
PIX_SAD(mmx)
#endif /* HAVE_INLINE_ASM */

av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
c->pix_abs[0][0] = sad16_mmx;
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
c->pix_abs[0][3] = sad16_xy2_mmx;
c->pix_abs[1][0] = sad8_mmx;
c->pix_abs[1][1] = sad8_x2_mmx;
c->pix_abs[1][2] = sad8_y2_mmx;
c->pix_abs[1][3] = sad8_xy2_mmx;
c->sad[0] = sad16_mmx;
c->sad[1] = sad8_mmx;
c->vsad[4] = vsad_intra16_mmx;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->vsad[0] = vsad16_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_MMX(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
c->sse[0] = ff_sse16_mmx;
c->sse[1] = ff_sse8_mmx;
#if HAVE_X86ASM
c->nsse[0] = nsse16_mmx;
c->nsse[1] = nsse8_mmx;
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
c->sad[0] = ff_sad16_mmxext;
c->sad[1] = ff_sad8_mmxext;
c->pix_abs[0][0] = ff_sad16_mmxext;
c->pix_abs[0][1] = ff_sad16_x2_mmxext;
c->pix_abs[0][2] = ff_sad16_y2_mmxext;
c->pix_abs[1][0] = ff_sad8_mmxext;
c->pix_abs[1][1] = ff_sad8_x2_mmxext;
c->pix_abs[1][2] = ff_sad8_y2_mmxext;
c->vsad[4] = ff_vsad_intra16_mmxext;
c->vsad[5] = ff_vsad_intra8_mmxext;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
c->vsad[0] = ff_vsad16_approx_mmxext;
c->vsad[1] = ff_vsad8_approx_mmxext;
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->sse[0] = ff_sse16_sse2;
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0] = ff_sad16_sse2;
c->pix_abs[0][0] = ff_sad16_sse2;
c->pix_abs[0][1] = ff_sad16_x2_sse2;
c->pix_abs[0][2] = ff_sad16_y2_sse2;
c->vsad[4] = ff_vsad_intra16_sse2;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
c->vsad[0] = ff_vsad16_approx_sse2;
}
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
}
}

View file

@ -0,0 +1,204 @@
/*
* MLP DSP functions x86-optimized
* Copyright (c) 2009 Ramiro Polla
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mlpdsp.h"
#include "libavcodec/mlp.h"
#define REMATRIX_CHANNEL_FUNC(opt) \
void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
const int32_t *coeffs, \
const uint8_t *bypassed_lsbs, \
const int8_t *noise_buffer, \
int index, \
unsigned int dest_ch, \
uint16_t blockpos, \
unsigned int maxchan, \
int matrix_noise_shift, \
int access_unit_size_pow2, \
int32_t mask);
REMATRIX_CHANNEL_FUNC(sse4)
REMATRIX_CHANNEL_FUNC(avx2_bmi2)
#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
extern char ff_mlp_firorder_8;
extern char ff_mlp_firorder_7;
extern char ff_mlp_firorder_6;
extern char ff_mlp_firorder_5;
extern char ff_mlp_firorder_4;
extern char ff_mlp_firorder_3;
extern char ff_mlp_firorder_2;
extern char ff_mlp_firorder_1;
extern char ff_mlp_firorder_0;
extern char ff_mlp_iirorder_4;
extern char ff_mlp_iirorder_3;
extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_0;
static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
&ff_mlp_firorder_2, &ff_mlp_firorder_3,
&ff_mlp_firorder_4, &ff_mlp_firorder_5,
&ff_mlp_firorder_6, &ff_mlp_firorder_7,
&ff_mlp_firorder_8 };
static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
&ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
&ff_mlp_iirorder_4 };
#if ARCH_X86_64
#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"movslq "offset"+"offs"(%0), %%rax\n\t" \
"movslq "offset"+"offc"(%1), %%rdx\n\t" \
"imul %%rdx, %%rax\n\t" \
"add %%rax, %%rsi\n\t"
#define FIRMULREG(label, offset, firc)\
LABEL_MANGLE(label)": \n\t" \
"movslq "#offset"(%0), %%rax\n\t" \
"imul %"#firc", %%rax\n\t" \
"add %%rax, %%rsi\n\t"
#define CLEAR_ACCUM \
"xor %%rsi, %%rsi\n\t"
#define SHIFT_ACCUM \
"shr %%cl, %%rsi\n\t"
#define ACCUM "%%rdx"
#define RESULT "%%rsi"
#define RESULT32 "%%esi"
#else /* if ARCH_X86_32 */
#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"mov "offset"+"offs"(%0), %%eax\n\t" \
"imull "offset"+"offc"(%1) \n\t" \
"add %%eax , %%esi\n\t" \
"adc %%edx , %%ecx\n\t"
#define FIRMULREG(label, offset, firc) \
MLPMUL(label, #offset, "0", "0")
#define CLEAR_ACCUM \
"xor %%esi, %%esi\n\t" \
"xor %%ecx, %%ecx\n\t"
#define SHIFT_ACCUM \
"mov %%ecx, %%edx\n\t" \
"mov %%esi, %%eax\n\t" \
"movzbl %7 , %%ecx\n\t" \
"shrd %%cl, %%edx, %%eax\n\t" \
#define ACCUM "%%edx"
#define RESULT "%%eax"
#define RESULT32 "%%eax"
#endif /* !ARCH_X86_64 */
#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
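/* MLP FIR/IIR filtering in inline asm, in outline: firorder/iirorder index
 * firtable[]/iirtable[], and the "jmp *%5"/"jmp *%6" computed jumps enter
 * the unrolled multiply-accumulate chains at the right tap count. Each
 * output is built in a 64-bit accumulator (a register pair on 32-bit x86),
 * shifted by filter_shift, masked, and written back into both the filter
 * state and the sample buffer. */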
static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer)
{
const void *firjump = firtable[firorder];
const void *iirjump = iirtable[iirorder];
blocksize = -blocksize;
__asm__ volatile(
"1: \n\t"
CLEAR_ACCUM
"jmp *%5 \n\t"
FIRMUL (ff_mlp_firorder_8, 0x1c )
FIRMUL (ff_mlp_firorder_7, 0x18 )
FIRMUL (ff_mlp_firorder_6, 0x14 )
FIRMUL (ff_mlp_firorder_5, 0x10 )
FIRMUL (ff_mlp_firorder_4, 0x0c )
FIRMUL (ff_mlp_firorder_3, 0x08 )
FIRMUL (ff_mlp_firorder_2, 0x04 )
FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
"jmp *%6 \n\t"
IIRMUL (ff_mlp_iirorder_4, 0x0c )
IIRMUL (ff_mlp_iirorder_3, 0x08 )
IIRMUL (ff_mlp_iirorder_2, 0x04 )
IIRMUL (ff_mlp_iirorder_1, 0x00 )
LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
SHIFT_ACCUM
"mov "RESULT" ,"ACCUM" \n\t"
"add (%2) ,"RESULT" \n\t"
"and %4 ,"RESULT" \n\t"
"sub $4 , %0 \n\t"
"mov "RESULT32", (%0) \n\t"
"mov "RESULT32", (%2) \n\t"
"add $"BINC" , %2 \n\t"
"sub "ACCUM" ,"RESULT" \n\t"
"mov "RESULT32","IOFFS"(%0) \n\t"
"incl %3 \n\t"
"js 1b \n\t"
: /* 0*/"+r"(state),
/* 1*/"+r"(coeff),
/* 2*/"+r"(sample_buffer),
#if ARCH_X86_64
/* 3*/"+r"(blocksize)
: /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
/* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
, /* 8*/"r"((int64_t)coeff[0])
: "rax", "rdx", "rsi"
#else /* ARCH_X86_32 */
/* 3*/"+m"(blocksize)
: /* 4*/"m"( mask), /* 5*/"m"(firjump),
/* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
: "eax", "edx", "esi", "ecx"
#endif /* !ARCH_X86_64 */
);
}
#endif /* HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS */
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
if (INLINE_MMX(cpu_flags))
c->mlp_filter_channel = mlp_filter_channel_x86;
#endif
if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
}

View file

@ -0,0 +1,289 @@
/*
* SIMD-optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
#if HAVE_X86ASM
#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
float *tmpbuf);
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
#if HAVE_6REGS && HAVE_SSE_INLINE
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
#define SUM8(op, sum, w, p) \
{ \
op(sum, (w)[0 * 64], (p)[0 * 64]); \
op(sum, (w)[1 * 64], (p)[1 * 64]); \
op(sum, (w)[2 * 64], (p)[2 * 64]); \
op(sum, (w)[3 * 64], (p)[3 * 64]); \
op(sum, (w)[4 * 64], (p)[4 * 64]); \
op(sum, (w)[5 * 64], (p)[5 * 64]); \
op(sum, (w)[6 * 64], (p)[6 * 64]); \
op(sum, (w)[7 * 64], (p)[7 * 64]); \
}
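/* apply_window(): SSE windowed dot products. A negative byte index counts up
 * to zero over the buffers (all pointers are pre-offset by len); each loop
 * iteration produces four sum1 and four sum2 outputs from eight window taps,
 * one tap per MULT() step, four floats at a time. */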
static void apply_window(const float *buf, const float *win1,
const float *win2, float *sum1, float *sum2, int len)
{
x86_reg count = - 4*len;
const float *win1a = win1+len;
const float *win2a = win2+len;
const float *bufa = buf+len;
float *sum1a = sum1+len;
float *sum2a = sum2+len;
#define MULT(a, b) \
"movaps " #a "(%1,%0), %%xmm1 \n\t" \
"movaps " #a "(%3,%0), %%xmm2 \n\t" \
"mulps %%xmm2, %%xmm1 \n\t" \
"subps %%xmm1, %%xmm0 \n\t" \
"mulps " #b "(%2,%0), %%xmm2 \n\t" \
"subps %%xmm2, %%xmm4 \n\t" \
__asm__ volatile(
"1: \n\t"
"xorps %%xmm0, %%xmm0 \n\t"
"xorps %%xmm4, %%xmm4 \n\t"
MULT( 0, 0)
MULT( 256, 64)
MULT( 512, 128)
MULT( 768, 192)
MULT(1024, 256)
MULT(1280, 320)
MULT(1536, 384)
MULT(1792, 448)
"movaps %%xmm0, (%4,%0) \n\t"
"movaps %%xmm4, (%5,%0) \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
:"+&r"(count)
:"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
);
#undef MULT
}
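/* apply_window_mp3(): MPEG audio synthesis windowing. The input buffer is
 * first copied past its end to avoid wraparound handling, apply_window()
 * builds the four partial sums (suma/sumb/sumc/sumd), and the SUMS() blocks
 * (or the scalar loop for incr != 1) combine them into the 32 output
 * samples. */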
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
ptrdiff_t incr)
{
LOCAL_ALIGNED_16(float, suma, [17]);
LOCAL_ALIGNED_16(float, sumb, [17]);
LOCAL_ALIGNED_16(float, sumc, [17]);
LOCAL_ALIGNED_16(float, sumd, [17]);
float sum;
/* copy to avoid wrap */
__asm__ volatile(
"movaps 0(%0), %%xmm0 \n\t" \
"movaps 16(%0), %%xmm1 \n\t" \
"movaps 32(%0), %%xmm2 \n\t" \
"movaps 48(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 0(%1) \n\t" \
"movaps %%xmm1, 16(%1) \n\t" \
"movaps %%xmm2, 32(%1) \n\t" \
"movaps %%xmm3, 48(%1) \n\t" \
"movaps 64(%0), %%xmm0 \n\t" \
"movaps 80(%0), %%xmm1 \n\t" \
"movaps 96(%0), %%xmm2 \n\t" \
"movaps 112(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 64(%1) \n\t" \
"movaps %%xmm1, 80(%1) \n\t" \
"movaps %%xmm2, 96(%1) \n\t" \
"movaps %%xmm3, 112(%1) \n\t"
::"r"(in), "r"(in+512)
:"memory"
);
apply_window(in + 16, win , win + 512, suma, sumc, 16);
apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
SUM8(MACS, suma[0], win + 32, in + 48);
sumc[ 0] = 0;
sumb[16] = 0;
sumd[16] = 0;
#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
"movups " #sumd "(%4), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"subps " #suma "(%1), %%xmm0 \n\t" \
"movaps %%xmm0," #out1 "(%0) \n\t" \
\
"movups " #sumc "(%3), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"addps " #sumb "(%2), %%xmm0 \n\t" \
"movaps %%xmm0," #out2 "(%0) \n\t"
if (incr == 1) {
__asm__ volatile(
SUMS( 0, 48, 4, 52, 0, 112)
SUMS(16, 32, 20, 36, 16, 96)
SUMS(32, 16, 36, 20, 32, 80)
SUMS(48, 0, 52, 4, 48, 64)
:"+&r"(out)
:"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
:"memory"
);
out += 16*incr;
} else {
int j;
float *out2 = out + 32 * incr;
out[0 ] = -suma[ 0];
out += incr;
out2 -= incr;
for(j=1;j<16;j++) {
*out = -suma[ j] + sumd[16-j];
*out2 = sumb[16-j] + sumc[ j];
out += incr;
out2 -= incr;
}
}
sum = 0;
SUM8(MLSS, sum, win + 16 + 32, in + 32);
*out = sum;
}
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
#if HAVE_X86ASM
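/* imdct36_blocks_* dispatcher: blocks are processed four at a time with the
 * interleaved four-wide IMDCT (ff_four_imdct36_float_*) using the
 * mdct_win_sse tables initialised below, falling back to the scalar
 * ff_imdct36_float_* routine for the remaining 1-3 blocks. */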
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
int count, int switch_point, int block_type) \
{ \
int align_end = count - (count & 3); \
int j; \
for (j = 0; j < align_end; j+= 4) { \
LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
in += 4*18; \
buf += 4*18; \
out += 4; \
} \
for (; j < count; j++) { \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
int win_idx = (switch_point && j < 2) ? 0 : block_type; \
float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
\
ff_imdct36_float_ ## CPU1(out, buf, in, win); \
\
in += 18; \
buf++; \
out++; \
} \
}
#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
av_unused int cpu_flags = av_get_cpu_flags();
int i, j;
for (j = 0; j < 4; j++) {
for (i = 0; i < 40; i ++) {
mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
}
}
#if HAVE_6REGS && HAVE_SSE_INLINE
if (INLINE_SSE(cpu_flags)) {
s->apply_window_float = apply_window_mp3;
}
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
#if HAVE_X86ASM
#if HAVE_SSE
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
}
if (EXTERNAL_SSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse3;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_ssse3;
}
#endif
#if HAVE_AVX_EXTERNAL
if (EXTERNAL_AVX(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_avx;
}
#endif
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,469 @@
/*
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
* H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mpegvideodata.h"
#if HAVE_MMX_INLINE
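/* H.263 dequantisation, roughly:
 *   level = sign(b) * (|b| * 2*qscale + qadd), with qadd = (qscale - 1) | 1,
 * implemented branch-free with pcmpgtw sign masks; zero coefficients stay
 * zero via the pcmpeqw/pandn pair. The intra DC term is handled separately
 * in C (y_dc_scale / c_dc_scale, or passed through unchanged for AIC). */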
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg level, qmul, qadd, nCoeffs;
qmul = qscale << 1;
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
if (!s->h263_aic) {
if (n < 4)
level = block[0] * s->y_dc_scale;
else
level = block[0] * s->c_dc_scale;
qadd = (qscale - 1) | 1;
}else{
qadd = 0;
level= block[0];
}
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"
"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
block[0]= level;
}
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg qmul, qadd, nCoeffs;
qmul = qscale << 1;
qadd = (qscale - 1) | 1;
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"
"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
}
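/* MPEG-1 intra dequantisation, roughly:
 *   level = sign(b) * ((|b| * qscale * quant_matrix[i]) >> 3),
 * with the result forced odd (the psubw/por steps with mm7 = 1) for
 * mismatch control, zeros preserved via pcmpeqw/pandn, and the DC
 * coefficient rescaled in C. */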
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;
av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
/* XXX: only MPEG-1 */
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
);
block[0]= block0;
}
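/* MPEG-1 inter dequantisation, roughly:
 *   level = sign(b) * (((2*|b| + 1) * qscale * quant_matrix[i]) >> 4),
 * again forced odd for mismatch control. */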
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
);
}
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;
av_assert2(s->block_last_index[n]>=0);
if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
else qscale <<= 1;
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
);
block[0]= block0;
//Note, we do not do mismatch control for intra as errors cannot accumulate
}
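/* MPEG-2 inter dequantisation, roughly:
 *   level = sign(b) * (((2*|b| + 1) * q) >> 5), q = qscale * quant_matrix[i].
 * Mismatch control here works differently from MPEG-1: the parity of every
 * output word is XORed into mm7 and, after the loop, one bit of the last
 * coefficient is toggled so the coefficient sum keeps the required parity. */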
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
av_assert2(s->block_last_index[n]>=0);
if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
else qscale <<= 1;
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlq $48, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psrlw $5, %%mm0 \n\t"
"psrlw $5, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"jng 1b \n\t"
"movd 124(%0, %3), %%mm0 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $16, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"pslld $31, %%mm7 \n\t"
"psrlq $15, %%mm7 \n\t"
"pxor %%mm7, %%mm0 \n\t"
"movd %%mm0, 124(%0, %3) \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
);
}
#endif /* HAVE_MMX_INLINE */
av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
{
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT))
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
}
#endif /* HAVE_MMX_INLINE */
}

View file

@ -0,0 +1,161 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideodsp.h"
#include "libavcodec/videodsp.h"
#if HAVE_INLINE_ASM
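/* gmc_mmx(): global motion compensation. Each destination pixel blends its
 * four surrounding source pixels bilinearly, roughly
 *   dst = (s00*(S-dx)*(S-dy) + s01*dx*(S-dy) + s10*(S-dx)*dy + s11*dx*dy + r)
 *         >> (2*shift), with S = 1 << shift,
 * falling back to ff_gmc_c() for non-constant full-pel offsets, motion
 * vectors needing more than 16 bits of sub-pel precision, or when edge
 * emulation would not fit the local buffer. */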
static void gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
{
const int w = 8;
const int ix = ox >> (16 + shift);
const int iy = oy >> (16 + shift);
const int oxs = ox >> 4;
const int oys = oy >> 4;
const int dxxs = dxx >> 4;
const int dxys = dxy >> 4;
const int dyxs = dyx >> 4;
const int dyys = dyy >> 4;
const uint16_t r4[4] = { r, r, r, r };
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
const uint64_t shift2 = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H 8U
uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
int x, y;
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
const int dxh = dxy * (h - 1);
const int dyw = dyx * (w - 1);
int need_emu = (unsigned) ix >= width - w || width < w ||
               (unsigned) iy >= height - h || height < h;
if ( // non-constant fullpel offset (3% of blocks)
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
// uses more than 16 bits of subpel mv (only at huge resolution)
(dxx | dxy | dyx | dyy) & 15 ||
(need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
// FIXME could still use mmx for some of the rows
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
shift, r, width, height);
return;
}
src += ix + iy * stride;
if (need_emu) {
ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
src = edge_buf;
}
__asm__ volatile (
"movd %0, %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
:: "r" (1 << shift));
for (x = 0; x < w; x += 4) {
uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
oxs - dxys + dxxs * (x + 1),
oxs - dxys + dxxs * (x + 2),
oxs - dxys + dxxs * (x + 3) };
uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
oys - dyys + dyxs * (x + 1),
oys - dyys + dyxs * (x + 2),
oys - dyys + dyxs * (x + 3) };
for (y = 0; y < h; y++) {
__asm__ volatile (
"movq %0, %%mm4 \n\t"
"movq %1, %%mm5 \n\t"
"paddw %2, %%mm4 \n\t"
"paddw %3, %%mm5 \n\t"
"movq %%mm4, %0 \n\t"
"movq %%mm5, %1 \n\t"
"psrlw $12, %%mm4 \n\t"
"psrlw $12, %%mm5 \n\t"
: "+m" (*dx4), "+m" (*dy4)
: "m" (*dxy4), "m" (*dyy4));
__asm__ volatile (
"movq %%mm6, %%mm2 \n\t"
"movq %%mm6, %%mm1 \n\t"
"psubw %%mm4, %%mm2 \n\t"
"psubw %%mm5, %%mm1 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm4, %%mm3 \n\t"
"pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
"pmullw %%mm5, %%mm3 \n\t" // dx * dy
"pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
"pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
"movd %4, %%mm5 \n\t"
"movd %3, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
"pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
"movd %2, %%mm5 \n\t"
"movd %1, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
"pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
"paddw %5, %%mm1 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"psrlw %6, %%mm0 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"movd %%mm0, %0 \n\t"
: "=m" (dst[x + y * stride])
: "m" (src[0]), "m" (src[1]),
"m" (src[stride]), "m" (src[stride + 1]),
"m" (*r4), "m" (shift2));
src += stride;
}
src += 4 - h * stride;
}
}
#endif /* HAVE_INLINE_ASM */
av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
c->gmc = gmc_mmx;
#endif /* HAVE_INLINE_ASM */
}

View file

@ -0,0 +1,244 @@
/*
* The simplest mpeg encoder (well, it was the simplest!)
* Copyright (c) 2000,2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dct.h"
#include "libavcodec/mpegvideo.h"
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
1, 2, 6, 7, 15, 16, 28, 29,
3, 5, 8, 14, 17, 27, 30, 43,
4, 9, 13, 18, 26, 31, 42, 44,
10, 12, 19, 25, 32, 41, 45, 54,
11, 20, 24, 33, 40, 46, 53, 55,
21, 23, 34, 39, 47, 52, 56, 61,
22, 35, 38, 48, 51, 57, 60, 62,
36, 37, 49, 50, 58, 59, 63, 64,
};
#if HAVE_6REGS
#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#define RENAME(a) a ## _mmx
#define RENAME_FDCT(a) a ## _mmx
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
#undef COMPILE_TEMPLATE_SSSE3
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _mmxext
#define RENAME_FDCT(a) a ## _mmxext
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_SSE2_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _sse2
#define RENAME_FDCT(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSE2_INLINE */
#if HAVE_SSSE3_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 1
#undef RENAME
#undef RENAME_FDCT
#define RENAME(a) a ## _ssse3
#define RENAME_FDCT(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_6REGS */
#if HAVE_INLINE_ASM
#if HAVE_MMX_INLINE
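/* denoise_dct: per-coefficient soft thresholding used by the encoder's noise
 * reduction option. Each |coefficient| has dct_offset[] subtracted
 * (saturating at zero) before quantisation, while the original magnitudes
 * are accumulated into dct_error_sum[] so the offsets can be updated. */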
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];
s->dct_count[intra]++;
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"movq (%0), %%mm2 \n\t"
"movq 8(%0), %%mm3 \n\t"
"pcmpgtw %%mm2, %%mm0 \n\t"
"pcmpgtw %%mm3, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psubusw (%2), %%mm2 \n\t"
"psubusw 8(%2), %%mm3 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm3, 8(%0) \n\t"
"movq %%mm4, %%mm2 \n\t"
"movq %%mm5, %%mm3 \n\t"
"punpcklwd %%mm7, %%mm4 \n\t"
"punpckhwd %%mm7, %%mm2 \n\t"
"punpcklwd %%mm7, %%mm5 \n\t"
"punpckhwd %%mm7, %%mm3 \n\t"
"paddd (%1), %%mm4 \n\t"
"paddd 8(%1), %%mm2 \n\t"
"paddd 16(%1), %%mm5 \n\t"
"paddd 24(%1), %%mm3 \n\t"
"movq %%mm4, (%1) \n\t"
"movq %%mm2, 8(%1) \n\t"
"movq %%mm5, 16(%1) \n\t"
"movq %%mm3, 24(%1) \n\t"
"add $16, %0 \n\t"
"add $32, %1 \n\t"
"add $16, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
);
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_SSE2_INLINE
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];
s->dct_count[intra]++;
__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"1: \n\t"
"pxor %%xmm0, %%xmm0 \n\t"
"pxor %%xmm1, %%xmm1 \n\t"
"movdqa (%0), %%xmm2 \n\t"
"movdqa 16(%0), %%xmm3 \n\t"
"pcmpgtw %%xmm2, %%xmm0 \n\t"
"pcmpgtw %%xmm3, %%xmm1 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, %%xmm4 \n\t"
"movdqa %%xmm3, %%xmm5 \n\t"
"psubusw (%2), %%xmm2 \n\t"
"psubusw 16(%2), %%xmm3 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm3, 16(%0) \n\t"
"movdqa %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm5, %%xmm0 \n\t"
"punpcklwd %%xmm7, %%xmm4 \n\t"
"punpckhwd %%xmm7, %%xmm6 \n\t"
"punpcklwd %%xmm7, %%xmm5 \n\t"
"punpckhwd %%xmm7, %%xmm0 \n\t"
"paddd (%1), %%xmm4 \n\t"
"paddd 16(%1), %%xmm6 \n\t"
"paddd 32(%1), %%xmm5 \n\t"
"paddd 48(%1), %%xmm0 \n\t"
"movdqa %%xmm4, (%1) \n\t"
"movdqa %%xmm6, 16(%1) \n\t"
"movdqa %%xmm5, 32(%1) \n\t"
"movdqa %%xmm0, 48(%1) \n\t"
"add $32, %0 \n\t"
"add $64, %1 \n\t"
"add $32, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
#endif /* HAVE_SSE2_INLINE */
#endif /* HAVE_INLINE_ASM */
av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
{
const int dct_algo = s->avctx->dct_algo;
if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
#if HAVE_6REGS
s->dct_quantize = dct_quantize_mmx;
#endif
s->denoise_dct = denoise_dct_mmx;
}
#endif
#if HAVE_6REGS && HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
s->dct_quantize = dct_quantize_mmxext;
#endif
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(cpu_flags)) {
#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
#endif
s->denoise_dct = denoise_dct_sse2;
}
#endif
#if HAVE_6REGS && HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_ssse3;
#endif
}
}

View file

@ -0,0 +1,109 @@
/*
* QNS functions are compiled 3 times for MMX/3DNOW/SSSE3
* Copyright (c) 2004 Michael Niedermayer
*
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "inline_asm.h"
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
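/* QNS helpers, in outline: try_8x8basis() returns the weighted squared error
 * that would result from adding basis[]*scale to the current residual rem[]
 * (all in the transform domain), and add_8x8basis() applies that update.
 * Both use PMULHRW for the rounded fixed-point scale multiply; the C
 * fallback in add_8x8basis() covers scales too large for the 16-bit path. */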
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
{
x86_reg i=0;
av_assert2(FFABS(scale) < MAX_ABS);
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"movd %4, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"psraw $6, %%mm0 \n\t"
"psraw $6, %%mm1 \n\t"
"pmullw (%3, %0), %%mm0 \n\t"
"pmullw 8(%3, %0), %%mm1 \n\t"
"pmaddwd %%mm0, %%mm0 \n\t"
"pmaddwd %%mm1, %%mm1 \n\t"
"paddd %%mm1, %%mm0 \n\t"
"psrld $4, %%mm0 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" //FIXME optimize & bench
" jb 1b \n\t"
PHADDD(%%mm7, %%mm6)
"psrld $2, %%mm7 \n\t"
"movd %%mm7, %0 \n\t"
: "+r" (i)
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
);
return i;
}
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
{
x86_reg i=0;
if(FFABS(scale) < MAX_ABS){
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
__asm__ volatile(
"movd %3, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"movq %%mm0, (%2, %0) \n\t"
"movq %%mm1, 8(%2, %0) \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" // FIXME optimize & bench
" jb 1b \n\t"
: "+r" (i)
: "r"(basis), "r"(rem), "g"(scale)
);
}else{
for(i=0; i<8*8; i++){
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
}
}
}

View file

@ -0,0 +1,423 @@
/*
* MPEG video MMX templates
*
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/mpegutils.h"
#include "libavcodec/mpegvideo.h"
#include "fdct.h"
#undef MMREG_WIDTH
#undef MM
#undef MOVQ
#undef SPREADW
#undef PMAXW
#undef PMAX
#undef SAVE_SIGN
#undef RESTORE_SIGN
#if COMPILE_TEMPLATE_SSE2
#define MMREG_WIDTH "16"
#define MM "%%xmm"
#define MOVQ "movdqa"
#define SPREADW(a) \
"pshuflw $0, "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"movhlps "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define MMREG_WIDTH "8"
#define MM "%%mm"
#define MOVQ "movq"
#if COMPILE_TEMPLATE_MMXEXT
#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"pshufw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshufw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define SPREADW(a) \
"punpcklwd "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) \
"psubusw "a", "b" \n\t"\
"paddw "a", "b" \n\t"
#define PMAX(a,b) \
"movq "a", "b" \n\t"\
"psrlq $32, "a" \n\t"\
PMAXW(b, a)\
"movq "a", "b" \n\t"\
"psrlq $16, "a" \n\t"\
PMAXW(b, a)
#endif
#endif
#if COMPILE_TEMPLATE_SSSE3
#define SAVE_SIGN(a,b) \
"movdqa "b", "a" \n\t"\
"pabsw "b", "b" \n\t"
#define RESTORE_SIGN(a,b) \
"psignw "a", "b" \n\t"
#else
#define SAVE_SIGN(a,b) \
"pxor "a", "a" \n\t"\
"pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" /* ABS(block[i]) */
#define RESTORE_SIGN(a,b) \
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
#endif
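/* dct_quantize template, in outline: run the forward DCT, then scale each
 * |block[i]| by qmat[i] with the rounding/dead-zone bias (see the per-line
 * comments), shift right by 16 and restore the sign, writing the results
 * into temp_block while tracking the largest value (for the overflow flag)
 * and the last nonzero index via inv_zigzag_direct16. The coefficients are
 * finally copied back into block[] in the permutation order expected by the
 * selected IDCT; the intra DC term gets its own divide by the DC scale. */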
static int RENAME(dct_quantize)(MpegEncContext *s,
int16_t *block, int n,
int qscale, int *overflow)
{
x86_reg last_non_zero_p1;
int level=0, q; //=0 is because gcc says uninitialized ...
const uint16_t *qmat, *bias;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
//s->fdct (block);
RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...
if(s->dct_error_sum)
s->denoise_dct(s, block);
if (s->mb_intra) {
int dummy;
if (n < 4){
q = s->y_dc_scale;
bias = s->q_intra_matrix16[qscale][1];
qmat = s->q_intra_matrix16[qscale][0];
}else{
q = s->c_dc_scale;
bias = s->q_chroma_intra_matrix16[qscale][1];
qmat = s->q_chroma_intra_matrix16[qscale][0];
}
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
__asm__ volatile (
"mul %%ecx \n\t"
: "=d" (level), "=a"(dummy)
: "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
);
} else
/* For AIC we skip quant/dequant of INTRADC */
level = (block[0] + 4)>>3;
block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
} else {
last_non_zero_p1 = 0;
bias = s->q_inter_matrix16[qscale][1];
qmat = s->q_inter_matrix16[qscale][0];
}
if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
__asm__ volatile(
"movd %%"FF_REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
MOVQ" (%2), "MM"5 \n\t" // qmat[0]
"pxor "MM"6, "MM"6 \n\t"
"psubw (%3), "MM"6 \n\t" // -bias[0]
"mov $-128, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"FF_REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
"psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"FF_REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"FF_REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"FF_REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"FF_REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"FF_REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}else{ // FMT_H263
__asm__ volatile(
"movd %%"FF_REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
"mov $-128, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"FF_REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
MOVQ" (%3, %%"FF_REG_a"), "MM"6 \n\t" // bias[0]
"paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
MOVQ" (%2, %%"FF_REG_a"), "MM"5 \n\t" // qmat[i]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"FF_REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"FF_REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"FF_REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"FF_REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"FF_REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
__asm__ volatile(
"movd %1, "MM"1 \n\t" // max_qcoeff
SPREADW(MM"1")
"psubusw "MM"1, "MM"4 \n\t"
"packuswb "MM"4, "MM"4 \n\t"
#if COMPILE_TEMPLATE_SSE2
"packsswb "MM"4, "MM"4 \n\t"
#endif
"movd "MM"4, %0 \n\t" // *overflow
: "=g" (*overflow)
: "g" (s->max_qcoeff)
);
if(s->mb_intra) block[0]= level;
else block[0]= temp_block[0];
if (s->idsp.perm_type == FF_IDCT_PERM_SIMPLE) {
if(last_non_zero_p1 <= 1) goto end;
block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
block[0x20] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
block[0x09] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
block[0x0C] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
if(last_non_zero_p1 <= 1) goto end;
block[0x04] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
block[0x05] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1C] = temp_block[0x19];
block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
} else if (s->idsp.perm_type == FF_IDCT_PERM_NONE) {
if(last_non_zero_p1 <= 1) goto end;
block[0x01] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
block[0x03] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x19] = temp_block[0x19];
block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
} else if (s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE) {
if(last_non_zero_p1 <= 1) goto end;
block[0x08] = temp_block[0x01];
block[0x01] = temp_block[0x08]; block[0x02] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x09] = temp_block[0x09]; block[0x10] = temp_block[0x02];
block[0x18] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x11] = temp_block[0x0A]; block[0x0A] = temp_block[0x11];
block[0x03] = temp_block[0x18]; block[0x04] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x0B] = temp_block[0x19];
block[0x12] = temp_block[0x12]; block[0x19] = temp_block[0x0B];
block[0x20] = temp_block[0x04]; block[0x28] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x21] = temp_block[0x0C]; block[0x1A] = temp_block[0x13];
block[0x13] = temp_block[0x1A]; block[0x0C] = temp_block[0x21];
block[0x05] = temp_block[0x28]; block[0x06] = temp_block[0x30];
block[0x0D] = temp_block[0x29]; block[0x14] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x22] = temp_block[0x14];
block[0x29] = temp_block[0x0D]; block[0x30] = temp_block[0x06];
block[0x38] = temp_block[0x07]; block[0x31] = temp_block[0x0E];
block[0x2A] = temp_block[0x15]; block[0x23] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x1C] = temp_block[0x23]; block[0x15] = temp_block[0x2A];
block[0x0E] = temp_block[0x31]; block[0x07] = temp_block[0x38];
block[0x0F] = temp_block[0x39]; block[0x16] = temp_block[0x32];
block[0x1D] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x2B] = temp_block[0x1D]; block[0x32] = temp_block[0x16];
block[0x39] = temp_block[0x0F]; block[0x3A] = temp_block[0x17];
block[0x33] = temp_block[0x1E]; block[0x2C] = temp_block[0x25];
block[0x25] = temp_block[0x2C]; block[0x1E] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x17] = temp_block[0x3A]; block[0x1F] = temp_block[0x3B];
block[0x26] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
block[0x34] = temp_block[0x26]; block[0x3B] = temp_block[0x1F];
block[0x3C] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x2E] = temp_block[0x35]; block[0x27] = temp_block[0x3C];
block[0x2F] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x3E] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
} else {
av_log(s, AV_LOG_DEBUG, "s->idsp.perm_type: %d\n",
(int)s->idsp.perm_type);
av_assert0(s->idsp.perm_type == FF_IDCT_PERM_NONE ||
s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2 ||
s->idsp.perm_type == FF_IDCT_PERM_SIMPLE ||
s->idsp.perm_type == FF_IDCT_PERM_TRANSPOSE);
}
end:
return last_non_zero_p1 - 1;
}


@ -0,0 +1,272 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM
#define PHADDD(a, t) \
"movq " #a ", " #t " \n\t" \
"psrlq $32, " #a " \n\t" \
"paddd " #t ", " #a " \n\t"
/*
* pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
* pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
* pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
*/
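/*
 * For example (illustrative values), with src = 0x4000 (0.5 in Q15) and
 * dst = 3, the definitions above give different results:
 *   pmulhw  : (3 * 0x4000)          >> 16 = 0
 *   pmulhrw : (3 * 0x4000 + 0x8000) >> 16 = 1
 *   pmulhrsw: (3 * 0x4000 + 0x4000) >> 15 = 2
 */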
#define PMULHRW(x, y, s, o) \
"pmulhw " #s ", " #x " \n\t" \
"pmulhw " #s ", " #y " \n\t" \
"paddw " #o ", " #x " \n\t" \
"paddw " #o ", " #y " \n\t" \
"psraw $1, " #x " \n\t" \
"psraw $1, " #y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1
#include "mpegvideoenc_qns_template.c"
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o) \
"pmulhrw " #s ", " #x " \n\t" \
"pmulhrw " #s ", " #y " \n\t"
#include "mpegvideoenc_qns_template.c"
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t) \
"pshufw $0x0E, " #a ", " #t " \n\t" \
/* faster than phaddd on core2 */ \
"paddd " #t ", " #a " \n\t"
#define PMULHRW(x, y, s, o) \
"pmulhrsw " #s ", " #x " \n\t" \
"pmulhrsw " #s ", " #y " \n\t"
#include "mpegvideoenc_qns_template.c"
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */
/* Draw the edges of width 'w' of an image of size width, height.
 * This MMX version only handles w == 4, w == 8 or w == 16
 * (see the av_assert1() in the final branch). */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides)
{
uint8_t *ptr, *last_line;
int i;
last_line = buf + (height - 1) * wrap;
/* left and right */
ptr = buf;
if (w == 8) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
} else if (w == 16) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq %%mm0, -16(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"movq %%mm1, 8(%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
);
} else {
av_assert1(w == 4);
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"movd %%mm0, -4(%0) \n\t"
"movd -4(%0, %2), %%mm1 \n\t"
"punpcklbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movd %%mm1, (%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
}
/* top and bottom (and hopefully also the corners) */
if (sides & EDGE_TOP) {
for (i = 0; i < h; i += 4) {
ptr = buf - (i + 1) * wrap - w;
__asm__ volatile (
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $8, %0 \n\t"
"cmp %4, %0 \n\t"
"jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) buf - (x86_reg) ptr - w),
"r" ((x86_reg) - wrap), "r" ((x86_reg) - wrap * 3),
"r" (ptr + width + 2 * w));
}
}
if (sides & EDGE_BOTTOM) {
for (i = 0; i < h; i += 4) {
ptr = last_line + (i + 1) * wrap - w;
__asm__ volatile (
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $8, %0 \n\t"
"cmp %4, %0 \n\t"
"jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) last_line - (x86_reg) ptr - w),
"r" ((x86_reg) wrap), "r" ((x86_reg) wrap * 3),
"r" (ptr + width + 2 * w));
}
}
}
#endif /* HAVE_INLINE_ASM */
av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmxext;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2;
}
if (EXTERNAL_XOP(cpu_flags)) {
c->pix_sum = ff_pix_sum16_xop;
}
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_mmx;
}
c->add_8x8basis = add_8x8basis_mmx;
if (avctx->bits_per_raw_sample <= 8) {
c->draw_edges = draw_edges_mmx;
}
}
if (INLINE_AMD3DNOW(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_3dnow;
}
c->add_8x8basis = add_8x8basis_3dnow;
}
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_ssse3;
}
c->add_8x8basis = add_8x8basis_ssse3;
}
#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_INLINE_ASM */
}


@ -0,0 +1,35 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/opusdsp.h"
void ff_opus_postfilter_fma3(float *data, int period, float *gains, int len);
float ff_opus_deemphasis_fma3(float *out, float *in, float coeff, int len);
av_cold void ff_opus_dsp_init_x86(OpusDSP *ctx)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
ctx->postfilter = ff_opus_postfilter_fma3;
ctx->deemphasis = ff_opus_deemphasis_fma3;
}
}


@ -0,0 +1,52 @@
/*
* SIMD-optimized pixel operations
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pixblockdsp.h"
void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride);
void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride);
av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_mmx;
c->diff_pixels_unaligned =
c->diff_pixels = ff_diff_pixels_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_sse2;
c->diff_pixels_unaligned =
c->diff_pixels = ff_diff_pixels_sse2;
}
}


@ -0,0 +1,50 @@
/*
* x86 PNG optimizations.
* Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pngdsp.h"
void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
uint8_t *top, int w, int bpp);
void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
uint8_t *top, int w, int bpp);
void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
uint8_t *src2, int w);
void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1,
uint8_t *src2, int w);
av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags))
dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
#endif
if (EXTERNAL_MMXEXT(cpu_flags))
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
if (EXTERNAL_SSE2(cpu_flags))
dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
}


@ -0,0 +1,50 @@
/*
* Apple ProRes compatible decoder
*
* Copyright (c) 2010-2011 Maxim Poliakovski
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/idctdsp.h"
#include "libavcodec/proresdsp.h"
void ff_prores_idct_put_10_sse2(uint16_t *dst, ptrdiff_t linesize,
int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_avx (uint16_t *dst, ptrdiff_t linesize,
int16_t *block, const int16_t *qmat);
av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
if (avctx->bits_per_raw_sample == 10){
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
dsp->idct_put = ff_prores_idct_put_10_sse2;
}
if (EXTERNAL_AVX(cpu_flags)) {
dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
dsp->idct_put = ff_prores_idct_put_10_avx;
}
}
#endif /* ARCH_X86_64 */
}


@ -0,0 +1,544 @@
/*
* quarterpel DSP functions
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pixels.h"
#include "libavcodec/qpeldsp.h"
#include "fpel.h"
void ff_put_pixels8_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst,
const uint8_t *src1, const uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
#if HAVE_X86ASM
#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
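/* Note: QPEL_OP() generates the sixteen motion-compensation functions for one
 * block size. The mcXY suffix encodes the quarter-pel phase, X horizontally
 * and Y vertically: mc00 is a plain copy, mc20/mc02 are the pure half-pel
 * lowpass cases, and the remaining positions combine the h/v lowpass filters
 * with an average (the pixels*_l2 helpers) against the nearest full- or
 * half-pel samples. */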
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half + 64; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
8, stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[9]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel16_mc00_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
stride, stride, 16);\
} \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t *const half = (uint8_t*) temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t *const half = (uint8_t *) temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t *const halfH = (uint8_t *) half + 256; \
uint8_t *const halfHV = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t *const halfH = (uint8_t *) half; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
}
QPEL_OP(put_, _, mmxext)
QPEL_OP(avg_, _, mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
#endif /* HAVE_X86ASM */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (X86_MMXEXT(cpu_flags)) {
#if HAVE_MMXEXT_EXTERNAL
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
#endif /* HAVE_MMXEXT_EXTERNAL */
}
}


@ -0,0 +1,175 @@
/*
* SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "inline_asm.h"
// put_pixels
av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
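/* Note: each output pixel is the average of a 2x2 input neighbourhood,
 *   dst[x] = (p[x] + p[x+1] + p_below[x] + p_below[x+1] + rnd) >> 2,
 * where rnd is 2 for the rounding and 1 for the no-rounding variant (see
 * SET_RND below). The loop works on 8 pixels at a time in 16-bit lanes and
 * processes two rows per iteration, reusing the horizontal sums of the
 * previous row kept in mm4/mm5 (or mm0/mm1). */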
MOVQ_ZERO(mm7);
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddusw %%mm2, %%mm0 \n\t"
"paddusw %%mm3, %%mm1 \n\t"
"paddusw %%mm6, %%mm4 \n\t"
"paddusw %%mm6, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"movq %%mm4, (%2, %%"FF_REG_a") \n\t"
"add %3, %%"FF_REG_a" \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm2, %%mm4 \n\t"
"paddusw %%mm3, %%mm5 \n\t"
"paddusw %%mm6, %%mm0 \n\t"
"paddusw %%mm6, %%mm1 \n\t"
"paddusw %%mm4, %%mm0 \n\t"
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%2, %%"FF_REG_a") \n\t"
"add %3, %%"FF_REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels)
:"D"(block), "r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
// avg_pixels
// this routine is 'slightly' suboptimal but mostly unused
av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddusw %%mm2, %%mm0 \n\t"
"paddusw %%mm3, %%mm1 \n\t"
"paddusw %%mm6, %%mm4 \n\t"
"paddusw %%mm6, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
"movq %%mm5, (%2, %%"FF_REG_a") \n\t"
"add %3, %%"FF_REG_a" \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm2, %%mm4 \n\t"
"paddusw %%mm3, %%mm5 \n\t"
"paddusw %%mm6, %%mm0 \n\t"
"paddusw %%mm6, %%mm1 \n\t"
"paddusw %%mm4, %%mm0 \n\t"
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
"movq %%mm1, (%2, %%"FF_REG_a") \n\t"
"add %3, %%"FF_REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels)
:"D"(block), "r"((x86_reg)line_size)
:FF_REG_a, "memory");
}


@ -0,0 +1,48 @@
/*
* RV30/40 MMX/SSE2 optimizations
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/rv34dsp.h"
void ff_rv34_idct_dc_mmxext(int16_t *block);
void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse2(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
{
int cpu_flags = av_get_cpu_flags();
if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
c->rv34_idct_add = ff_rv34_idct_add_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse2;
if (EXTERNAL_SSE4(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
}


@ -0,0 +1,278 @@
/*
* RV40 decoder motion compensation functions x86-optimised
* Copyright (c) 2008 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* RV40 decoder motion compensation functions x86-optimised
* 2,0 and 0,2 have h264 equivalents.
* 3,3 is bugged in the rv40 format and maps to _xy2 version
*/
#include "libavcodec/rv34dsp.h"
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "hpeldsp.h"
#define DEFINE_FN(op, size, insn) \
static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
}
#if HAVE_X86ASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmxext)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
/** @{ */
/**
* Define one qpel function.
* LOOPSIZE must be already set to the number of pixels processed per
* iteration in the inner loop of the called functions.
* COFF(x) must be already defined so as to provide the offset into any
* array of coeffs used by the called function for the qpel position x.
*/
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
{ \
int i; \
if (PH && PV) { \
LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]); \
uint8_t *tmpptr = tmp + SIZE * 2; \
src -= stride * 2; \
\
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
SIZE + 5, HCOFF(PH)); \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
SIZE, SIZE, VCOFF(PV)); \
} else if (PV) { \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
stride, SIZE, VCOFF(PV)); \
} else { \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
stride, SIZE, HCOFF(PH)); \
} \
}
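/* For example, QPEL_FUNCS_DECL(put_, 1, 0, _sse2) below expands to
 * put_rv40_qpel8_mc10_sse2() and put_rv40_qpel16_mc10_sse2(), which run only
 * the horizontal filter path with the coefficient offset HCOFF(1). */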
/** Declare functions for sizes 8 and 16 and given operations
* and qpel position. */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
/** Declare all functions for all sizes and qpel positions */
#define QPEL_MC_DECL(OP, OPT) \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
const uint8_t *src, \
ptrdiff_t srcStride, \
int len, int m); \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
const uint8_t *src, \
ptrdiff_t srcStride, \
int len, int m); \
QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
/** @} */
#define LOOPSIZE 8
#define HCOFF(x) (32 * ((x) - 1))
#define VCOFF(x) (32 * ((x) - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 8
#define HCOFF(x) (64 * ((x) - 1))
#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)
#if ARCH_X86_32
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 4
#define HCOFF(x) (64 * ((x) - 1))
#define VCOFF(x) (64 * ((x) - 1))
QPEL_MC_DECL(put_, _mmx)
#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _mmxext)
#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _3dnow)
#endif
/** @{ */
/** Set one function */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
/** Set all functions for all sizes and qpel positions */
#define QPEL_MC_SET(OP, OPT) \
QPEL_FUNCS_SET (OP, 0, 1, OPT) \
QPEL_FUNCS_SET (OP, 0, 3, OPT) \
QPEL_FUNCS_SET (OP, 1, 0, OPT) \
QPEL_FUNCS_SET (OP, 1, 1, OPT) \
QPEL_FUNCS_SET (OP, 1, 2, OPT) \
QPEL_FUNCS_SET (OP, 1, 3, OPT) \
QPEL_FUNCS_SET (OP, 2, 1, OPT) \
QPEL_FUNCS_SET (OP, 2, 2, OPT) \
QPEL_FUNCS_SET (OP, 2, 3, OPT) \
QPEL_FUNCS_SET (OP, 3, 0, OPT) \
QPEL_FUNCS_SET (OP, 3, 1, OPT) \
QPEL_FUNCS_SET (OP, 3, 2, OPT)
/** @} */
DEFINE_FN(put, 8, ssse3)
DEFINE_FN(put, 16, sse2)
DEFINE_FN(put, 16, ssse3)
DEFINE_FN(avg, 8, mmxext)
DEFINE_FN(avg, 8, ssse3)
DEFINE_FN(avg, 16, sse2)
DEFINE_FN(avg, 16, ssse3)
#endif /* HAVE_X86ASM */
#if HAVE_MMX_INLINE
DEFINE_FN(put, 8, mmx)
DEFINE_FN(avg, 8, mmx)
DEFINE_FN(put, 16, mmx)
DEFINE_FN(avg, 16, mmx)
#endif
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags)) {
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx;
c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx;
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx;
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_X86ASM
if (EXTERNAL_MMX(cpu_flags)) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
#if ARCH_X86_32
QPEL_MC_SET(put_, _mmx)
#endif
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
#if ARCH_X86_32
QPEL_MC_SET(avg_, _3dnow)
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmxext;
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
#if ARCH_X86_32
QPEL_MC_SET(avg_, _mmxext)
#endif
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_sse2;
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_sse2;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
QPEL_MC_SET(put_, _sse2)
QPEL_MC_SET(avg_, _sse2)
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3;
c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3;
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3;
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_ssse3;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
QPEL_MC_SET(put_, _ssse3)
QPEL_MC_SET(avg_, _ssse3)
}
#endif /* HAVE_X86ASM */
}


@ -0,0 +1,51 @@
/*
* Bluetooth low-complexity, subband codec (SBC)
*
* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2008-2010 Nokia Corporation
* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* SBC MMX optimization for some basic "building bricks"
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbcdsp.h"
void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8],
uint32_t scale_factor[2][8],
int blocks, int channels, int subbands);
av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
}
}

View file

@@ -0,0 +1,548 @@
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_mask3 dd 0, 0, 0, 1<<31
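; Editorial note (not in the original source): xorps against a lane holding
; 1<<31 flips only the IEEE-754 sign bit of that float, e.g.
;     xorps m0, [ps_mask]   ; (a, b, c, d) -> (-a, b, -c, d)
;     xorps m0, [ps_mask2]  ; (a, b, c, d) -> (a, -b, c, -d)
; which is why these masks stand in for a multiply by -1.0/1.0.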
ps_noise0 times 2 dd 1.0, 0.0
ps_noise2 times 2 dd -1.0, 0.0
ps_noise13 dd 0.0, 1.0, 0.0, -1.0
dd 0.0, -1.0, 0.0, 1.0
dd 0.0, 1.0, 0.0, -1.0
cextern sbr_noise_table
cextern ps_neg
SECTION .text
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
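; Editorial note (not in the original source): following the x86inc.asm
; convention used throughout this file, the operands of cglobal are
; <name>, <argument count>, <GPRs used>, <XMM registers used>, so this entry
; point takes two arguments (buffer pointer and length) and may clobber up
; to three GPRs and six XMM registers.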
mov r2d, r1d
xorps m0, m0
xorps m1, m1
sar r2, 3
jz .prepare
.loop:
movu m2, [r0 + 0]
movu m3, [r0 + 16]
movu m4, [r0 + 32]
movu m5, [r0 + 48]
mulps m2, m2
mulps m3, m3
mulps m4, m4
mulps m5, m5
addps m0, m2
addps m1, m3
addps m0, m4
addps m1, m5
add r0, 64
dec r2
jnz .loop
.prepare:
and r1, 7
sar r1, 1
jz .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
movu m2, [r0]
add r0, 16
mulps m2, m2
dec r1
addps m0, m2
jnz .endloop
.end:
addps m0, m1
movhlps m2, m0
addps m0, m2
movss m1, m0
shufps m0, m0, 1
addss m0, m1
%if ARCH_X86_64 == 0
movss r0m, m0
fld dword r0m
%endif
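; Editorial note (not in the original source): on 32-bit x86 the ABI returns
; floats in st(0), so the scalar result is spilled to the first argument's
; stack slot and reloaded with fld; on x86-64 the sum is already in xmm0,
; the float return register, so nothing extra is needed.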
RET
%define STEP 40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
mov r5, r3
and r3, 0xFC
lea r2, [r2 + r3*4]
lea r0, [r0 + r3*8]
neg r3
jz .loop1
.loop4:
movlps m0, [r2 + 4*r3 + 0]
movlps m1, [r2 + 4*r3 + 8]
movlps m2, [r1 + 0*STEP]
movlps m3, [r1 + 2*STEP]
movhps m2, [r1 + 1*STEP]
movhps m3, [r1 + 3*STEP]
unpcklps m0, m0
unpcklps m1, m1
mulps m0, m2
mulps m1, m3
movu [r0 + 8*r3 + 0], m0
movu [r0 + 8*r3 + 16], m1
add r1, 4*STEP
add r3, 4
jnz .loop4
and r5, 3 ; number of single element loops
jz .end
.loop1: ; element 0 and 1 can be computed at the same time
movss m0, [r2]
movlps m2, [r1]
unpcklps m0, m0
mulps m2, m0
movlps [r0], m2
add r0, 8
add r2, 4
add r1, STEP
dec r5
jnz .loop1
.end:
RET
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
; const float alpha0[2], const float alpha1[2],
; float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
movss bw, BWm
%endif
movlps m2, [alpha1q]
movlps m1, [alpha0q]
shufps bw, bw, 0
mulps m2, bw ; (a1[0] a1[1])*bw
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
; start and end 6th and 7th args on stack
mov r2d, Sm
mov r3d, Em
DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
movsxd startq, startd
movsxd endq, endd
%endif
sub startq, endq ; neg num of loops
lea X_highq, [X_highq + endq*2*4]
lea X_lowq, [X_lowq + endq*2*4 - 2*2*4]
shl startq, 3 ; offset from num loops
mova m0, [X_lowq + startq]
shufps m3, m3, q1111
shufps m4, m4, q1111
xorps m3, [ps_mask]
shufps m1, m1, q0000
shufps m2, m2, q0000
xorps m4, [ps_mask]
.loop2:
movu m7, [X_lowq + startq + 8] ; BbCc
mova m6, m0
mova m5, m7
shufps m0, m0, q2301 ; aAbB
shufps m7, m7, q2301 ; bBcC
mulps m0, m4
mulps m7, m3
mulps m6, m2
mulps m5, m1
addps m7, m0
mova m0, [X_lowq + startq + 16] ; CcDd
addps m7, m0
addps m6, m5
addps m7, m6
mova [X_highq + startq], m7
add startq, 16
jnz .loop2
RET
cglobal sbr_sum64x5, 1,2,4,z
lea r1q, [zq+ 256]
.loop:
mova m0, [zq+ 0]
mova m2, [zq+ 16]
mova m1, [zq+ 256]
mova m3, [zq+ 272]
addps m0, [zq+ 512]
addps m2, [zq+ 528]
addps m1, [zq+ 768]
addps m3, [zq+ 784]
addps m0, [zq+1024]
addps m2, [zq+1040]
addps m0, m1
addps m2, m3
mova [zq], m0
mova [zq+16], m2
add zq, 32
cmp zq, r1q
jne .loop
REP_RET
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
lea r2q, [zq + (64-4)*4]
mova m3, [ps_neg]
.loop:
mova m1, [zq]
xorps m0, m3, [r2q]
shufps m0, m0, m0, q0123
unpcklps m2, m0, m1
unpckhps m0, m0, m1
mova [Wq + 0], m2
mova [Wq + 16], m0
add Wq, 32
sub r2q, 16
add zq, 16
cmp zq, r2q
jl .loop
REP_RET
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
lea r1q, [zq+256]
.loop:
mova m0, [zq+ 0]
mova m1, [zq+16]
mova m2, [zq+32]
mova m3, [zq+48]
xorps m0, [ps_mask2]
xorps m1, [ps_mask2]
xorps m2, [ps_mask2]
xorps m3, [ps_mask2]
mova [zq+ 0], m0
mova [zq+16], m1
mova [zq+32], m2
mova [zq+48], m3
add zq, 64
cmp zq, r1q
jne .loop
REP_RET
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
mova m4, [src0q+cq+mmsize]
mova m5, [src1q+mmsize]
%if cpuflag(sse2)
pshufd m2, m0, q0123
pshufd m3, m1, q0123
pshufd m6, m4, q0123
pshufd m7, m5, q0123
%else
shufps m2, m0, m0, q0123
shufps m3, m1, m1, q0123
shufps m6, m4, m4, q0123
shufps m7, m5, m5, q0123
%endif
addps m5, m2
subps m0, m7
addps m1, m6
subps m4, m3
mova [vrevq], m1
mova [vrevq+mmsize], m5
mova [vq+cq], m0
mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
mov r3q, OFFSET
lea r1q, [zq + (32+1)*4]
lea r2q, [zq + 64*4]
mova m5, [ps_neg]
.loop:
movu m0, [r1q]
movu m2, [r1q + mmsize]
movu m1, [zq + r3q + 4 + mmsize]
movu m3, [zq + r3q + 4]
pxor m2, m5
pxor m0, m5
pshufd m2, m2, q0123
pshufd m0, m0, q0123
SBUTTERFLY dq, 2, 3, 4
SBUTTERFLY dq, 0, 1, 4
mova [r2q + 2*r3q + 0*mmsize], m2
mova [r2q + 2*r3q + 1*mmsize], m3
mova [r2q + 2*r3q + 2*mmsize], m0
mova [r2q + 2*r3q + 3*mmsize], m1
add r1q, 2*mmsize
sub r3q, 2*mmsize
jge .loop
movq m2, [zq]
movq [r2q], m2
REP_RET
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
%macro LOAD_NST 1
%ifdef PIC
lea NOISE_TABLE, [%1]
mova m0, [kxq + NOISE_TABLE]
%else
mova m0, [kxq + %1]
%endif
%endmacro
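; Editorial note (not in the original source): in position-independent builds
; the absolute address of sbr_noise_table cannot be encoded as a displacement,
; so one extra GPR (NOISE_TABLE, hence NREGS=1) is loaded with lea and the
; table is indexed relative to it; non-PIC builds address the symbol directly
; and keep NREGS at 0.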
INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise0]
jmp apply_noise_main
; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13
jmp apply_noise_main
; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise2]
jmp apply_noise_main
; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13+16
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
mov kxd, m_maxm
DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
movsxdifnidn noiseq, noised
dec noiseq
shl countd, 2
%ifdef PIC
lea NOISE_TABLE, [sbr_noise_table]
%endif
lea Yq, [Yq + 2*countq]
add s_mq, countq
add q_filtq, countq
shl noiseq, 3
pxor m5, m5
neg countq
.loop:
mova m1, [q_filtq + countq]
movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
add noiseq, 2*mmsize
and noiseq, 0x1ff<<3
punpckhdq m2, m1, m1
punpckldq m1, m1
mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
mova m3, [s_mq + countq]
; TODO: replace by a vpermd in AVX2
punpckhdq m4, m3, m3
punpckldq m3, m3
pcmpeqd m6, m3, m5 ; m6 == 0
pcmpeqd m7, m4, m5 ; m7 == 0
mulps m3, m0 ; s_m[m] * phi_sign
mulps m4, m0 ; s_m[m] * phi_sign
pand m1, m6
pand m2, m7
movu m6, [Yq + 2*countq]
movu m7, [Yq + 2*countq + mmsize]
addps m3, m1
addps m4, m2
addps m6, m3
addps m7, m4
movu [Yq + 2*countq], m6
movu [Yq + 2*countq + mmsize], m7
add countq, mmsize
jl .loop
RET
INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT 32*4
%define OFFSET 32*4
mov cq, -COUNT
lea vrevq, [vq + OFFSET + COUNT]
add vq, OFFSET-mmsize
add srcq, 2*COUNT
mova m3, [ps_neg]
.loop:
mova m0, [srcq + 2*cq + 0*mmsize]
mova m1, [srcq + 2*cq + 1*mmsize]
shufps m2, m0, m1, q2020
shufps m1, m0, q1313
xorps m2, m3
mova [vq], m1
mova [vrevq + cq], m2
sub vq, mmsize
add cq, mmsize
jl .loop
REP_RET
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
mov cntq, 37*8
add xq, cntq
neg cntq
%if cpuflag(sse3)
%define MOVH movsd
movddup m5, [xq+cntq]
%else
%define MOVH movlps
movlps m5, [xq+cntq]
movlhps m5, m5
%endif
MOVH m7, [xq+cntq+8 ]
MOVH m1, [xq+cntq+16]
shufps m7, m7, q0110
shufps m1, m1, q0110
mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
mulps m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
mulps m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
movaps [rsp ], m3
movaps [rsp+16], m4
add cntq, 8
MOVH m2, [xq+cntq+16]
movlhps m7, m7
shufps m2, m2, q0110
mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
mulps m4, m7, m2
mulps m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
addps m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
align 16
.loop:
add cntq, 8
MOVH m0, [xq+cntq+16]
movlhps m1, m1
shufps m0, m0, q0110
mulps m3, m1, m2
mulps m4, m1, m0
mulps m1, m1
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
add cntq, 8
MOVH m1, [xq+cntq+16]
movlhps m2, m2
shufps m1, m1, q0110
mulps m3, m2, m0
mulps m4, m2, m1
mulps m2, m2
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
add cntq, 8
MOVH m2, [xq+cntq+16]
movlhps m0, m0
shufps m2, m2, q0110
mulps m3, m0, m1
mulps m4, m0, m2
mulps m0, m0
addps m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
addps m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
jl .loop
movlhps m1, m1
mulps m2, m1
mulps m1, m1
addps m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
addps m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
addps m6, [rsp ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
addps m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
xorps m2, [ps_mask3]
xorps m5, [ps_mask3]
xorps m6, [ps_mask3]
HADDPS m2, m5, m3
HADDPS m7, m6, m4
%if cpuflag(sse3)
movshdup m0, m1
%else
movss m0, m1
shufps m1, m1, q0001
%endif
addss m1, m0
movaps [phiq ], m2
movhps [phiq+0x18], m7
movss [phiq+0x28], m7
movss [phiq+0x10], m1
RET
%endmacro
INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE

View file

@@ -0,0 +1,87 @@
/*
* AAC Spectral Band Replication decoding functions
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbrdsp.h"
float ff_sbr_sum_square_sse(float (*x)[2], int n);
void ff_sbr_sum64x5_sse(float *z);
void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
const float alpha0[2], const float alpha1[2],
float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_pre_shuffle_sse2(float *z);
void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);
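/*
 * Editorial note (not in the original source): the blocks below are ordered
 * from weakest to strongest instruction set, so a later EXTERNAL_SSE2() or
 * EXTERNAL_SSE3() match simply overwrites pointers installed by the earlier
 * EXTERNAL_SSE() block; on an SSE3-capable CPU, for example, autocorrelate
 * ends up pointing at ff_sbr_autocorrelate_sse3.
 */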
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
s->neg_odd_64 = ff_sbr_neg_odd_64_sse;
s->sum_square = ff_sbr_sum_square_sse;
s->sum64x5 = ff_sbr_sum64x5_sse;
s->hf_g_filt = ff_sbr_hf_g_filt_sse;
s->hf_gen = ff_sbr_hf_gen_sse;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
s->qmf_deint_neg = ff_sbr_qmf_deint_neg_sse;
s->autocorrelate = ff_sbr_autocorrelate_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
}
if (EXTERNAL_SSE3(cpu_flags)) {
s->autocorrelate = ff_sbr_autocorrelate_sse3;
}
}

View file

@@ -0,0 +1,53 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_SIMPLE_IDCT_H
#define AVCODEC_X86_SIMPLE_IDCT_H
#include <stddef.h>
#include <stdint.h>
void ff_simple_idct_mmx(int16_t *block);
void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_sse2(int16_t *block);
void ff_simple_idct8_avx(int16_t *block);
void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct10_sse2(int16_t *block);
void ff_simple_idct10_avx(int16_t *block);
void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct12_sse2(int16_t *block);
void ff_simple_idct12_avx(int16_t *block);
void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
#endif /* AVCODEC_X86_SIMPLE_IDCT_H */

View file

@@ -0,0 +1,908 @@
/*
* MMX and SSE2 optimized snow DSP utils
* Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/snow.h"
#include "libavcodec/snow_dwt.h"
#if HAVE_INLINE_ASM
static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
// (the first time erroneously), we allow the SSE2 code to run an extra pass.
// The savings in code and time are well worth having to store this value and
// calculate b[0] correctly afterwards.
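        /*
         * Editorial note (not in the original source): the saved b_0 is
         * consumed right after the vectorized loop below, where
         *     b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
         * recomputes the first coefficient from its pre-loop value.
         */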
i = 0;
__asm__ volatile(
"pcmpeqd %%xmm7, %%xmm7 \n\t"
"pcmpeqd %%xmm3, %%xmm3 \n\t"
"psllw $1, %%xmm3 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"psllw $13, %%xmm3 \n\t"
::);
for(; i<w_l-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm5 \n\t"
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"paddw %%xmm5, %%xmm6 \n\t"
"paddw %%xmm7, %%xmm2 \n\t"
"paddw %%xmm7, %%xmm6 \n\t"
"pmulhw %%xmm3, %%xmm2 \n\t"
"pmulhw %%xmm3, %%xmm6 \n\t"
"paddw (%0), %%xmm2 \n\t"
"paddw 16(%0), %%xmm6 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm6, 16(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
}
{ // Lift 1
IDWTELEM * const dst = b+w2;
i = 0;
for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
dst[i] = dst[i] - (b[i] + b[i + 1]);
}
for(; i<w_r-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm5 \n\t"
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"paddw %%xmm5, %%xmm6 \n\t"
"movdqa (%0), %%xmm0 \n\t"
"movdqa 16(%0), %%xmm4 \n\t"
"psubw %%xmm2, %%xmm0 \n\t"
"psubw %%xmm6, %%xmm4 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm4, 16(%0) \n\t"
:: "r"(&dst[i]), "r"(&b[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
{ // Lift 2
IDWTELEM * const ref = b+w2 - 1;
IDWTELEM b_0 = b[0];
i = 0;
__asm__ volatile(
"psllw $15, %%xmm7 \n\t"
"pcmpeqw %%xmm6, %%xmm6 \n\t"
"psrlw $13, %%xmm6 \n\t"
"paddw %%xmm7, %%xmm6 \n\t"
::);
for(; i<w_l-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm0 \n\t"
"movdqu 16(%1), %%xmm4 \n\t"
"movdqu 2(%1), %%xmm1 \n\t"
"movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
"paddw %%xmm6, %%xmm0 \n\t"
"paddw %%xmm6, %%xmm4 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm5 \n\t"
"pavgw %%xmm1, %%xmm0 \n\t"
"pavgw %%xmm5, %%xmm4 \n\t"
"psubw %%xmm7, %%xmm0 \n\t"
"psubw %%xmm7, %%xmm4 \n\t"
"psraw $1, %%xmm0 \n\t"
"psraw $1, %%xmm4 \n\t"
"movdqa (%0), %%xmm1 \n\t"
"movdqa 16(%0), %%xmm5 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"psraw $2, %%xmm0 \n\t"
"psraw $2, %%xmm4 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm4, 16(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
}
{ // Lift 3
IDWTELEM * const src = b+w2;
i = 0;
for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
}
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw (%1), %%xmm2 \n\t"
"paddw 16(%1), %%xmm6 \n\t"
"movdqu (%0), %%xmm0 \n\t"
"movdqu 16(%0), %%xmm4 \n\t"
"paddw %%xmm2, %%xmm0 \n\t"
"paddw %%xmm6, %%xmm4 \n\t"
"psraw $1, %%xmm2 \n\t"
"psraw $1, %%xmm6 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"paddw %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm2, (%2) \n\t"
"movdqa %%xmm6, 16(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
}
{
snow_interleave_line_header(&i, width, b, temp);
for (; (i & 0x3E) != 0x3E; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=62; i>=0; i-=64){
__asm__ volatile(
"movdqa (%1), %%xmm0 \n\t"
"movdqa 16(%1), %%xmm2 \n\t"
"movdqa 32(%1), %%xmm4 \n\t"
"movdqa 48(%1), %%xmm6 \n\t"
"movdqa (%1), %%xmm1 \n\t"
"movdqa 16(%1), %%xmm3 \n\t"
"movdqa 32(%1), %%xmm5 \n\t"
"movdqa 48(%1), %%xmm7 \n\t"
"punpcklwd (%2), %%xmm0 \n\t"
"punpcklwd 16(%2), %%xmm2 \n\t"
"punpcklwd 32(%2), %%xmm4 \n\t"
"punpcklwd 48(%2), %%xmm6 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm2, 32(%0) \n\t"
"movdqa %%xmm4, 64(%0) \n\t"
"movdqa %%xmm6, 96(%0) \n\t"
"punpckhwd (%2), %%xmm1 \n\t"
"punpckhwd 16(%2), %%xmm3 \n\t"
"punpckhwd 32(%2), %%xmm5 \n\t"
"punpckhwd 48(%2), %%xmm7 \n\t"
"movdqa %%xmm1, 16(%0) \n\t"
"movdqa %%xmm3, 48(%0) \n\t"
"movdqa %%xmm5, 80(%0) \n\t"
"movdqa %%xmm7, 112(%0) \n\t"
:: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
: "memory"
);
}
}
}
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
i = 1;
b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"pcmpeqw %%mm3, %%mm3 \n\t"
"psllw $1, %%mm3 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"psllw $13, %%mm3 \n\t"
::);
for(; i<w_l-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm2 \n\t"
"movq 8(%1), %%mm6 \n\t"
"paddw 2(%1), %%mm2 \n\t"
"paddw 10(%1), %%mm6 \n\t"
"paddw %%mm7, %%mm2 \n\t"
"paddw %%mm7, %%mm6 \n\t"
"pmulhw %%mm3, %%mm2 \n\t"
"pmulhw %%mm3, %%mm6 \n\t"
"paddw (%0), %%mm2 \n\t"
"paddw 8(%0), %%mm6 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm6, 8(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
}
{ // Lift 1
IDWTELEM * const dst = b+w2;
i = 0;
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm2 \n\t"
"movq 8(%1), %%mm6 \n\t"
"paddw 2(%1), %%mm2 \n\t"
"paddw 10(%1), %%mm6 \n\t"
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm6, %%mm4 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm4, 8(%0) \n\t"
:: "r"(&dst[i]), "r"(&b[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
{ // Lift 2
IDWTELEM * const ref = b+w2 - 1;
i = 1;
b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
__asm__ volatile(
"psllw $15, %%mm7 \n\t"
"pcmpeqw %%mm6, %%mm6 \n\t"
"psrlw $13, %%mm6 \n\t"
"paddw %%mm7, %%mm6 \n\t"
::);
for(; i<w_l-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm4 \n\t"
"movq 2(%1), %%mm1 \n\t"
"movq 10(%1), %%mm5 \n\t"
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm6, %%mm4 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm5 \n\t"
"pavgw %%mm1, %%mm0 \n\t"
"pavgw %%mm5, %%mm4 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm4 \n\t"
"psraw $1, %%mm0 \n\t"
"psraw $1, %%mm4 \n\t"
"movq (%0), %%mm1 \n\t"
"movq 8(%0), %%mm5 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"psraw $2, %%mm0 \n\t"
"psraw $2, %%mm4 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm4, 8(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
}
{ // Lift 3
IDWTELEM * const src = b+w2;
i = 0;
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movq 2(%1), %%mm2 \n\t"
"movq 10(%1), %%mm6 \n\t"
"paddw (%1), %%mm2 \n\t"
"paddw 8(%1), %%mm6 \n\t"
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm6, %%mm4 \n\t"
"psraw $1, %%mm2 \n\t"
"psraw $1, %%mm6 \n\t"
"paddw %%mm0, %%mm2 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, (%2) \n\t"
"movq %%mm6, 8(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
}
{
snow_interleave_line_header(&i, width, b, temp);
for (; (i & 0x1E) != 0x1E; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=30; i>=0; i-=32){
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm6 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm5 \n\t"
"movq 24(%1), %%mm7 \n\t"
"punpcklwd (%2), %%mm0 \n\t"
"punpcklwd 8(%2), %%mm2 \n\t"
"punpcklwd 16(%2), %%mm4 \n\t"
"punpcklwd 24(%2), %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, 16(%0) \n\t"
"movq %%mm4, 32(%0) \n\t"
"movq %%mm6, 48(%0) \n\t"
"punpckhwd (%2), %%mm1 \n\t"
"punpckhwd 8(%2), %%mm3 \n\t"
"punpckhwd 16(%2), %%mm5 \n\t"
"punpckhwd 24(%2), %%mm7 \n\t"
"movq %%mm1, 8(%0) \n\t"
"movq %%mm3, 24(%0) \n\t"
"movq %%mm5, 40(%0) \n\t"
"movq %%mm7, 56(%0) \n\t"
:: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
: "memory"
);
}
}
}
#if HAVE_7REGS
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
"psubw %%"s0", %%"t0" \n\t"\
"psubw %%"s1", %%"t1" \n\t"\
"psubw %%"s2", %%"t2" \n\t"\
"psubw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
"movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\
"movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
"movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
"movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"
#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
"psraw $"n", %%"t0" \n\t"\
"psraw $"n", %%"t1" \n\t"\
"psraw $"n", %%"t2" \n\t"\
"psraw $"n", %%"t3" \n\t"
#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
"paddw %%"s0", %%"t0" \n\t"\
"paddw %%"s1", %%"t1" \n\t"\
"paddw %%"s2", %%"t2" \n\t"\
"paddw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
"pmulhw %%"s0", %%"t0" \n\t"\
"pmulhw %%"s1", %%"t1" \n\t"\
"pmulhw %%"s2", %%"t2" \n\t"\
"pmulhw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
"movdqa %%"s0", %%"t0" \n\t"\
"movdqa %%"s1", %%"t1" \n\t"\
"movdqa %%"s2", %%"t2" \n\t"\
"movdqa %%"s3", %%"t3" \n\t"
static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
x86_reg i = width;
while(i & 0x1F)
{
i--;
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
i+=i;
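    /*
     * Editorial note (not in the original source): i now holds the number of
     * elements left for the SIMD loop (a multiple of 32); doubling it turns
     * the element count into a byte offset, because IDWTELEM is 16 bits wide
     * and the inline asm below indexes b0..b5 through FF_REG_d in bytes,
     * stepping 64 bytes (32 elements) per pass.
     */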
__asm__ volatile (
"jmp 2f \n\t"
"1: \n\t"
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
"pcmpeqw %%xmm0, %%xmm0 \n\t"
"pcmpeqw %%xmm2, %%xmm2 \n\t"
"paddw %%xmm2, %%xmm2 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"psllw $13, %%xmm2 \n\t"
snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
"pcmpeqw %%xmm7, %%xmm7 \n\t"
"pcmpeqw %%xmm5, %%xmm5 \n\t"
"psllw $15, %%xmm7 \n\t"
"psrlw $13, %%xmm5 \n\t"
"paddw %%xmm7, %%xmm5 \n\t"
snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
"movq (%2,%%"FF_REG_d"), %%xmm1 \n\t"
"movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"pavgw %%xmm1, %%xmm0 \n\t"
"pavgw %%xmm3, %%xmm2 \n\t"
"movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
"movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"pavgw %%xmm1, %%xmm4 \n\t"
"pavgw %%xmm3, %%xmm6 \n\t"
snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
"2: \n\t"
"sub $64, %%"FF_REG_d" \n\t"
"jge 1b \n\t"
:"+d"(i)
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
"movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
"movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
"movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
"movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
"movq %%"s0", %%"t0" \n\t"\
"movq %%"s1", %%"t1" \n\t"\
"movq %%"s2", %%"t2" \n\t"\
"movq %%"s3", %%"t3" \n\t"
static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
x86_reg i = width;
while(i & 15)
{
i--;
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
i+=i;
__asm__ volatile(
"jmp 2f \n\t"
"1: \n\t"
snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
"pcmpeqw %%mm0, %%mm0 \n\t"
"pcmpeqw %%mm2, %%mm2 \n\t"
"paddw %%mm2, %%mm2 \n\t"
"paddw %%mm0, %%mm2 \n\t"
"psllw $13, %%mm2 \n\t"
snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
"pcmpeqw %%mm7, %%mm7 \n\t"
"pcmpeqw %%mm5, %%mm5 \n\t"
"psllw $15, %%mm7 \n\t"
"psrlw $13, %%mm5 \n\t"
"paddw %%mm7, %%mm5 \n\t"
snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
"movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
"movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"pavgw %%mm1, %%mm0 \n\t"
"pavgw %%mm3, %%mm2 \n\t"
"movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
"movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"pavgw %%mm1, %%mm4 \n\t"
"pavgw %%mm3, %%mm6 \n\t"
snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
"2: \n\t"
"sub $32, %%"FF_REG_d" \n\t"
"jge 1b \n\t"
:"+d"(i)
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#endif //HAVE_7REGS
#if HAVE_6REGS
#define snow_inner_add_yblock_sse2_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"FF_REG_c" \n\t"\
"mov %6, %2 \n\t"\
"mov %4, %%"FF_REG_S" \n\t"\
"pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
"pcmpeqd %%xmm3, %%xmm3 \n\t"\
"psllw $15, %%xmm3 \n\t"\
"psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
"1: \n\t"\
"mov %1, %%"FF_REG_D" \n\t"\
"mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
"add %3, %%"FF_REG_D" \n\t"
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
"movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
"movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\
"movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
"movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
"pmullw %%xmm0, %%"out_reg1" \n\t"\
"pmullw %%xmm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
"movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
"movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\
"movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
"movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
"pmullw %%xmm0, %%"out_reg1" \n\t"\
"pmullw %%xmm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
"paddusw %%xmm2, %%xmm1 \n\t"\
"paddusw %%xmm6, %%xmm5 \n\t"
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
"paddusw %%xmm2, %%xmm1 \n\t"\
"paddusw %%xmm6, %%xmm5 \n\t"
#define snow_inner_add_yblock_sse2_end_common1\
"add $32, %%"FF_REG_S" \n\t"\
"add %%"FF_REG_c", %0 \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", (%%"FF_REG_a") \n\t"
#define snow_inner_add_yblock_sse2_end_common2\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
"%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
#define snow_inner_add_yblock_sse2_end_8\
"sal $1, %%"FF_REG_c" \n\t"\
"add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
snow_inner_add_yblock_sse2_end_common1\
"sar $1, %%"FF_REG_c" \n\t"\
"sub $2, %2 \n\t"\
snow_inner_add_yblock_sse2_end_common2
#define snow_inner_add_yblock_sse2_end_16\
"add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
snow_inner_add_yblock_sse2_end_common1\
"dec %2 \n\t"\
snow_inner_add_yblock_sse2_end_common2
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")
"mov %0, %%"FF_REG_d" \n\t"
"movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
"movdqa %%xmm1, %%xmm2 \n\t"
"punpckhwd %%xmm7, %%xmm1 \n\t"
"punpcklwd %%xmm7, %%xmm2 \n\t"
"paddd %%xmm2, %%xmm0 \n\t"
"movdqa 16(%%"FF_REG_D"), %%xmm2\n\t"
"paddd %%xmm1, %%xmm2 \n\t"
"paddd %%xmm3, %%xmm0 \n\t"
"paddd %%xmm3, %%xmm2 \n\t"
"mov %1, %%"FF_REG_D" \n\t"
"mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
"add %3, %%"FF_REG_D" \n\t"
"movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
"movdqa %%xmm5, %%xmm6 \n\t"
"punpckhwd %%xmm7, %%xmm5 \n\t"
"punpcklwd %%xmm7, %%xmm6 \n\t"
"paddd %%xmm6, %%xmm4 \n\t"
"movdqa 16(%%"FF_REG_D"), %%xmm6\n\t"
"paddd %%xmm5, %%xmm6 \n\t"
"paddd %%xmm3, %%xmm4 \n\t"
"paddd %%xmm3, %%xmm6 \n\t"
"psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
"psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
"packssdw %%xmm2, %%xmm0 \n\t"
"packuswb %%xmm7, %%xmm0 \n\t"
"movq %%xmm0, (%%"FF_REG_d") \n\t"
"psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
"psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
"packssdw %%xmm6, %%xmm4 \n\t"
"packuswb %%xmm7, %%xmm4 \n\t"
"movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
snow_inner_add_yblock_sse2_end_8
}
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")
"mov %0, %%"FF_REG_d" \n\t"
"psrlw $4, %%xmm1 \n\t"
"psrlw $4, %%xmm5 \n\t"
"paddw (%%"FF_REG_D"), %%xmm1 \n\t"
"paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
"paddw %%xmm3, %%xmm1 \n\t"
"paddw %%xmm3, %%xmm5 \n\t"
"psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
"psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
"packuswb %%xmm5, %%xmm1 \n\t"
"movdqu %%xmm1, (%%"FF_REG_d") \n\t"
snow_inner_add_yblock_sse2_end_16
}
#define snow_inner_add_yblock_mmx_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"FF_REG_c" \n\t"\
"mov %6, %2 \n\t"\
"mov %4, %%"FF_REG_S" \n\t"\
"pxor %%mm7, %%mm7 \n\t" /* 0 */\
"pcmpeqd %%mm3, %%mm3 \n\t"\
"psllw $15, %%mm3 \n\t"\
"psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
"1: \n\t"\
"mov %1, %%"FF_REG_D" \n\t"\
"mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
"add %3, %%"FF_REG_D" \n\t"
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
"movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
"movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
"punpcklbw %%mm7, %%"out_reg1" \n\t"\
"punpcklbw %%mm7, %%"out_reg2" \n\t"\
"movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
"movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"pmullw %%mm0, %%"out_reg1" \n\t"\
"pmullw %%mm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
"paddusw %%mm2, %%mm1 \n\t"\
"paddusw %%mm6, %%mm5 \n\t"
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
"mov %0, %%"FF_REG_d" \n\t"\
"psrlw $4, %%mm1 \n\t"\
"psrlw $4, %%mm5 \n\t"\
"paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
"paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psraw $4, %%mm1 \n\t"\
"psraw $4, %%mm5 \n\t"\
"packuswb %%mm5, %%mm1 \n\t"\
"movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"
#define snow_inner_add_yblock_mmx_end(s_step)\
"add $"s_step", %%"FF_REG_S" \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
"add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
"add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
"add %%"FF_REG_c", %0 \n\t"\
"dec %2 \n\t"\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
"%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("16", "8")
snow_inner_add_yblock_mmx_end("32")
}
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else if (b_w == 8 && obmc_stride == 16) {
if (!(b_h & 1))
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
} else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else if (b_w == 8 && obmc_stride == 16)
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
#endif /* HAVE_6REGS */
#endif /* HAVE_INLINE_ASM */
av_cold void ff_dwt_init_x86(SnowDWTContext *c)
{
#if HAVE_INLINE_ASM
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
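        /*
         * Editorial note (not in the original source): the "& 0" above makes
         * this condition always false, so the SSE2 compose and
         * inner_add_yblock paths below are kept compilable but deliberately
         * never selected at runtime; only the MMX/MMXEXT branch is reachable.
         */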
c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
#if HAVE_6REGS
c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
#endif
}
else{
if (mm_flags & AV_CPU_FLAG_MMXEXT) {
c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
}
#if HAVE_6REGS
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
#endif
}
}
#endif /* HAVE_INLINE_ASM */
}

View file

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2007 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/svq1enc.h"
int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
intptr_t size);
int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
intptr_t size);
av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
}
}

View file

@@ -0,0 +1,74 @@
/*
* Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/synth_filter.h"
#define SYNTH_FILTER_FUNC(opt) \
void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
const float window[512], \
float out[32], intptr_t offset, float scale); \
static void synth_filter_##opt(FFTContext *imdct, \
float *synth_buf_ptr, int *synth_buf_offset, \
float synth_buf2[32], const float window[512], \
float out[32], const float in[32], float scale) \
{ \
float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
\
imdct->imdct_half(imdct, synth_buf, in); \
\
ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
out, *synth_buf_offset, scale); \
\
*synth_buf_offset = (*synth_buf_offset - 32) & 511; \
} \
#if HAVE_X86ASM
#if ARCH_X86_32
SYNTH_FILTER_FUNC(sse)
#endif
SYNTH_FILTER_FUNC(sse2)
SYNTH_FILTER_FUNC(avx)
SYNTH_FILTER_FUNC(fma3)
#endif /* HAVE_X86ASM */
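/*
 * Editorial note (not in the original source): each synth_filter_<opt>
 * wrapper generated above does three things: imdct_half() writes 32 new
 * samples into the ring buffer at synth_buf_ptr + *synth_buf_offset,
 * ff_synth_filter_inner_<opt>() windows the 512-sample history to produce
 * the 32 output samples, and the offset is then stepped back by 32 modulo
 * 512 so the buffer acts as a circular history.
 */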
av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags)) {
s->synth_filter_float = synth_filter_sse;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
s->synth_filter_float = synth_filter_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
s->synth_filter_float = synth_filter_avx;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
s->synth_filter_float = synth_filter_fma3;
}
#endif /* HAVE_X86ASM */
}

View file

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2015 Paul B Mahol
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/takdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
c->decorrelate_ls = ff_tak_decorrelate_ls_sse2;
c->decorrelate_sr = ff_tak_decorrelate_sr_sse2;
c->decorrelate_sm = ff_tak_decorrelate_sm_sse2;
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
}
#endif
}

View file

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/ttadsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
void ff_tta_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
int32_t *error, int32_t *in, int32_t shift,
int32_t round);
void ff_tta_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
int32_t *error, int32_t *in, int32_t shift,
int32_t round);
av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSSE3(cpu_flags))
c->filter_process = ff_tta_filter_process_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
c->filter_process = ff_tta_filter_process_sse4;
#endif
}

View file

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2014-2016 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/ttaencdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
void ff_ttaenc_filter_process_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
int32_t *error, int32_t *in, int32_t shift,
int32_t round);
void ff_ttaenc_filter_process_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
int32_t *error, int32_t *in, int32_t shift,
int32_t round);
av_cold void ff_ttaencdsp_init_x86(TTAEncDSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSSE3(cpu_flags))
c->filter_process = ff_ttaenc_filter_process_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
c->filter_process = ff_ttaenc_filter_process_sse4;
#endif
}

View file

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2017 Paul B Mahol
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/utvideodsp.h"
void ff_restore_rgb_planes_sse2(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
ptrdiff_t linesize_r, ptrdiff_t linesize_g,
ptrdiff_t linesize_b, int width, int height);
void ff_restore_rgb_planes_avx2(uint8_t *src_r, uint8_t *src_g, uint8_t *src_b,
ptrdiff_t linesize_r, ptrdiff_t linesize_g,
ptrdiff_t linesize_b, int width, int height);
void ff_restore_rgb_planes10_sse2(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
ptrdiff_t linesize_r, ptrdiff_t linesize_g,
ptrdiff_t linesize_b, int width, int height);
void ff_restore_rgb_planes10_avx2(uint16_t *src_r, uint16_t *src_g, uint16_t *src_b,
ptrdiff_t linesize_r, ptrdiff_t linesize_g,
ptrdiff_t linesize_b, int width, int height);
av_cold void ff_utvideodsp_init_x86(UTVideoDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
c->restore_rgb_planes = ff_restore_rgb_planes_sse2;
c->restore_rgb_planes10 = ff_restore_rgb_planes10_sse2;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->restore_rgb_planes = ff_restore_rgb_planes_avx2;
c->restore_rgb_planes10 = ff_restore_rgb_planes10_avx2;
}
}

View file

@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavcodec/v210dec.h"
extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
av_cold void ff_v210_x86_init(V210DecContext *s)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (s->aligned_input) {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
}
else {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
}
#endif
}

View file

@ -0,0 +1,54 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/cpu.h"
#include "libavcodec/v210enc.h"
void ff_v210_planar_pack_8_ssse3(const uint8_t *y, const uint8_t *u,
const uint8_t *v, uint8_t *dst,
ptrdiff_t width);
void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u,
const uint8_t *v, uint8_t *dst, ptrdiff_t width);
void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
const uint8_t *v, uint8_t *dst, ptrdiff_t width);
void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
const uint16_t *v, uint8_t *dst,
ptrdiff_t width);
void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
const uint16_t *v, uint8_t *dst,
ptrdiff_t width);
av_cold void ff_v210enc_init_x86(V210EncContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSSE3(cpu_flags)) {
s->pack_line_8 = ff_v210_planar_pack_8_ssse3;
s->pack_line_10 = ff_v210_planar_pack_10_ssse3;
}
if (EXTERNAL_AVX(cpu_flags))
s->pack_line_8 = ff_v210_planar_pack_8_avx;
if (EXTERNAL_AVX2(cpu_flags)) {
s->sample_factor_8 = 2;
s->pack_line_8 = ff_v210_planar_pack_8_avx2;
s->sample_factor_10 = 2;
s->pack_line_10 = ff_v210_planar_pack_10_avx2;
}
}

View file

@ -0,0 +1,29 @@
/*
* VC-1 and WMV3 decoder - X86 DSP init functions
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VC1DSP_H
#define AVCODEC_X86_VC1DSP_H
#include "libavcodec/vc1dsp.h"
void ff_vc1dsp_init_mmx(VC1DSPContext *dsp);
void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp);
#endif /* AVCODEC_X86_VC1DSP_H */

View file

@ -0,0 +1,168 @@
/*
* VC-1 and WMV3 - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/vc1dsp.h"
#include "fpel.h"
#include "vc1dsp.h"
#include "config.h"
#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \
ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \
ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}
#if HAVE_X86ASM
LOOP_FILTER(mmxext)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)
void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);
static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
ff_vc1_h_loop_filter8_sse4(src, stride, pq);
ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
#define DECLARE_FUNCTION(OP, DEPTH, INSN) \
static void OP##vc1_mspel_mc00_##DEPTH##INSN(uint8_t *dst, \
const uint8_t *src, ptrdiff_t stride, int rnd) \
{ \
ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH); \
}
DECLARE_FUNCTION(put_, 8, _mmx)
DECLARE_FUNCTION(put_, 16, _mmx)
DECLARE_FUNCTION(avg_, 8, _mmx)
DECLARE_FUNCTION(avg_, 16, _mmx)
DECLARE_FUNCTION(avg_, 8, _mmxext)
DECLARE_FUNCTION(avg_, 16, _mmxext)
DECLARE_FUNCTION(put_, 16, _sse2)
DECLARE_FUNCTION(avg_, 16, _sse2)
#endif /* HAVE_X86ASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int h, int x, int y);
void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
int16_t *block);
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (HAVE_6REGS && INLINE_MMX(cpu_flags))
if (EXTERNAL_MMX(cpu_flags))
ff_vc1dsp_init_mmx(dsp);
if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags))
if (EXTERNAL_MMXEXT(cpu_flags))
ff_vc1dsp_init_mmxext(dsp);
#define ASSIGN_LF(EXT) \
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
#if HAVE_X86ASM
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_mmx;
dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_mmx;
dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmx;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_LF(mmxext);
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext;
dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext;
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext;
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_sse2;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_LF(ssse3);
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
}
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,486 @@
/*
* VC-1 and WMV3 - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "fpel.h"
#include "vc1dsp.h"
#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, x86_reg stride,
int rnd, int64_t shift);
void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
const int16_t *src, int rnd);
void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
const int16_t *src, int rnd);
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
/** Add rounder from mm7 to mm3 and pack result at destination */
#define NORMALIZE_MMX(SHIFT) \
"paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
"paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
"psraw "SHIFT", %%mm3 \n\t" \
"psraw "SHIFT", %%mm4 \n\t"
#define TRANSFER_DO_PACK(OP) \
"packuswb %%mm4, %%mm3 \n\t" \
OP((%2), %%mm3) \
"movq %%mm3, (%2) \n\t"
#define TRANSFER_DONT_PACK(OP) \
OP(0(%2), %%mm3) \
OP(8(%2), %%mm4) \
"movq %%mm3, 0(%2) \n\t" \
"movq %%mm4, 8(%2) \n\t"
/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)
/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
"movd "ROUND", %%mm7 \n\t" \
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t"
/**
* Purely vertical or horizontal 1/2 shift interpolation.
* Sacrifice mm6 for *9 factor.
*/
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
x86_reg stride, int rnd, x86_reg offset)\
{\
rnd = 8-rnd;\
__asm__ volatile(\
"mov $8, %%"FF_REG_c" \n\t"\
LOAD_ROUNDER_MMX("%5")\
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
"1: \n\t"\
"movd 0(%0 ), %%mm3 \n\t"\
"movd 4(%0 ), %%mm4 \n\t"\
"movd 0(%0,%2), %%mm1 \n\t"\
"movd 4(%0,%2), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm0, %%mm3 \n\t"\
"punpcklbw %%mm0, %%mm4 \n\t"\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"paddw %%mm1, %%mm3 \n\t"\
"paddw %%mm2, %%mm4 \n\t"\
"movd 0(%0,%3), %%mm1 \n\t"\
"movd 4(%0,%3), %%mm2 \n\t"\
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
"movd 0(%0,%2), %%mm1 \n\t"\
"movd 4(%0,%2), %%mm2 \n\t"\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
NORMALIZE_MMX("$4")\
"packuswb %%mm4, %%mm3 \n\t"\
OP((%1), %%mm3)\
"movq %%mm3, (%1) \n\t"\
"add %6, %0 \n\t"\
"add %4, %1 \n\t"\
"dec %%"FF_REG_c" \n\t"\
"jnz 1b \n\t"\
: "+r"(src), "+r"(dst)\
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
"g"(stride-offset)\
NAMED_CONSTRAINTS_ADD(ff_pw_9)\
: "%"FF_REG_c, "memory"\
);\
}
VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
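/*
 * For reference, a scalar sketch of what the VC1_SHIFT2 asm above computes
 * (illustration only, not FFmpeg code; the function name is made up). Each
 * output pixel is the VC-1 half-pel filter (-1, 9, 9, -1)/16 applied along
 * one axis: offset == 1 gives the horizontal variant, offset == stride the
 * vertical one. The "+ 8 - rnd" term mirrors the "rnd = 8-rnd" bias set up
 * before the asm loop, and the clamp mirrors the packuswb saturation.
 */
static inline void vc1_shift2_scalar_sketch(uint8_t *dst, const uint8_t *src,
                                            ptrdiff_t stride, int rnd,
                                            ptrdiff_t offset)
{
    int x, y;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            /* the caller guarantees the margin samples exist, as for the asm */
            int p = (-src[x - offset] + 9 * src[x] + 9 * src[x + offset]
                     - src[x + 2 * offset] + 8 - rnd) >> 4;
            dst[x] = p < 0 ? 0 : p > 255 ? 255 : p;
        }
        src += stride;
        dst += stride;
    }
}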
/**
* Core of the 1/4 and 3/4 shift bicubic interpolation.
*
* @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
* @param MOVQ "movd 1" for packed 8-bit input, or "movq 2" when the data read is already unpacked to 16 bits.
* @param A1 Address of 1st tap (beware of unpacked/packed).
* @param A2 Address of 2nd tap
* @param A3 Address of 3rd tap
* @param A4 Address of 4th tap
*/
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
MOVQ "*0+"A1", %%mm1 \n\t" \
MOVQ "*4+"A1", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
"pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
MOVQ "*0+"A2", %%mm3 \n\t" \
MOVQ "*4+"A2", %%mm4 \n\t" \
UNPACK("%%mm3") \
UNPACK("%%mm4") \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
"psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
"psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
MOVQ "*0+"A4", %%mm1 \n\t" \
MOVQ "*4+"A4", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psllw $2, %%mm2 \n\t" /* 4* */ \
"psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
"psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
MOVQ "*0+"A3", %%mm1 \n\t" \
MOVQ "*4+"A3", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
"paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
"paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
/**
* Macro to build the vertical 16-bit version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use
* %3 (src_stride) and %4 (3*src_stride).
*
* @param NAME Either shift1 or shift3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
x86_reg src_stride, \
int rnd, int64_t shift) \
{ \
int h = 8; \
src -= src_stride; \
__asm__ volatile( \
LOAD_ROUNDER_MMX("%5") \
"movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
"movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("%6") \
TRANSFER_DONT_PACK(OP_PUT) \
/* Last 3 (in fact 4) bytes on the line */ \
"movd 8+"A1", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"movq %%mm1, %%mm3 \n\t" \
"paddw %%mm1, %%mm1 \n\t" \
"paddw %%mm3, %%mm1 \n\t" /* 3* */ \
"movd 8+"A2", %%mm3 \n\t" \
DO_UNPACK("%%mm3") \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
"movd 8+"A3", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
"movd 8+"A4", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psubw %%mm1, %%mm3 \n\t" \
"paddw %%mm7, %%mm3 \n\t" \
"psraw %6, %%mm3 \n\t" \
"movq %%mm3, 16(%2) \n\t" \
"add %3, %1 \n\t" \
"add $24, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(src_stride), "r"(3*src_stride), \
"m"(rnd), "m"(shift) \
NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
: "memory" \
); \
}
/**
* Macro to build the horizontal 16-bit version of vc1_put_shift[13].
* Here the input samples are 16 bits wide, so A1 to A4 should be simple constant offsets.
*
* @param NAME Either shift1 or shift3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
const int16_t *src, int rnd) \
{ \
int h = 8; \
src -= 1; \
rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
__asm__ volatile( \
LOAD_ROUNDER_MMX("%4") \
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
NORMALIZE_MMX("$7") \
/* Remove bias */ \
"paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
"paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
TRANSFER_DO_PACK(OP) \
"add $24, %1 \n\t" \
"add %3, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(stride), "m"(rnd) \
NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
: "memory" \
); \
}
/**
* Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use
* %3 (offset) and %4 (3*offset).
*
* @param NAME Either shift1 or shift3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
x86_reg stride, int rnd, x86_reg offset) \
{ \
int h = 8; \
src -= offset; \
rnd = 32-rnd; \
__asm__ volatile ( \
LOAD_ROUNDER_MMX("%6") \
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("$6") \
TRANSFER_DO_PACK(OP) \
"add %5, %1 \n\t" \
"add %5, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
: "memory" \
); \
}
/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
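/*
 * Scalar model of MSPEL_FILTER13_CORE as used by the shift1/shift3 builders
 * above (illustration only, not FFmpeg code): one intermediate sample of the
 * VC-1 quarter-pel bicubic filter, before the rounding, shifting and clipping
 * done by NORMALIZE_MMX/TRANSFER_*. t1..t4 correspond to the taps addressed
 * through A1..A4; the shift1 and shift3 variants differ only in which
 * neighbours feed which tap.
 */
static inline int vc1_bicubic_core_sketch(int t1, int t2, int t3, int t4)
{
    return -3 * t1 + 18 * t2 + 53 * t3 - 4 * t4; /* weights sum to 64 */
}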
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
/**
* Interpolate fractional pel values by applying proper vertical then
* horizontal filter.
*
* @param dst Destination buffer for interpolated pels.
* @param src Source buffer.
* @param stride Stride for both src and dst buffers.
* @param hmode Horizontal filter (expressed in quarter pixels shift).
* @param vmode Vertical filter.
* @param rnd Rounding bias.
*/
#define VC1_MSPEL_MC(OP, INSTR)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
int hmode, int vmode, int rnd)\
{\
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
{ NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
{ NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
__asm__ volatile(\
"pxor %%mm0, %%mm0 \n\t"\
::: "memory"\
);\
\
if (vmode) { /* Vertical filter to apply */\
if (hmode) { /* Horizontal filter to apply, output to tmp */\
static const int shift_value[] = { 0, 5, 1, 5 };\
int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
int r;\
LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\
\
r = (1<<(shift-1)) + rnd-1;\
vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
return;\
}\
else { /* No horizontal filter, output 8 lines to dst */\
vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
return;\
}\
}\
\
/* Horizontal mode with no vertical mode */\
vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
} \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
int stride, int hmode, int vmode, int rnd)\
{ \
OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
dst += 8*stride; src += 8*stride; \
OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}
VC1_MSPEL_MC(put_, mmx)
VC1_MSPEL_MC(avg_, mmxext)
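/*
 * A condensed restatement of what VC1_MSPEL_MC expands to (a readability
 * sketch, not used anywhere; "ver", "hor" and "filt8" stand for the entries
 * the macro picks from its per-shift tables). Mixed fractions are handled as
 * a separable filter: a vertical pass into a 16-bit temporary buffer, then a
 * horizontal pass that packs back to 8 bits; a pure vertical or horizontal
 * fraction needs only a single 8-bit pass.
 */
static inline void vc1_mspel_mc_sketch(uint8_t *dst, const uint8_t *src,
                                       int stride, int hmode, int vmode,
                                       int rnd,
                                       vc1_mspel_mc_filter_ver_16bits ver,
                                       vc1_mspel_mc_filter_hor_16bits hor,
                                       vc1_mspel_mc_filter_8bits filt8)
{
    if (vmode && hmode) {
        static const int shift_value[] = { 0, 5, 1, 5 };
        int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
        LOCAL_ALIGNED(16, int16_t, tmp, [12 * 8]);

        /* vertical pass into 16-bit intermediates, then horizontal pack */
        ver(tmp, src - 1, stride, (1 << (shift - 1)) + rnd - 1, shift);
        hor(dst, stride, tmp + 1, 64 - rnd);
    } else if (vmode) {
        filt8(dst, src, stride, 1 - rnd, stride); /* vertical only */
    } else {
        filt8(dst, src, stride, rnd, 1);          /* horizontal only */
    }
}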
/** Macro to ease declaring the bicubic filter interpolation functions */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
const uint8_t *src,\
ptrdiff_t stride, \
int rnd) \
{ \
avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)
DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)
DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)
DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
#define FN_ASSIGN(OP, X, Y, INSN) \
dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
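/*
 * Usage sketch (not FFmpeg code): how the tables filled via FN_ASSIGN in the
 * init functions below are meant to be indexed by the decoder. hmode and
 * vmode are the quarter-pel fractions of the motion vector (0..3); the index
 * layout is hmode + 4*vmode, with [0] holding the 16x16 functions and [1]
 * the 8x8 ones.
 */
static inline void put_luma_16x16_sketch(const VC1DSPContext *dsp,
                                         uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride,
                                         int hmode, int vmode, int rnd)
{
    dsp->put_vc1_mspel_pixels_tab[0][hmode + 4 * vmode](dst, src, stride, rnd);
}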
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
FN_ASSIGN(put_, 0, 1, _mmx);
FN_ASSIGN(put_, 0, 2, _mmx);
FN_ASSIGN(put_, 0, 3, _mmx);
FN_ASSIGN(put_, 1, 0, _mmx);
FN_ASSIGN(put_, 1, 1, _mmx);
FN_ASSIGN(put_, 1, 2, _mmx);
FN_ASSIGN(put_, 1, 3, _mmx);
FN_ASSIGN(put_, 2, 0, _mmx);
FN_ASSIGN(put_, 2, 1, _mmx);
FN_ASSIGN(put_, 2, 2, _mmx);
FN_ASSIGN(put_, 2, 3, _mmx);
FN_ASSIGN(put_, 3, 0, _mmx);
FN_ASSIGN(put_, 3, 1, _mmx);
FN_ASSIGN(put_, 3, 2, _mmx);
FN_ASSIGN(put_, 3, 3, _mmx);
}
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
FN_ASSIGN(avg_, 0, 1, _mmxext);
FN_ASSIGN(avg_, 0, 2, _mmxext);
FN_ASSIGN(avg_, 0, 3, _mmxext);
FN_ASSIGN(avg_, 1, 0, _mmxext);
FN_ASSIGN(avg_, 1, 1, _mmxext);
FN_ASSIGN(avg_, 1, 2, _mmxext);
FN_ASSIGN(avg_, 1, 3, _mmxext);
FN_ASSIGN(avg_, 2, 0, _mmxext);
FN_ASSIGN(avg_, 2, 1, _mmxext);
FN_ASSIGN(avg_, 2, 2, _mmxext);
FN_ASSIGN(avg_, 2, 3, _mmxext);
FN_ASSIGN(avg_, 3, 0, _mmxext);
FN_ASSIGN(avg_, 3, 1, _mmxext);
FN_ASSIGN(avg_, 3, 2, _mmxext);
FN_ASSIGN(avg_, 3, 3, _mmxext);
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */

View file

@ -0,0 +1,309 @@
/*
* Copyright (C) 2002-2012 Michael Niedermayer
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/videodsp.h"
#if HAVE_X86ASM
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh);
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh,
x86_reg w);
extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
&ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx,
&ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx,
&ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx,
&ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
&ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
&ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
&ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
&ff_emu_edge_vfix22_mmx
};
#endif
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
static emu_edge_vfix_func * const vfixtbl_sse[22] = {
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
ff_emu_edge_vfix22_sse
};
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
x86_reg start_x, x86_reg bh);
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
x86_reg start_x, x86_reg n_words, x86_reg bh);
extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
};
#endif
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
#if HAVE_AVX2_EXTERNAL
extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
#endif
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
ptrdiff_t dst_stride,
ptrdiff_t src_stride,
x86_reg block_w, x86_reg block_h,
x86_reg src_x, x86_reg src_y,
x86_reg w, x86_reg h,
emu_edge_vfix_func * const *vfix_tbl,
emu_edge_vvar_func *v_extend_var,
emu_edge_hfix_func * const *hfix_tbl,
emu_edge_hvar_func *h_extend_var)
{
x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
if (!w || !h)
return;
av_assert2(block_w <= FFABS(dst_stride));
if (src_y >= h) {
src -= src_y*src_stride;
src_y_add = h - 1;
src_y = h - 1;
} else if (src_y <= -block_h) {
src -= src_y*src_stride;
src_y_add = 1 - block_h;
src_y = 1 - block_h;
}
if (src_x >= w) {
src += w - 1 - src_x;
src_x = w - 1;
} else if (src_x <= -block_w) {
src += 1 - block_w - src_x;
src_x = 1 - block_w;
}
start_y = FFMAX(0, -src_y);
start_x = FFMAX(0, -src_x);
end_y = FFMIN(block_h, h-src_y);
end_x = FFMIN(block_w, w-src_x);
av_assert2(start_x < end_x && block_w > 0);
av_assert2(start_y < end_y && block_h > 0);
// fill in the to-be-copied part plus all above/below
src += (src_y_add + start_y) * src_stride + start_x;
w = end_x - start_x;
if (w <= 22) {
vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h);
} else {
v_extend_var(dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h, w);
}
// fill left
if (start_x) {
if (start_x <= 22) {
hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
} else {
h_extend_var(dst, dst_stride,
start_x, (start_x + 1) >> 1, block_h);
}
}
// fill right
p = block_w - end_x;
if (p) {
if (p <= 22) {
hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
-!(p & 1), block_h);
} else {
h_extend_var(dst + end_x - (p & 1), dst_stride,
-!(p & 1), (p + 1) >> 1, block_h);
}
}
}
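/*
 * A rough scalar model of what emulated_edge_mc() above produces (for
 * illustration only; not part of FFmpeg and much slower than the fixed-width
 * asm kernels it dispatches to). Every destination pixel is the source-frame
 * pixel at the clamped coordinate, i.e. samples outside the w x h frame are
 * replaced by the nearest edge sample. As in the function above, "src" points
 * at the (possibly out-of-frame) block origin.
 */
static inline void emulated_edge_mc_scalar_sketch(uint8_t *dst,
                                                  const uint8_t *src,
                                                  ptrdiff_t dst_stride,
                                                  ptrdiff_t src_stride,
                                                  int block_w, int block_h,
                                                  int src_x, int src_y,
                                                  int w, int h)
{
    /* recover the frame origin from the block-relative src pointer */
    const uint8_t *frame = src - src_y * src_stride - src_x;
    int x, y;

    for (y = 0; y < block_h; y++)
        for (x = 0; x < block_w; x++) {
            int sx = av_clip(src_x + x, 0, w - 1);
            int sy = av_clip(src_y + y, 0, h - 1);
            dst[y * dst_stride + x] = frame[sy * src_stride + sx];
        }
}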
#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
#endif
static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w,
int h)
{
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
}
#if HAVE_AVX2_EXTERNAL
static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w,
int h)
{
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
}
#endif /* HAVE_AVX2_EXTERNAL */
#endif /* HAVE_X86ASM */
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
ctx->prefetch = ff_prefetch_3dnow;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags)) {
ctx->prefetch = ff_prefetch_mmxext;
}
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse2;
}
#if HAVE_AVX2_EXTERNAL
if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_avx2;
}
#endif
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,42 @@
/*
* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vorbisdsp.h"
void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
intptr_t blocksize);
void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
intptr_t blocksize);
av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags))
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags))
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
}

View file

@ -0,0 +1,71 @@
/*
* Copyright (c) 2009 David Conrad <lessen42@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
void ff_vp3_idct_put_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
const uint8_t *b, ptrdiff_t stride,
int h);
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
#if ARCH_X86_32
c->idct_put = ff_vp3_idct_put_mmx;
c->idct_add = ff_vp3_idct_add_mmx;
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_sse2;
c->idct_add = ff_vp3_idct_add_sse2;
}
}

View file

@ -0,0 +1,51 @@
/**
* VP5 and VP6 compatible video decoder (arith decoder)
*
* Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2010 Eli Friedman
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VP56_ARITH_H
#define AVCODEC_X86_VP56_ARITH_H
#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
#define vp56_rac_get_prob vp56_rac_get_prob
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
unsigned int code_word = vp56_rac_renorm(c);
unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
unsigned int low_shift = low << 16;
int bit = 0;
c->code_word = code_word;
__asm__(
"subl %4, %1 \n\t"
"subl %3, %2 \n\t"
"setae %b0 \n\t"
"cmovb %4, %1 \n\t"
"cmovb %5, %2 \n\t"
: "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
: "r"(low_shift), "r"(low), "r"(code_word)
);
return bit;
}
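/*
 * Plain-C equivalent of the cmov-based asm above (an illustrative sketch; the
 * _sketch name is not part of FFmpeg, and VP56RangeCoder/vp56_rac_renorm()
 * come from vp56.h, which includes this header). The range coder splits the
 * current range "high" at "low" according to prob; the decoded bit says which
 * half code_word falls into, and range/offset are updated accordingly. The
 * asm only makes that selection branchless with setae/cmovb.
 */
static av_always_inline int vp56_rac_get_prob_sketch(VP56RangeCoder *c, uint8_t prob)
{
    unsigned int code_word = vp56_rac_renorm(c);
    unsigned int low       = 1 + (((c->high - 1) * prob) >> 8);
    unsigned int low_shift = low << 16;
    int bit = code_word >= low_shift;

    c->high      = bit ? c->high - low : low;
    c->code_word = bit ? code_word - low_shift : code_word;

    return bit;
}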
#endif
#endif /* AVCODEC_X86_VP56_ARITH_H */

View file

@ -0,0 +1,45 @@
/*
* VP6 MMX/SSE2 optimizations
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp56dsp.h"
void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
const int16_t *h_weights,const int16_t *v_weights);
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
const int16_t *h_weights,const int16_t *v_weights);
av_cold void ff_vp6dsp_init_x86(VP56DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
}
}

View file

@ -0,0 +1,467 @@
/*
* VP8 DSP functions x86-optimized
* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp8dsp.h"
#if HAVE_X86ASM
/*
* MC functions
*/
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
#if ARCH_X86_32
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
#endif
TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v)
TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
src -= srcstride * (TAPNUMY / 2 - 1); \
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
dst, dststride, tmpptr, SIZE, height, mx, my); \
}
#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16)
HVTAP(mmxext, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8)
#endif
HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2, 16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)
HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
tmp, SIZE, src, srcstride, height + 1, mx, my); \
ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
HVBILIN(mmxext, 8, 4, 8)
#if ARCH_X86_32
HVBILIN(mmxext, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16)
#endif
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16)
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
ptrdiff_t stride);
void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
#define DECLARE_LOOP_FILTER(NAME) \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int flim); \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int flim); \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt);
DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
#endif /* HAVE_X86ASM */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
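/* ff_vp78dsp_init_x86() sets up the MC/bilinear functions shared by the
 * VP7 and VP8 decoders; ff_vp8dsp_init_x86() below sets up the VP8-only
 * IDCT and loop-filter functions. */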
av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
#if ARCH_X86_32
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
c->put_vp8_epel_pixels_tab[1][0][0] =
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
}
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_MMXEXT(cpu_flags)) {
VP8_MC_FUNC(2, 4, mmxext);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
#if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
#endif
}
if (EXTERNAL_SSE(cpu_flags)) {
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, ssse3);
VP8_MC_FUNC(1, 8, ssse3);
VP8_MC_FUNC(2, 4, ssse3);
VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
}
#endif /* HAVE_X86ASM */
}
av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
#endif
}
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
#endif
}
if (EXTERNAL_SSE(cpu_flags)) {
c->vp8_idct_add = ff_vp8_idct_add_sse;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
}
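/* The vertical loop filters work on whole rows and are still a win on
 * CPUs flagged as "SSE2 slow"; the horizontal variants (which need
 * transposes) and the IDCT helpers require full SSE2 below. */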
if (EXTERNAL_SSE2(cpu_flags) || EXTERNAL_SSE2_SLOW(cpu_flags)) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
}
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,416 @@
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp9dsp.h"
#include "libavcodec/x86/vp9dsp_init.h"
#if HAVE_X86ASM
decl_fpel_func(put, 4, , mmx);
decl_fpel_func(put, 8, , mmx);
decl_fpel_func(put, 16, , sse);
decl_fpel_func(put, 32, , sse);
decl_fpel_func(put, 64, , sse);
decl_fpel_func(avg, 4, _8, mmxext);
decl_fpel_func(avg, 8, _8, mmxext);
decl_fpel_func(avg, 16, _8, sse2);
decl_fpel_func(avg, 32, _8, sse2);
decl_fpel_func(avg, 64, _8, sse2);
decl_fpel_func(put, 32, , avx);
decl_fpel_func(put, 64, , avx);
decl_fpel_func(avg, 32, _8, avx2);
decl_fpel_func(avg, 64, _8, avx2);
decl_mc_funcs(4, mmxext, int16_t, 8, 8);
decl_mc_funcs(8, sse2, int16_t, 8, 8);
decl_mc_funcs(4, ssse3, int8_t, 32, 8);
decl_mc_funcs(8, ssse3, int8_t, 32, 8);
#if ARCH_X86_64
decl_mc_funcs(16, ssse3, int8_t, 32, 8);
decl_mc_funcs(32, avx2, int8_t, 32, 8);
#endif
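/* mc_rep_funcs (vp9dsp_init.h) builds each wider MC function from two
 * calls to the half-width one, so only the narrow kernels need
 * hand-written asm per instruction set. */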
mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8)
#if ARCH_X86_32
mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8)
#endif
mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8)
mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8)
mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8)
mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8)
#endif
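/* 8-tap coefficient tables exported by the asm: 3 filter families
 * (regular, sharp, smooth) x 15 sub-pel positions, laid out as the
 * SSSE3 and SSE2 kernels respectively expect. */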
extern const int8_t ff_filters_ssse3[3][15][4][32];
extern const int16_t ff_filters_sse2[3][15][8][8];
filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
#endif
filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
#endif
#define itxfm_func(typea, typeb, size, opt) \
void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
int16_t *block, int eob)
#define itxfm_funcs(size, opt) \
itxfm_func(idct, idct, size, opt); \
itxfm_func(iadst, idct, size, opt); \
itxfm_func(idct, iadst, size, opt); \
itxfm_func(iadst, iadst, size, opt)
itxfm_func(idct, idct, 4, mmxext);
itxfm_func(idct, iadst, 4, sse2);
itxfm_func(iadst, idct, 4, sse2);
itxfm_func(iadst, iadst, 4, sse2);
itxfm_funcs(4, ssse3);
itxfm_funcs(8, sse2);
itxfm_funcs(8, ssse3);
itxfm_funcs(8, avx);
itxfm_funcs(16, sse2);
itxfm_funcs(16, ssse3);
itxfm_funcs(16, avx);
itxfm_func(idct, idct, 32, sse2);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
itxfm_funcs(16, avx2);
itxfm_func(idct, idct, 32, avx2);
#undef itxfm_func
#undef itxfm_funcs
#define lpf_funcs(size1, size2, opt) \
void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
int E, int I, int H); \
void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
int E, int I, int H)
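/* size1/size2 name the filter widths across the edge: 4_8 and 8_8 filter
 * a single 8-pixel edge, while 44/48/84/88_16 filter a 16-pixel edge
 * whose two 8-pixel halves use the given width combination. */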
lpf_funcs(4, 8, mmxext);
lpf_funcs(8, 8, mmxext);
lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx);
lpf_funcs(44, 16, sse2);
lpf_funcs(44, 16, ssse3);
lpf_funcs(44, 16, avx);
lpf_funcs(84, 16, sse2);
lpf_funcs(84, 16, ssse3);
lpf_funcs(84, 16, avx);
lpf_funcs(48, 16, sse2);
lpf_funcs(48, 16, ssse3);
lpf_funcs(48, 16, avx);
lpf_funcs(88, 16, sse2);
lpf_funcs(88, 16, ssse3);
lpf_funcs(88, 16, avx);
#undef lpf_funcs
#define ipred_func(size, type, opt) \
void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
const uint8_t *l, const uint8_t *a)
ipred_func(8, v, mmx);
#define ipred_dc_funcs(size, opt) \
ipred_func(size, dc, opt); \
ipred_func(size, dc_left, opt); \
ipred_func(size, dc_top, opt)
ipred_dc_funcs(4, mmxext);
ipred_dc_funcs(8, mmxext);
#define ipred_dir_tm_funcs(size, opt) \
ipred_func(size, tm, opt); \
ipred_func(size, dl, opt); \
ipred_func(size, dr, opt); \
ipred_func(size, hd, opt); \
ipred_func(size, hu, opt); \
ipred_func(size, vl, opt); \
ipred_func(size, vr, opt)
ipred_dir_tm_funcs(4, mmxext);
ipred_func(16, v, sse);
ipred_func(32, v, sse);
ipred_dc_funcs(16, sse2);
ipred_dc_funcs(32, sse2);
#define ipred_dir_tm_h_funcs(size, opt) \
ipred_dir_tm_funcs(size, opt); \
ipred_func(size, h, opt)
ipred_dir_tm_h_funcs(8, sse2);
ipred_dir_tm_h_funcs(16, sse2);
ipred_dir_tm_h_funcs(32, sse2);
ipred_func(4, h, sse2);
#define ipred_all_funcs(size, opt) \
ipred_dc_funcs(size, opt); \
ipred_dir_tm_h_funcs(size, opt)
// FIXME hd/vl_4x4_ssse3 does not exist
ipred_all_funcs(4, ssse3);
ipred_all_funcs(8, ssse3);
ipred_all_funcs(16, ssse3);
ipred_all_funcs(32, ssse3);
ipred_dir_tm_h_funcs(8, avx);
ipred_dir_tm_h_funcs(16, avx);
ipred_dir_tm_h_funcs(32, avx);
ipred_func(32, v, avx);
ipred_dc_funcs(32, avx2);
ipred_func(32, h, avx2);
ipred_func(32, tm, avx2);
#undef ipred_func
#undef ipred_dir_tm_h_funcs
#undef ipred_dir_tm_funcs
#undef ipred_dc_funcs
#endif /* HAVE_X86ASM */
av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
{
#if HAVE_X86ASM
int cpu_flags;
if (bpp == 10) {
ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
return;
} else if (bpp == 12) {
ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
return;
}
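/* Everything below is the 8 bpp path; 10 and 12 bpp are handled entirely
 * by the 16 bpp template inits above. */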
cpu_flags = av_get_cpu_flags();
#define init_lpf(opt) do { \
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
} while (0)
#define init_ipred(sz, opt, t, e) \
dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
#define init_dir_tm_ipred(sz, opt) do { \
init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
init_ipred(sz, opt, hd, HOR_DOWN); \
init_ipred(sz, opt, vl, VERT_LEFT); \
init_ipred(sz, opt, hu, HOR_UP); \
init_ipred(sz, opt, tm, TM_VP8); \
init_ipred(sz, opt, vr, VERT_RIGHT); \
} while (0)
#define init_dir_tm_h_ipred(sz, opt) do { \
init_dir_tm_ipred(sz, opt); \
init_ipred(sz, opt, h, HOR); \
} while (0)
#define init_dc_ipred(sz, opt) do { \
init_ipred(sz, opt, dc, DC); \
init_ipred(sz, opt, dc_left, LEFT_DC); \
init_ipred(sz, opt, dc_top, TOP_DC); \
} while (0)
#define init_all_ipred(sz, opt) do { \
init_dc_ipred(sz, opt); \
init_dir_tm_h_ipred(sz, opt); \
} while (0)
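/* The ipred/lpf init macros above are local; init_fpel_func and the
 * init_subpel* helpers used below come from vp9dsp_init.h. */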
if (EXTERNAL_MMX(cpu_flags)) {
init_fpel_func(4, 0, 4, put, , mmx);
init_fpel_func(3, 0, 8, put, , mmx);
if (!bitexact) {
dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
}
init_ipred(8, mmx, v, VERT);
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext;
dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext;
dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext;
dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
init_subpel2(4, 0, 4, put, 8, mmxext);
init_subpel2(4, 1, 4, avg, 8, mmxext);
init_fpel_func(4, 1, 4, avg, _8, mmxext);
init_fpel_func(3, 1, 8, avg, _8, mmxext);
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
init_dc_ipred(4, mmxext);
init_dc_ipred(8, mmxext);
init_dir_tm_ipred(4, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
init_fpel_func(2, 0, 16, put, , sse);
init_fpel_func(1, 0, 32, put, , sse);
init_fpel_func(0, 0, 64, put, , sse);
init_ipred(16, sse, v, VERT);
init_ipred(32, sse, v, VERT);
}
if (EXTERNAL_SSE2(cpu_flags)) {
init_subpel3_8to64(0, put, 8, sse2);
init_subpel3_8to64(1, avg, 8, sse2);
init_fpel_func(2, 1, 16, avg, _8, sse2);
init_fpel_func(1, 1, 32, avg, _8, sse2);
init_fpel_func(0, 1, 64, avg, _8, sse2);
init_lpf(sse2);
dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2;
dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2;
dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
init_dc_ipred(16, sse2);
init_dc_ipred(32, sse2);
init_dir_tm_h_ipred(8, sse2);
init_dir_tm_h_ipred(16, sse2);
init_dir_tm_h_ipred(32, sse2);
init_ipred(4, sse2, h, HOR);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
init_subpel3(0, put, 8, ssse3);
init_subpel3(1, avg, 8, ssse3);
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3;
dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3;
dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
init_lpf(ssse3);
init_all_ipred(4, ssse3);
init_all_ipred(8, ssse3);
init_all_ipred(16, ssse3);
init_all_ipred(32, ssse3);
}
if (EXTERNAL_AVX(cpu_flags)) {
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx;
dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx;
dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
init_lpf(avx);
init_dir_tm_h_ipred(8, avx);
init_dir_tm_h_ipred(16, avx);
init_dir_tm_h_ipred(32, avx);
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
init_fpel_func(1, 0, 32, put, , avx);
init_fpel_func(0, 0, 64, put, , avx);
init_ipred(32, avx, v, VERT);
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
init_fpel_func(1, 1, 32, avg, _8, avx2);
init_fpel_func(0, 1, 64, avg, _8, avx2);
if (ARCH_X86_64) {
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2;
init_subpel3_32_64(0, put, 8, avx2);
init_subpel3_32_64(1, avg, 8, avx2);
#endif
}
init_dc_ipred(32, avx2);
init_ipred(32, avx2, h, HOR);
init_ipred(32, avx2, tm, TM_VP8);
}
#undef init_fpel
#undef init_subpel1
#undef init_subpel2
#undef init_subpel3
#endif /* HAVE_X86ASM */
}

View file

@ -0,0 +1,189 @@
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VP9DSP_INIT_H
#define AVCODEC_X86_VP9DSP_INIT_H
#include "libavcodec/vp9dsp.h"
// hack to force-expand BPC
#define cat(a, bpp, b) a##bpp##b
#define decl_fpel_func(avg, sz, bpp, opt) \
void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, const type (*filter)[f_sz])
#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
#define decl_ipred_fn(type, sz, bpp, opt) \
void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *l, \
const uint8_t *a)
#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
decl_ipred_fn(type, 4, bpp, opt4); \
decl_ipred_fn(type, 8, bpp, opt8_16_32); \
decl_ipred_fn(type, 16, bpp, opt8_16_32); \
decl_ipred_fn(type, 32, bpp, opt8_16_32)
#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
ptrdiff_t stride, \
int16_t *block, \
int eob)
#define decl_itxfm_funcs(size, bpp, opt) \
decl_itxfm_func(idct, idct, size, bpp, opt); \
decl_itxfm_func(iadst, idct, size, bpp, opt); \
decl_itxfm_func(idct, iadst, size, bpp, opt); \
decl_itxfm_func(iadst, iadst, size, bpp, opt)
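/* mc_rep_func stitches a sz-wide MC function out of two hsz-wide calls;
 * hszb is the half width in bytes (hsz scaled by the pixel size). */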
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, const type (*filter)[f_sz]) \
{ \
ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \
src_stride, h, filter); \
ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
src_stride, h, filter); \
}
#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \
mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \
mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \
mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
h, ff_filters_##f_opt[f][dvar - 1]); \
}
#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, bpp, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, opt)
#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
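/* 2D 8-tap MC: run the horizontal pass into a 64-column temp buffer over
 * h + 7 rows starting 3 rows above the block (the vertical tap
 * footprint), then run the vertical pass from row 3 of the temp. */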
#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
src_stride, h + 7, \
ff_filters_##f_opt[f][mx - 1]); \
ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
64 * bytes, h, \
ff_filters_##f_opt[f][my - 1]); \
}
#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt)
#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
type##_8tap_sharp_##sz##dir##_##bpp##_##opt
#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
init_subpel1(idx1, idx2, 0, 1, sz, v, type, bpp, opt); \
init_subpel1(idx1, idx2, 1, 0, sz, h, type, bpp, opt)
#define init_subpel3_32_64(idx, type, bpp, opt) \
init_subpel2(0, idx, 64, type, bpp, opt); \
init_subpel2(1, idx, 32, type, bpp, opt)
#define init_subpel3_8to64(idx, type, bpp, opt) \
init_subpel3_32_64(idx, type, bpp, opt); \
init_subpel2(2, idx, 16, type, bpp, opt); \
init_subpel2(3, idx, 8, type, bpp, opt)
#define init_subpel3(idx, type, bpp, opt) \
init_subpel3_8to64(idx, type, bpp, opt); \
init_subpel2(4, idx, 4, type, bpp, opt)
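/* init_subpel3 fills the sub-pel part of dsp->mc[]: idx1 selects the
 * block size (0 = 64x64 down to 4 = 4x4), idx selects put (0) or avg (1),
 * and the h/v/hv slots are filled per 8-tap filter family. */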
#define init_ipred_func(type, enum, sz, bpp, opt) \
dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
init_ipred_func(type, enum, 8, bpp, opt); \
init_ipred_func(type, enum, 16, bpp, opt); \
init_ipred_func(type, enum, 32, bpp, opt)
#define init_ipred_funcs(type, enum, bpp, opt) \
init_ipred_func(type, enum, 4, bpp, opt); \
init_8_16_32_ipred_funcs(type, enum, bpp, opt)
void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact);
void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact);
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
#endif /* AVCODEC_X86_VP9DSP_INIT_H */

View file

@ -0,0 +1,25 @@
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPC 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
#include "vp9dsp_init_16bpp_template.c"

View file

@ -0,0 +1,25 @@
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPC 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
#include "vp9dsp_init_16bpp_template.c"

Some files were not shown because too many files have changed in this diff.