
Rename ffmpeg-4.2-fit to ffmpeg-4-fit

This commit is contained in:
winlin 2021-03-02 17:48:40 +08:00
parent b19074721c
commit 27712fdda7
720 changed files with 14 additions and 14 deletions

View file

@@ -0,0 +1,18 @@
OBJS += x86/cpu.o \
x86/fixed_dsp_init.o \
x86/float_dsp_init.o \
x86/imgutils_init.o \
x86/lls_init.o \
OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \
EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o
X86ASM-OBJS += x86/cpuid.o \
$(EMMS_OBJS__yes_) \
x86/fixed_dsp.o \
x86/float_dsp.o \
x86/imgutils.o \
x86/lls.o \
X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o \

View file

@@ -0,0 +1,154 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_ASM_H
#define AVUTIL_X86_ASM_H
#include <stdint.h>
#include "config.h"
typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
#if ARCH_X86_64
# define FF_OPSIZE "q"
# define FF_REG_a "rax"
# define FF_REG_b "rbx"
# define FF_REG_c "rcx"
# define FF_REG_d "rdx"
# define FF_REG_D "rdi"
# define FF_REG_S "rsi"
# define FF_PTR_SIZE "8"
typedef int64_t x86_reg;
/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */
# define FF_REG_sp "rsp"
# define FF_REG_BP "rbp"
# define FF_REGBP rbp
# define FF_REGa rax
# define FF_REGb rbx
# define FF_REGc rcx
# define FF_REGd rdx
# define FF_REGSP rsp
#elif ARCH_X86_32
# define FF_OPSIZE "l"
# define FF_REG_a "eax"
# define FF_REG_b "ebx"
# define FF_REG_c "ecx"
# define FF_REG_d "edx"
# define FF_REG_D "edi"
# define FF_REG_S "esi"
# define FF_PTR_SIZE "4"
typedef int32_t x86_reg;
# define FF_REG_sp "esp"
# define FF_REG_BP "ebp"
# define FF_REGBP ebp
# define FF_REGa eax
# define FF_REGb ebx
# define FF_REGc ecx
# define FF_REGd edx
# define FF_REGSP esp
#else
typedef int x86_reg;
#endif
#define HAVE_7REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE && HAVE_EBP_AVAILABLE))
#define HAVE_6REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE || HAVE_EBP_AVAILABLE))
#if ARCH_X86_64 && defined(PIC)
# define BROKEN_RELOCATIONS 1
#endif
/*
* If gcc is not set to support sse (-msse) it will not accept xmm registers
* in the clobber list for inline asm. XMM_CLOBBERS takes a list of xmm
* registers to be marked as clobbered and evaluates to nothing if they are
* not supported, or to the list itself if they are supported. Since a clobber
* list may not be empty, XMM_CLOBBERS_ONLY should be used if the xmm
* registers are the only ones in the clobber list.
* For example a list with "eax" and "xmm0" as clobbers should become:
* : XMM_CLOBBERS("xmm0",) "eax"
* and a list with only "xmm0" should become:
* XMM_CLOBBERS_ONLY("xmm0")
*/
#if HAVE_XMM_CLOBBERS
# define XMM_CLOBBERS(...) __VA_ARGS__
# define XMM_CLOBBERS_ONLY(...) : __VA_ARGS__
#else
# define XMM_CLOBBERS(...)
# define XMM_CLOBBERS_ONLY(...)
#endif
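/* Illustrative sketch (hypothetical statements, not from this header),
 * following the usage described in the comment above:
 *     __asm__ volatile ("xorps %%xmm0, %%xmm0" ::: XMM_CLOBBERS("%xmm0",) "memory");
 * and, when xmm registers would be the only clobbers:
 *     __asm__ volatile ("xorps %%xmm0, %%xmm0" :: XMM_CLOBBERS_ONLY("%xmm0"));
 */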
/* Use to export labels from asm. */
#define LABEL_MANGLE(a) EXTERN_PREFIX #a
// Use rip-relative addressing if compiling PIC code on x86-64.
#if ARCH_X86_64 && defined(PIC)
# define LOCAL_MANGLE(a) #a "(%%rip)"
#else
# define LOCAL_MANGLE(a) #a
#endif
#if HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
# define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
# define NAMED_CONSTRAINTS_ADD(...)
# define NAMED_CONSTRAINTS(...)
# define NAMED_CONSTRAINTS_ARRAY_ADD(...)
# define NAMED_CONSTRAINTS_ARRAY(...)
#else
/* When direct symbol references are used in code passed to a compiler that does not support them,
 * these references need to be converted to named asm constraints instead.
 * Instead of returning a direct symbol, MANGLE now returns a named constraint for that specific symbol.
 * In order for this to work there must also be a corresponding entry in the asm interface. To add this
 * entry, use the macro NAMED_CONSTRAINTS() and pass in a list of each symbol reference used in the
 * corresponding block of code (e.g. NAMED_CONSTRAINTS(var1,var2,var3) where var1 is the first symbol, etc.).
 * If there are already existing constraints, use NAMED_CONSTRAINTS_ADD to add to the existing constraint list.
*/
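/* Illustrative sketch (hypothetical symbol name, not from this header): an asm
 * block that reads a global double ff_some_const keeps its template unchanged,
 *     __asm__ ("movsd "MANGLE(ff_some_const)", %%xmm0"
 *              :: NAMED_CONSTRAINTS(ff_some_const) : "%xmm0");
 * MANGLE(ff_some_const) expands to the named operand "%[ff_some_const]", and
 * NAMED_CONSTRAINTS() supplies the matching [ff_some_const] "m" input operand.
 */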
# define MANGLE(a) "%["#a"]"
// Intel/MSVC does not correctly expand va-args so we need a rather ugly hack in order to get it to work
# define FE_0(P,X) P(X)
# define FE_1(P,X,X1) P(X), FE_0(P,X1)
# define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2)
# define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3)
# define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4)
# define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5)
# define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6)
# define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7)
# define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8)
# define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9)
# define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
# define GET_FE(A) GET_FE_IMPL A
# define GET_FE_GLUE(x, y) x y
# define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)), (P,__VA_ARGS__))
# define NAME_CONSTRAINT(x) [x] "m"(x)
// Parameters are a list of each symbol reference required
# define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
// Same but without comma for when there are no previously defined constraints
# define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
// Same as above NAMED_CONSTRAINTS except used for passing arrays/pointers instead of normal variables
# define NAME_CONSTRAINT_ARRAY(x) [x] "m"(*x)
# define NAMED_CONSTRAINTS_ARRAY_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
# define NAMED_CONSTRAINTS_ARRAY(...) FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
#endif
#endif /* AVUTIL_X86_ASM_H */

View file

@@ -0,0 +1,87 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* byte swapping routines
*/
#ifndef AVUTIL_X86_BSWAP_H
#define AVUTIL_X86_BSWAP_H
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include "config.h"
#include "libavutil/attributes.h"
#if defined(_MSC_VER)
#define av_bswap16 av_bswap16
static av_always_inline av_const uint16_t av_bswap16(uint16_t x)
{
return _rotr16(x, 8);
}
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
return _byteswap_ulong(x);
}
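/* For example, av_bswap32(0x11223344) returns 0x44332211. */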
#if ARCH_X86_64
#define av_bswap64 av_bswap64
static inline uint64_t av_const av_bswap64(uint64_t x)
{
return _byteswap_uint64(x);
}
#endif
#elif HAVE_INLINE_ASM
#if AV_GCC_VERSION_AT_MOST(4,0)
#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
{
__asm__("rorw $8, %w0" : "+r"(x));
return x;
}
#endif /* AV_GCC_VERSION_AT_MOST(4,0) */
#if AV_GCC_VERSION_AT_MOST(4,4) || defined(__INTEL_COMPILER)
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
__asm__("bswap %0" : "+r" (x));
return x;
}
#if ARCH_X86_64
#define av_bswap64 av_bswap64
static inline uint64_t av_const av_bswap64(uint64_t x)
{
__asm__("bswap %0": "=r" (x) : "0" (x));
return x;
}
#endif
#endif /* AV_GCC_VERSION_AT_MOST(4,4) */
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_BSWAP_H */

View file

@@ -0,0 +1,272 @@
/*
* CPU detection code, extracted from mmx.h
* (c)1997-99 by H. Dietz and R. Fisher
* Converted to C and improved by Fabrice Bellard.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdlib.h>
#include <string.h>
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
#if HAVE_X86ASM
#define cpuid(index, eax, ebx, ecx, edx) \
ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx)
#define xgetbv(index, eax, edx) \
ff_cpu_xgetbv(index, &eax, &edx)
#elif HAVE_INLINE_ASM
/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
#define cpuid(index, eax, ebx, ecx, edx) \
__asm__ volatile ( \
"mov %%"FF_REG_b", %%"FF_REG_S" \n\t" \
"cpuid \n\t" \
"xchg %%"FF_REG_b", %%"FF_REG_S \
: "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \
: "0" (index), "2"(0))
#define xgetbv(index, eax, edx) \
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))
#define get_eflags(x) \
__asm__ volatile ("pushfl \n" \
"pop %0 \n" \
: "=r"(x))
#define set_eflags(x) \
__asm__ volatile ("push %0 \n" \
"popfl \n" \
:: "r"(x))
#endif /* HAVE_INLINE_ASM */
#if ARCH_X86_64
#define cpuid_test() 1
#elif HAVE_X86ASM
#define cpuid_test ff_cpu_cpuid_test
#elif HAVE_INLINE_ASM
static int cpuid_test(void)
{
x86_reg a, c;
/* Check if CPUID is supported by attempting to toggle the ID bit in
* the EFLAGS register. */
get_eflags(a);
set_eflags(a ^ 0x200000);
get_eflags(c);
return a != c;
}
#endif
/* Function to test if multimedia instructions are supported... */
int ff_get_cpu_flags_x86(void)
{
int rval = 0;
#ifdef cpuid
int eax, ebx, ecx, edx;
int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
int family = 0, model = 0;
union { int i[3]; char c[12]; } vendor;
int xcr0_lo = 0, xcr0_hi = 0;
if (!cpuid_test())
return 0; /* CPUID not supported */
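/* CPUID level 0 returns the vendor string in ebx:edx:ecx order; the
 * i[0], i[2], i[1] placement below lays the 12 bytes out contiguously,
 * e.g. as "GenuineIntel" or "AuthenticAMD". */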
cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);
if (max_std_level >= 1) {
cpuid(1, eax, ebx, ecx, std_caps);
family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
if (std_caps & (1 << 15))
rval |= AV_CPU_FLAG_CMOV;
if (std_caps & (1 << 23))
rval |= AV_CPU_FLAG_MMX;
if (std_caps & (1 << 25))
rval |= AV_CPU_FLAG_MMXEXT;
#if HAVE_SSE
if (std_caps & (1 << 25))
rval |= AV_CPU_FLAG_SSE;
if (std_caps & (1 << 26))
rval |= AV_CPU_FLAG_SSE2;
if (ecx & 1)
rval |= AV_CPU_FLAG_SSE3;
if (ecx & 0x00000200 )
rval |= AV_CPU_FLAG_SSSE3;
if (ecx & 0x00080000 )
rval |= AV_CPU_FLAG_SSE4;
if (ecx & 0x00100000 )
rval |= AV_CPU_FLAG_SSE42;
if (ecx & 0x02000000 )
rval |= AV_CPU_FLAG_AESNI;
#if HAVE_AVX
/* Check OSXSAVE and AVX bits */
if ((ecx & 0x18000000) == 0x18000000) {
/* Check for OS support */
xgetbv(0, xcr0_lo, xcr0_hi);
if ((xcr0_lo & 0x6) == 0x6) {
rval |= AV_CPU_FLAG_AVX;
if (ecx & 0x00001000)
rval |= AV_CPU_FLAG_FMA3;
}
}
#endif /* HAVE_AVX */
#endif /* HAVE_SSE */
}
if (max_std_level >= 7) {
cpuid(7, eax, ebx, ecx, edx);
#if HAVE_AVX2
if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
rval |= AV_CPU_FLAG_AVX2;
#if HAVE_AVX512 /* F, CD, BW, DQ, VL */
if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
rval |= AV_CPU_FLAG_AVX512;
}
#endif /* HAVE_AVX512 */
#endif /* HAVE_AVX2 */
/* BMI1/2 don't need OS support */
if (ebx & 0x00000008) {
rval |= AV_CPU_FLAG_BMI1;
if (ebx & 0x00000100)
rval |= AV_CPU_FLAG_BMI2;
}
}
cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
if (max_ext_level >= 0x80000001) {
cpuid(0x80000001, eax, ebx, ecx, ext_caps);
if (ext_caps & (1U << 31))
rval |= AV_CPU_FLAG_3DNOW;
if (ext_caps & (1 << 30))
rval |= AV_CPU_FLAG_3DNOWEXT;
if (ext_caps & (1 << 23))
rval |= AV_CPU_FLAG_MMX;
if (ext_caps & (1 << 22))
rval |= AV_CPU_FLAG_MMXEXT;
if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
/* Allow for selectively disabling SSE2 functions on AMD processors
with SSE2 support but not SSE4a. This includes Athlon64, some
Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
than SSE2 often enough to utilize this special-case flag.
AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
so that SSE2 is used unless explicitly disabled by checking
AV_CPU_FLAG_SSE2SLOW. */
if (rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040))
rval |= AV_CPU_FLAG_SSE2SLOW;
/* Similar to the above but for AVX functions on AMD processors.
This is necessary only for functions using YMM registers on Bulldozer
and Jaguar based CPUs as they lack 256-bit execution units. SSE/AVX
functions using XMM registers are always faster on them.
AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
rval |= AV_CPU_FLAG_AVXSLOW;
}
/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
* used unless the OS has AVX support. */
if (rval & AV_CPU_FLAG_AVX) {
if (ecx & 0x00000800)
rval |= AV_CPU_FLAG_XOP;
if (ecx & 0x00010000)
rval |= AV_CPU_FLAG_FMA4;
}
}
if (!strncmp(vendor.c, "GenuineIntel", 12)) {
if (family == 6 && (model == 9 || model == 13 || model == 14)) {
/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
* 6/14 (core1 "yonah") theoretically support sse2, but it's
* usually slower than mmx, so let's just pretend they don't.
* AV_CPU_FLAG_SSE2 is disabled and AV_CPU_FLAG_SSE2SLOW is
* enabled so that SSE2 is not used unless explicitly enabled
* by checking AV_CPU_FLAG_SSE2SLOW. The same situation
* applies for AV_CPU_FLAG_SSE3 and AV_CPU_FLAG_SSE3SLOW. */
if (rval & AV_CPU_FLAG_SSE2)
rval ^= AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE2;
if (rval & AV_CPU_FLAG_SSE3)
rval ^= AV_CPU_FLAG_SSE3SLOW | AV_CPU_FLAG_SSE3;
}
/* The Atom processor has SSSE3 support, which is useful in many cases,
* but sometimes the SSSE3 version is slower than the SSE2 equivalent
* on the Atom, while generally faster on other processors supporting
* SSSE3. This flag allows for selectively disabling certain SSSE3
* functions on the Atom. */
if (family == 6 && model == 28)
rval |= AV_CPU_FLAG_ATOM;
/* Conroe has a slow shuffle unit. Check the model number to ensure not
* to include crippled low-end Penryns and Nehalems that lack SSE4. */
if ((rval & AV_CPU_FLAG_SSSE3) && !(rval & AV_CPU_FLAG_SSE4) &&
family == 6 && model < 23)
rval |= AV_CPU_FLAG_SSSE3SLOW;
}
#endif /* cpuid */
return rval;
}
size_t ff_get_cpu_max_align_x86(void)
{
int flags = av_get_cpu_flags();
if (flags & AV_CPU_FLAG_AVX512)
return 64;
if (flags & (AV_CPU_FLAG_AVX2 |
AV_CPU_FLAG_AVX |
AV_CPU_FLAG_XOP |
AV_CPU_FLAG_FMA4 |
AV_CPU_FLAG_FMA3 |
AV_CPU_FLAG_AVXSLOW))
return 32;
if (flags & (AV_CPU_FLAG_AESNI |
AV_CPU_FLAG_SSE42 |
AV_CPU_FLAG_SSE4 |
AV_CPU_FLAG_SSSE3 |
AV_CPU_FLAG_SSE3 |
AV_CPU_FLAG_SSE2 |
AV_CPU_FLAG_SSE |
AV_CPU_FLAG_ATOM |
AV_CPU_FLAG_SSSE3SLOW |
AV_CPU_FLAG_SSE3SLOW |
AV_CPU_FLAG_SSE2SLOW))
return 16;
return 8;
}
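A minimal caller-side sketch (hypothetical POSIX helper, not part of this file; av_cpu_max_align() is the public libavutil wrapper that dispatches to ff_get_cpu_max_align_x86() on x86):

#include <stdlib.h>
#include "libavutil/cpu.h"

/* Allocate a float buffer aligned for the widest SIMD unit the CPU offers:
 * 64 bytes with AVX-512, 32 with AVX/AVX2, 16 with SSE, 8 otherwise. */
static float *alloc_simd_buffer(size_t nfloats)
{
    void *buf = NULL;
    if (posix_memalign(&buf, av_cpu_max_align(), nfloats * sizeof(float)))
        return NULL;
    return buf;
}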

View file

@@ -0,0 +1,113 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_CPU_H
#define AVUTIL_X86_CPU_H
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
#define AV_CPU_FLAG_AMD3DNOW AV_CPU_FLAG_3DNOW
#define AV_CPU_FLAG_AMD3DNOWEXT AV_CPU_FLAG_3DNOWEXT
#define X86_AMD3DNOW(flags) CPUEXT(flags, AMD3DNOW)
#define X86_AMD3DNOWEXT(flags) CPUEXT(flags, AMD3DNOWEXT)
#define X86_MMX(flags) CPUEXT(flags, MMX)
#define X86_MMXEXT(flags) CPUEXT(flags, MMXEXT)
#define X86_SSE(flags) CPUEXT(flags, SSE)
#define X86_SSE2(flags) CPUEXT(flags, SSE2)
#define X86_SSE2_FAST(flags) CPUEXT_FAST(flags, SSE2)
#define X86_SSE2_SLOW(flags) CPUEXT_SLOW(flags, SSE2)
#define X86_SSE3(flags) CPUEXT(flags, SSE3)
#define X86_SSE3_FAST(flags) CPUEXT_FAST(flags, SSE3)
#define X86_SSE3_SLOW(flags) CPUEXT_SLOW(flags, SSE3)
#define X86_SSSE3(flags) CPUEXT(flags, SSSE3)
#define X86_SSSE3_FAST(flags) CPUEXT_FAST(flags, SSSE3)
#define X86_SSSE3_SLOW(flags) CPUEXT_SLOW(flags, SSSE3)
#define X86_SSE4(flags) CPUEXT(flags, SSE4)
#define X86_SSE42(flags) CPUEXT(flags, SSE42)
#define X86_AVX(flags) CPUEXT(flags, AVX)
#define X86_AVX_FAST(flags) CPUEXT_FAST(flags, AVX)
#define X86_AVX_SLOW(flags) CPUEXT_SLOW(flags, AVX)
#define X86_XOP(flags) CPUEXT(flags, XOP)
#define X86_FMA3(flags) CPUEXT(flags, FMA3)
#define X86_FMA4(flags) CPUEXT(flags, FMA4)
#define X86_AVX2(flags) CPUEXT(flags, AVX2)
#define X86_AESNI(flags) CPUEXT(flags, AESNI)
#define X86_AVX512(flags) CPUEXT(flags, AVX512)
#define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW)
#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT)
#define EXTERNAL_MMX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMX)
#define EXTERNAL_MMXEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMXEXT)
#define EXTERNAL_SSE(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE)
#define EXTERNAL_SSE2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE2)
#define EXTERNAL_SSE2_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE2)
#define EXTERNAL_SSE2_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE2)
#define EXTERNAL_SSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSE4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4)
#define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42)
#define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX)
#define EXTERNAL_AVX_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, AVX)
#define EXTERNAL_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, AVX)
#define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP)
#define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3)
#define EXTERNAL_FMA3_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, FMA3, AVX)
#define EXTERNAL_FMA3_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, FMA3, AVX)
#define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4)
#define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2)
#define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX)
#define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX)
#define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
#define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
#define INLINE_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW)
#define INLINE_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT)
#define INLINE_MMX(flags) CPUEXT_SUFFIX(flags, _INLINE, MMX)
#define INLINE_MMXEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, MMXEXT)
#define INLINE_SSE(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE)
#define INLINE_SSE2(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE2)
#define INLINE_SSE2_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE2)
#define INLINE_SSE2_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE2)
#define INLINE_SSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE3)
#define INLINE_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE3)
#define INLINE_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE3)
#define INLINE_SSSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSSE3)
#define INLINE_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSSE3)
#define INLINE_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSSE3)
#define INLINE_SSE4(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE4)
#define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42)
#define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX)
#define INLINE_AVX_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, AVX)
#define INLINE_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, AVX)
#define INLINE_XOP(flags) CPUEXT_SUFFIX(flags, _INLINE, XOP)
#define INLINE_FMA3(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA3)
#define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4)
#define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2)
#define INLINE_AESNI(flags) CPUEXT_SUFFIX(flags, _INLINE, AESNI)
void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx);
void ff_cpu_xgetbv(int op, int *eax, int *edx);
int ff_cpu_cpuid_test(void);
#endif /* AVUTIL_X86_CPU_H */

View file

@@ -0,0 +1,91 @@
;*****************************************************************************
;* Copyright (C) 2005-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_cpuid, 5,7
push rbx
push r4
push r3
push r2
push r1
mov eax, r0d
xor ecx, ecx
cpuid
pop r4
mov [r4], eax
pop r4
mov [r4], ebx
pop r4
mov [r4], ecx
pop r4
mov [r4], edx
pop rbx
RET
;-----------------------------------------------------------------------------
; void ff_cpu_xgetbv(int op, int *eax, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
push r2
push r1
mov ecx, r0d
xgetbv
pop r4
mov [r4], eax
pop r4
mov [r4], edx
RET
%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; int ff_cpu_cpuid_test(void)
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal cpu_cpuid_test
pushfd
push ebx
push ebp
push esi
push edi
pushfd
pop eax
mov ebx, eax
xor eax, 0x200000
push eax
popfd
pushfd
pop eax
xor eax, ebx
pop edi
pop esi
pop ebp
pop ebx
popfd
ret
%endif

View file

@@ -0,0 +1,55 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_EMMS_H
#define AVUTIL_X86_EMMS_H
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
void avpriv_emms_asm(void);
#if HAVE_MMX_INLINE
# define emms_c emms_c
/**
* Empty MMX state.
* This must be called between any DSP function and float/double code,
* for example: sin(); dsp->idct_put(); emms_c(); cos();
* Note: *alloc() and *free() also use float code in some libc implementations,
* so this also applies to them and to any function using them.
*/
static av_always_inline void emms_c(void)
{
/* Some inlined functions may also use mmx instructions regardless of
* runtime cpuflags. With that in mind, we unconditionally empty the
* mmx state if the target cpu chosen at configure time supports it.
*/
#if !defined(__MMX__)
if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)
#endif
__asm__ volatile ("emms" ::: "memory");
}
#elif HAVE_MMX && HAVE_MM_EMPTY
# include <mmintrin.h>
# define emms_c _mm_empty
#elif HAVE_MMX_EXTERNAL
# define emms_c avpriv_emms_asm
#endif /* HAVE_MMX_INLINE */
#endif /* AVUTIL_X86_EMMS_H */

View file

@@ -0,0 +1,48 @@
;*****************************************************************************
;* x86-optimized Fixed DSP functions
;*
;* Copyright 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void ff_butterflies_fixed(int *src0, int *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal butterflies_fixed, 3,3,3, src0, src1, len
shl lend, 2
add src0q, lenq
add src1q, lenq
neg lenq
align 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src1q + lenq]
mova m2, m0
paddd m0, m1
psubd m2, m1
mova [src0q + lenq], m0
mova [src1q + lenq], m2
add lenq, mmsize
jl .loop
RET
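; Scalar reference of the routine above (a sketch, not part of the file):
;     for (i = 0; i < len; i++) {
;         int a = src0[i], b = src1[i];
;         src0[i] = a + b;   ; paddd
;         src1[i] = a - b;   ; psubd
;     }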

View file

@@ -0,0 +1,35 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/fixed_dsp.h"
#include "cpu.h"
void ff_butterflies_fixed_sse2(int *src0, int *src1, int len);
av_cold void ff_fixed_dsp_init_x86(AVFixedDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
fdsp->butterflies_fixed = ff_butterflies_fixed_sse2;
}
}

View file

@@ -0,0 +1,484 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
SECTION .text
;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
lea lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
mova m0, [src0q + lenq + (a+0)*mmsize]
mova m1, [src0q + lenq + (a+1)*mmsize]
mulps m0, m0, [src1q + lenq + (a+0)*mmsize]
mulps m1, m1, [src1q + lenq + (a+1)*mmsize]
mova [dstq + lenq + (a+0)*mmsize], m0
mova [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep
sub lenq, 64
jge .loop
REP_RET
%endmacro
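; Scalar reference (sketch): dst[i] = src0[i] * src1[i] for i in [0, len).
; Each iteration handles 64 bytes (16 floats), so len is assumed to be a
; multiple of 16 with the buffers suitably aligned for mova.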
INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif
;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
lea lenq, [lend*8 - mmsize*4]
ALIGN 16
.loop:
movaps m0, [src0q + lenq + 0*mmsize]
movaps m1, [src0q + lenq + 1*mmsize]
movaps m2, [src0q + lenq + 2*mmsize]
movaps m3, [src0q + lenq + 3*mmsize]
mulpd m0, m0, [src1q + lenq + 0*mmsize]
mulpd m1, m1, [src1q + lenq + 1*mmsize]
mulpd m2, m2, [src1q + lenq + 2*mmsize]
mulpd m3, m3, [src1q + lenq + 3*mmsize]
movaps [dstq + lenq + 0*mmsize], m0
movaps [dstq + lenq + 1*mmsize], m1
movaps [dstq + lenq + 2*mmsize], m2
movaps [dstq + lenq + 3*mmsize], m3
sub lenq, mmsize*4
jge .loop
RET
%endmacro
INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSS m0, mulm
%else
%if WIN64
SWAP 0, 2
%endif
shufps xm0, xm0, 0
%if cpuflag(avx)
vinsertf128 m0, m0, xm0, 1
%endif
%endif
lea lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
mova m1, [dstq+lenq]
mova m2, [dstq+lenq+1*mmsize]
fmaddps m1, m0, [srcq+lenq], m1
fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
mulps m1, m0, [srcq+lenq]
mulps m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
mulps m3, m0, [srcq+lenq+2*mmsize]
mulps m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
addps m1, m1, [dstq+lenq]
addps m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
addps m3, m3, [dstq+lenq+2*mmsize]
addps m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
mova [dstq+lenq], m1
mova [dstq+lenq+1*mmsize], m2
%if mmsize < 32
mova [dstq+lenq+2*mmsize], m3
mova [dstq+lenq+3*mmsize], m4
%endif ; mmsize
sub lenq, 64
jge .loop
REP_RET
%endmacro
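; Scalar reference (sketch): dst[i] += src[i] * mul. The fma3 path fuses the
; multiply and add; the sse/avx paths use mulps followed by addps.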
INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
movss m0, mulm
%elif WIN64
SWAP 0, 2
%endif
shufps m0, m0, 0
lea lenq, [lend*4-mmsize]
.loop:
mova m1, [srcq+lenq]
mulps m1, m0
mova [dstq+lenq], m1
sub lenq, mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMUL_SCALAR
;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
; int len)
;------------------------------------------------------------------------------
%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
mov lenq, lenaddrm
VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
SWAP 0, 2
%endif
movlhps xm0, xm0
%if cpuflag(avx)
vinsertf128 m0, m0, xm0, 1
%endif
%endif
lea lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
movaps m1, [dstq+lenq]
movaps m2, [dstq+lenq+1*mmsize]
movaps m3, [dstq+lenq+2*mmsize]
movaps m4, [dstq+lenq+3*mmsize]
fmaddpd m1, m0, [srcq+lenq], m1
fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2
fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3
fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
mulpd m1, m0, [srcq+lenq]
mulpd m2, m0, [srcq+lenq+1*mmsize]
mulpd m3, m0, [srcq+lenq+2*mmsize]
mulpd m4, m0, [srcq+lenq+3*mmsize]
addpd m1, m1, [dstq+lenq]
addpd m2, m2, [dstq+lenq+1*mmsize]
addpd m3, m3, [dstq+lenq+2*mmsize]
addpd m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
movaps [dstq+lenq], m1
movaps [dstq+lenq+1*mmsize], m2
movaps [dstq+lenq+2*mmsize], m3
movaps [dstq+lenq+3*mmsize], m4
sub lenq, mmsize*4
jge .loop
REP_RET
%endmacro
INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif
;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
; int len)
;------------------------------------------------------------------------------
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
mov lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSD m0, mulm
%else
%if WIN64
SWAP 0, 2
%endif
movlhps xm0, xm0
%if cpuflag(avx)
vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
lea lenq, [lend*8-2*mmsize]
.loop:
mulpd m1, m0, [srcq+lenq ]
mulpd m2, m0, [srcq+lenq+mmsize]
movaps [dstq+lenq ], m1
movaps [dstq+lenq+mmsize], m2
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif
;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
; const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
shl lend, 2
lea len1q, [lenq - mmsize]
add src0q, lenq
add dstq, lenq
add winq, lenq
neg lenq
.loop:
mova m0, [winq + lenq]
mova m4, [src0q + lenq]
%if cpuflag(sse)
mova m1, [winq + len1q]
mova m5, [src1q + len1q]
shufps m1, m1, 0x1b
shufps m5, m5, 0x1b
mova m2, m0
mova m3, m1
mulps m2, m4
mulps m3, m5
mulps m1, m4
mulps m0, m5
addps m2, m3
subps m1, m0
shufps m2, m2, 0x1b
%else
pswapd m1, [winq + len1q]
pswapd m5, [src1q + len1q]
mova m2, m0
mova m3, m1
pfmul m2, m4
pfmul m3, m5
pfmul m1, m4
pfmul m0, m5
pfadd m2, m3
pfsub m1, m0
pswapd m2, m2
%endif
mova [dstq + lenq], m1
mova [dstq + len1q], m2
sub len1q, mmsize
add lenq, mmsize
jl .loop
%if mmsize == 8
femms
%endif
REP_RET
%endmacro
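; Scalar reference of the overlap-window loop (a sketch mirroring the C
; fallback's semantics):
;     dst += len; win += len; src0 += len;
;     for (i = -len, j = len - 1; i < 0; i++, j--) {
;         float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
;         dst[i] = s0 * wj - s1 * wi;
;         dst[j] = s0 * wi + s1 * wj;
;     }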
INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
; const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
mova m2, [src2q + lenq]
mova m3, [src2q + lenq + mmsize]
fmaddps m0, m0, [src1q + lenq], m2
fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
addps m0, m0, [src2q + lenq]
addps m1, m1, [src2q + lenq + mmsize]
%endif
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
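; Scalar reference (sketch): dst[i] = src0[i] * src1[i] + src2[i].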
INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
; int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
movaps m2, [pd_reverse]
%endif
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
vpermps m0, m2, [src1q]
vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
vmovaps xmm0, [src1q + 16]
vinsertf128 m0, m0, [src1q], 1
vshufps m0, m0, m0, q0123
vmovaps xmm1, [src1q + mmsize + 16]
vinsertf128 m1, m1, [src1q + mmsize], 1
vshufps m1, m1, m1, q0123
%else
mova m0, [src1q]
mova m1, [src1q + mmsize]
shufps m0, m0, q0123
shufps m1, m1, q0123
%endif
mulps m0, m0, [src0q + lenq + mmsize]
mulps m1, m1, [src0q + lenq]
movaps [dstq + lenq + mmsize], m0
movaps [dstq + lenq], m1
add src1q, 2*mmsize
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
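; Scalar reference (sketch): dst[i] = src0[i] * src1[len - 1 - i]; the avx2
; variant reverses eight floats at once via vpermps with pd_reverse above.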
INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
shl offsetd, 2
add v1q, offsetq
add v2q, offsetq
neg offsetq
xorps xmm0, xmm0
.loop:
movaps xmm1, [v1q+offsetq]
mulps xmm1, [v2q+offsetq]
addps xmm0, xmm1
add offsetq, 16
js .loop
movhlps xmm1, xmm0
addps xmm0, xmm1
movss xmm1, xmm0
shufps xmm0, xmm0, 1
addss xmm0, xmm1
%if ARCH_X86_64 == 0
movss r0m, xmm0
fld dword r0m
%endif
RET
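; Scalar reference (sketch):
;     float sum = 0.0f;
;     for (i = 0; i < len; i++) sum += v1[i] * v2[i];
;     return sum;
; On x86_32 the return value travels on the x87 stack, hence the fld above.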
;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
shl lend, 2
add src0q, lenq
add src1q, lenq
neg lenq
.loop:
mova m0, [src0q + lenq]
mova m1, [src1q + lenq]
subps m2, m0, m1
addps m0, m0, m1
mova [src1q + lenq], m2
mova [src0q + lenq], m0
add lenq, mmsize
jl .loop
REP_RET
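; Scalar reference (sketch):
;     for (i = 0; i < len; i++) {
;         float t  = src0[i] - src1[i];
;         src0[i] += src1[i];
;         src1[i]  = t;
;     }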

View file

@@ -0,0 +1,121 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
#include "asm.h"
void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
int len);
void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
int len);
void ff_vector_dmul_sse2(double *dst, const double *src0, const double *src1,
int len);
void ff_vector_dmul_avx(double *dst, const double *src0, const double *src1,
int len);
void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
int len);
void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
int len);
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
int len);
void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul,
int len);
void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul,
int len);
void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul,
int len);
void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
double mul, int len);
void ff_vector_fmul_window_3dnowext(float *dst, const float *src0,
const float *src1, const float *win, int len);
void ff_vector_fmul_window_sse(float *dst, const float *src0,
const float *src1, const float *win, int len);
void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
const float *src1, int len);
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
fdsp->vector_fmul_window = ff_vector_fmul_window_3dnowext;
}
if (EXTERNAL_SSE(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_sse;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
fdsp->vector_fmul_window = ff_vector_fmul_window_sse;
fdsp->vector_fmul_add = ff_vector_fmul_add_sse;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
fdsp->butterflies_float = ff_butterflies_float_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
fdsp->vector_dmul = ff_vector_dmul_sse2;
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_avx;
fdsp->vector_dmul = ff_vector_dmul_avx;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
}
}

View file

@ -0,0 +1,53 @@
;*****************************************************************************
;* Copyright 2016 Anton Khirnov
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
INIT_XMM sse4
cglobal image_copy_plane_uc_from, 6, 7, 4, dst, dst_linesize, src, src_linesize, bw, height, rowpos
add dstq, bwq
add srcq, bwq
neg bwq
.row_start:
mov rowposq, bwq
.loop:
movntdqa m0, [srcq + rowposq + 0 * mmsize]
movntdqa m1, [srcq + rowposq + 1 * mmsize]
movntdqa m2, [srcq + rowposq + 2 * mmsize]
movntdqa m3, [srcq + rowposq + 3 * mmsize]
mova [dstq + rowposq + 0 * mmsize], m0
mova [dstq + rowposq + 1 * mmsize], m1
mova [dstq + rowposq + 2 * mmsize], m2
mova [dstq + rowposq + 3 * mmsize], m3
add rowposq, 4 * mmsize
jnz .loop
add srcq, src_linesizeq
add dstq, dst_linesizeq
dec heightd
jnz .row_start
RET

View file

@@ -0,0 +1,49 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "libavutil/cpu.h"
#include "libavutil/error.h"
#include "libavutil/imgutils.h"
#include "libavutil/imgutils_internal.h"
#include "libavutil/internal.h"
#include "cpu.h"
void ff_image_copy_plane_uc_from_sse4(uint8_t *dst, ptrdiff_t dst_linesize,
const uint8_t *src, ptrdiff_t src_linesize,
ptrdiff_t bytewidth, int height);
int ff_image_copy_plane_uc_from_x86(uint8_t *dst, ptrdiff_t dst_linesize,
const uint8_t *src, ptrdiff_t src_linesize,
ptrdiff_t bytewidth, int height)
{
int cpu_flags = av_get_cpu_flags();
ptrdiff_t bw_aligned = FFALIGN(bytewidth, 64);
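/* The SSE4 loop copies 4 * 16 = 64 bytes per iteration, so the width is
 * rounded up to 64; the linesize checks below keep the resulting
 * over-read and over-write inside each plane's allocated stride. */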
if (EXTERNAL_SSE4(cpu_flags) &&
bw_aligned <= dst_linesize && bw_aligned <= src_linesize)
ff_image_copy_plane_uc_from_sse4(dst, dst_linesize, src, src_linesize,
bw_aligned, height);
else
return AVERROR(ENOSYS);
return 0;
}

View file

@@ -0,0 +1,139 @@
/*
* Copyright (c) 2015 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_INTMATH_H
#define AVUTIL_X86_INTMATH_H
#include <stdint.h>
#include <stdlib.h>
#if HAVE_FAST_CLZ
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__INTEL_COMPILER)
#include <immintrin.h>
#endif
#endif
#include "config.h"
#if HAVE_FAST_CLZ
#if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER>=1216)) || defined(_MSC_VER)
# if defined(__INTEL_COMPILER)
# define ff_log2(x) (_bit_scan_reverse((x)|1))
# else
# define ff_log2 ff_log2_x86
static av_always_inline av_const int ff_log2_x86(unsigned int v)
{
unsigned long n;
_BitScanReverse(&n, v|1);
return n;
}
# endif
# define ff_log2_16bit av_log2
#if defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1700) && \
(defined(__BMI__) || !defined(__clang__)))
# define ff_ctz(v) _tzcnt_u32(v)
# if ARCH_X86_64
# define ff_ctzll(v) _tzcnt_u64(v)
# else
# define ff_ctzll ff_ctzll_x86
static av_always_inline av_const int ff_ctzll_x86(long long v)
{
return ((uint32_t)v == 0) ? _tzcnt_u32((uint32_t)(v >> 32)) + 32 : _tzcnt_u32((uint32_t)v);
}
# endif
#endif /* _MSC_VER */
#endif /* __INTEL_COMPILER */
#endif /* HAVE_FAST_CLZ */
#if defined(__GNUC__)
/* Our generic version of av_popcount is faster than GCC's built-in on
* CPUs that don't support the popcnt instruction.
*/
#if defined(__POPCNT__)
#define av_popcount __builtin_popcount
#if ARCH_X86_64
#define av_popcount64 __builtin_popcountll
#endif
#endif /* __POPCNT__ */
#if defined(__BMI2__)
#if AV_GCC_VERSION_AT_LEAST(5,1)
#define av_mod_uintp2 __builtin_ia32_bzhi_si
#elif HAVE_INLINE_ASM
/* GCC releases before 5.1.0 have a broken bzhi builtin, so for those we
* implement it using inline assembly
*/
#define av_mod_uintp2 av_mod_uintp2_bmi2
static av_always_inline av_const unsigned av_mod_uintp2_bmi2(unsigned a, unsigned p)
{
if (av_builtin_constant_p(p))
return a & ((1 << p) - 1);
else {
unsigned x;
__asm__ ("bzhi %2, %1, %0 \n\t" : "=r"(x) : "rm"(a), "r"(p));
return x;
}
}
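/* For example, av_mod_uintp2(0x1234, 8) == 0x34: only the p lowest bits are kept. */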
#endif /* AV_GCC_VERSION_AT_LEAST */
#endif /* __BMI2__ */
#if defined(__SSE2__) && !defined(__INTEL_COMPILER)
#define av_clipd av_clipd_sse2
static av_always_inline av_const double av_clipd_sse2(double a, double amin, double amax)
{
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
if (amin > amax) abort();
#endif
__asm__ ("minsd %2, %0 \n\t"
"maxsd %1, %0 \n\t"
: "+&x"(a) : "xm"(amin), "xm"(amax));
return a;
}
#endif /* __SSE2__ */
#if defined(__SSE__) && !defined(__INTEL_COMPILER)
#define av_clipf av_clipf_sse
static av_always_inline av_const float av_clipf_sse(float a, float amin, float amax)
{
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
if (amin > amax) abort();
#endif
__asm__ ("minss %2, %0 \n\t"
"maxss %1, %0 \n\t"
: "+&x"(a) : "xm"(amin), "xm"(amax));
return a;
}
#endif /* __SSE__ */
#endif /* __GNUC__ */
#endif /* AVUTIL_X86_INTMATH_H */

View file

@@ -0,0 +1,97 @@
/*
* Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_INTREADWRITE_H
#define AVUTIL_X86_INTREADWRITE_H
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#if HAVE_MMX
#if !HAVE_FAST_64BIT && defined(__MMX__)
#define AV_COPY64 AV_COPY64
static av_always_inline void AV_COPY64(void *d, const void *s)
{
__asm__("movq %1, %%mm0 \n\t"
"movq %%mm0, %0 \n\t"
: "=m"(*(uint64_t*)d)
: "m" (*(const uint64_t*)s)
: "mm0");
}
#define AV_SWAP64 AV_SWAP64
static av_always_inline void AV_SWAP64(void *a, void *b)
{
__asm__("movq %1, %%mm0 \n\t"
"movq %0, %%mm1 \n\t"
"movq %%mm0, %0 \n\t"
"movq %%mm1, %1 \n\t"
: "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
::"mm0", "mm1");
}
#define AV_ZERO64 AV_ZERO64
static av_always_inline void AV_ZERO64(void *d)
{
__asm__("pxor %%mm0, %%mm0 \n\t"
"movq %%mm0, %0 \n\t"
: "=m"(*(uint64_t*)d)
:: "mm0");
}
#endif /* !HAVE_FAST_64BIT && defined(__MMX__) */
#ifdef __SSE__
#define AV_COPY128 AV_COPY128
static av_always_inline void AV_COPY128(void *d, const void *s)
{
struct v {uint64_t v[2];};
__asm__("movaps %1, %%xmm0 \n\t"
"movaps %%xmm0, %0 \n\t"
: "=m"(*(struct v*)d)
: "m" (*(const struct v*)s)
: "xmm0");
}
#endif /* __SSE__ */
#ifdef __SSE2__
#define AV_ZERO128 AV_ZERO128
static av_always_inline void AV_ZERO128(void *d)
{
struct v {uint64_t v[2];};
__asm__("pxor %%xmm0, %%xmm0 \n\t"
"movdqa %%xmm0, %0 \n\t"
: "=m"(*(struct v*)d)
:: "xmm0");
}
#endif /* __SSE2__ */
#endif /* HAVE_MMX */
#endif /* AVUTIL_X86_INTREADWRITE_H */

View file

@@ -0,0 +1,290 @@
;******************************************************************************
;* linear least squares model
;*
;* Copyright (c) 2013 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION .text
%define MAX_VARS 32
%define MAX_VARS_ALIGN (MAX_VARS+4)
%define COVAR_STRIDE MAX_VARS_ALIGN*8
%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]
struc LLSModel
.covariance: resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
.coeff: resq MAX_VARS*MAX_VARS
.variance: resq MAX_VARS
.indep_count: resd 1
endstruc
%macro ADDPD_MEM 2
%if cpuflag(avx)
vaddpd %2, %2, %1
%else
addpd %2, %1
%endif
mova %1, %2
%endmacro
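; Scalar reference of update_lls (a sketch mirroring the C fallback in
; libavutil/lls.c): accumulate the upper triangle of the outer product
;     for (i = 0; i <= m->indep_count; i++)
;         for (j = i; j <= m->indep_count; j++)
;             m->covariance[i][j] += var[i] * var[j];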
INIT_XMM sse2
%define movdqa movaps
cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
%define covarq ctxq
mov id, [ctxq + LLSModel.indep_count]
lea varq, [varq + iq*8]
neg iq
mov covar2q, covarq
.loopi:
; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
mova m1, [varq + iq*8]
mova m3, [varq + iq*8 + 16]
pshufd m4, m1, q1010
pshufd m5, m1, q3232
pshufd m6, m3, q1010
pshufd m7, m3, q3232
mulpd m0, m1, m4
mulpd m1, m1, m5
lea covarq, [covar2q + 16]
ADDPD_MEM COVAR(-2,0), m0
ADDPD_MEM COVAR(-2,1), m1
lea jq, [iq + 2]
cmp jd, -2
jg .skip4x4
.loop4x4:
; Compute all 16 pairwise products of a 4x4 block
mulpd m0, m4, m3
mulpd m1, m5, m3
mulpd m2, m6, m3
mulpd m3, m3, m7
ADDPD_MEM COVAR(0,0), m0
ADDPD_MEM COVAR(0,1), m1
ADDPD_MEM COVAR(0,2), m2
ADDPD_MEM COVAR(0,3), m3
mova m3, [varq + jq*8 + 16]
mulpd m0, m4, m3
mulpd m1, m5, m3
mulpd m2, m6, m3
mulpd m3, m3, m7
ADDPD_MEM COVAR(2,0), m0
ADDPD_MEM COVAR(2,1), m1
ADDPD_MEM COVAR(2,2), m2
ADDPD_MEM COVAR(2,3), m3
mova m3, [varq + jq*8 + 32]
add covarq, 32
add jq, 4
cmp jd, -2
jle .loop4x4
.skip4x4:
test jd, jd
jg .skip2x4
mulpd m4, m3
mulpd m5, m3
mulpd m6, m3
mulpd m7, m3
ADDPD_MEM COVAR(0,0), m4
ADDPD_MEM COVAR(0,1), m5
ADDPD_MEM COVAR(0,2), m6
ADDPD_MEM COVAR(0,3), m7
.skip2x4:
add iq, 4
add covar2q, 4*COVAR_STRIDE+32
cmp id, -2
jle .loopi
test id, id
jg .ret
mov jq, iq
%define covarq covar2q
.loop2x1:
movsd m0, [varq + iq*8]
movlhps m0, m0
mulpd m0, [varq + jq*8]
ADDPD_MEM COVAR(0,0), m0
inc iq
add covarq, COVAR_STRIDE
test id, id
jle .loop2x1
.ret:
REP_RET
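; UPDATE_LLS below emits the 256-bit (ymm) variants; with fma3 the multiply
; and the accumulate into COVAR() fuse into single fmaddpd instructions,
; while the plain avx path keeps the vmulpd + ADDPD_MEM pair.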
%macro UPDATE_LLS 0
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
%define covarq ctxq
mov countd, [ctxq + LLSModel.indep_count]
lea count2d, [countq-2]
xor id, id
.loopi:
; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
mova ymm1, [varq + iq*8]
vbroadcastsd ymm4, [varq + iq*8]
vbroadcastsd ymm5, [varq + iq*8 + 8]
vbroadcastsd ymm6, [varq + iq*8 + 16]
vbroadcastsd ymm7, [varq + iq*8 + 24]
vextractf128 xmm3, ymm1, 1
%if cpuflag(fma3)
mova ymm0, COVAR(iq ,0)
mova xmm2, COVAR(iq+2,2)
fmaddpd ymm0, ymm1, ymm4, ymm0
fmaddpd xmm2, xmm3, xmm6, xmm2
fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
mova COVAR(iq ,0), ymm0
mova COVAR(iq ,1), ymm1
mova COVAR(iq+2,2), xmm2
mova COVAR(iq+2,3), xmm3
%else
vmulpd ymm0, ymm1, ymm4
vmulpd ymm1, ymm1, ymm5
vmulpd xmm2, xmm3, xmm6
vmulpd xmm3, xmm3, xmm7
ADDPD_MEM COVAR(iq ,0), ymm0
ADDPD_MEM COVAR(iq ,1), ymm1
ADDPD_MEM COVAR(iq+2,2), xmm2
ADDPD_MEM COVAR(iq+2,3), xmm3
%endif ; cpuflag(fma3)
lea jd, [iq + 4]
cmp jd, count2d
jg .skip4x4
.loop4x4:
; Compute all 16 pairwise products of a 4x4 block
mova ymm3, [varq + jq*8]
%if cpuflag(fma3)
mova ymm0, COVAR(jq, 0)
mova ymm1, COVAR(jq, 1)
mova ymm2, COVAR(jq, 2)
fmaddpd ymm0, ymm3, ymm4, ymm0
fmaddpd ymm1, ymm3, ymm5, ymm1
fmaddpd ymm2, ymm3, ymm6, ymm2
fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
mova COVAR(jq, 0), ymm0
mova COVAR(jq, 1), ymm1
mova COVAR(jq, 2), ymm2
mova COVAR(jq, 3), ymm3
%else
vmulpd ymm0, ymm3, ymm4
vmulpd ymm1, ymm3, ymm5
vmulpd ymm2, ymm3, ymm6
vmulpd ymm3, ymm3, ymm7
ADDPD_MEM COVAR(jq,0), ymm0
ADDPD_MEM COVAR(jq,1), ymm1
ADDPD_MEM COVAR(jq,2), ymm2
ADDPD_MEM COVAR(jq,3), ymm3
%endif ; cpuflag(fma3)
add jd, 4
cmp jd, count2d
jle .loop4x4
.skip4x4:
cmp jd, countd
jg .skip2x4
mova xmm3, [varq + jq*8]
%if cpuflag(fma3)
mova xmm0, COVAR(jq, 0)
mova xmm1, COVAR(jq, 1)
mova xmm2, COVAR(jq, 2)
fmaddpd xmm0, xmm3, xmm4, xmm0
fmaddpd xmm1, xmm3, xmm5, xmm1
fmaddpd xmm2, xmm3, xmm6, xmm2
fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
mova COVAR(jq, 0), xmm0
mova COVAR(jq, 1), xmm1
mova COVAR(jq, 2), xmm2
mova COVAR(jq, 3), xmm3
%else
vmulpd xmm0, xmm3, xmm4
vmulpd xmm1, xmm3, xmm5
vmulpd xmm2, xmm3, xmm6
vmulpd xmm3, xmm3, xmm7
ADDPD_MEM COVAR(jq,0), xmm0
ADDPD_MEM COVAR(jq,1), xmm1
ADDPD_MEM COVAR(jq,2), xmm2
ADDPD_MEM COVAR(jq,3), xmm3
%endif ; cpuflag(fma3)
.skip2x4:
add id, 4
add covarq, 4*COVAR_STRIDE
cmp id, count2d
jle .loopi
cmp id, countd
jg .ret
mov jd, id
.loop2x1:
vmovddup xmm0, [varq + iq*8]
%if cpuflag(fma3)
mova xmm1, [varq + jq*8]
fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
mova COVAR(jq,0), xmm0
%else
vmulpd xmm0, [varq + jq*8]
ADDPD_MEM COVAR(jq,0), xmm0
%endif ; cpuflag(fma3)
inc id
add covarq, COVAR_STRIDE
cmp id, countd
jle .loop2x1
.ret:
REP_RET
%endmacro ; UPDATE_LLS
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
UPDATE_LLS
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
UPDATE_LLS
%endif
INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
; This function is often called on the same buffer as update_lls, but with
; an offset. They can't both be aligned.
; Load halves rather than movu to avoid store-forwarding stalls, since the
; input was initialized immediately prior to this function using scalar math.
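; Scalar reference (a sketch following the C fallback in libavutil/lls.c):
;     out = 0;
;     for (i = 0; i <= order; i++)
;         out += var[i] * m->coeff[order][i];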
%define coefsq ctxq
mov id, orderd
imul orderd, MAX_VARS
lea coefsq, [ctxq + LLSModel.coeff + orderq*8]
movsd m0, [varq]
movhpd m0, [varq + 8]
mulpd m0, [coefsq]
lea coefsq, [coefsq + iq*8]
lea varq, [varq + iq*8]
neg iq
add iq, 2
.loop:
movsd m1, [varq + iq*8]
movhpd m1, [varq + iq*8 + 8]
mulpd m1, [coefsq + iq*8]
addpd m0, m1
add iq, 2
jl .loop
jg .skip1
movsd m1, [varq + iq*8]
mulsd m1, [coefsq + iq*8]
addpd m0, m1
.skip1:
movhlps m1, m0
addsd m0, m1
%if ARCH_X86_32
movsd r0m, m0
fld qword r0m
%endif
RET

View file

@ -0,0 +1,45 @@
/*
* linear least squares model
*
* Copyright (c) 2013 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/lls.h"
#include "libavutil/x86/cpu.h"
void ff_update_lls_sse2(LLSModel *m, const double *var);
void ff_update_lls_avx(LLSModel *m, const double *var);
void ff_update_lls_fma3(LLSModel *m, const double *var);
double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
av_cold void ff_init_lls_x86(LLSModel *m)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
m->update_lls = ff_update_lls_sse2;
if (m->indep_count >= 4)
m->evaluate_lls = ff_evaluate_lls_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
m->update_lls = ff_update_lls_avx;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
m->update_lls = ff_update_lls_fma3;
}
}
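/* Caller-side sketch (hedged: assumes the avpriv_* entry points declared in
 * libavutil/lls.h; "samples" and the constants are illustrative):
 *
 *     LLSModel m;
 *     avpriv_init_lls(&m, 3);            // also invokes ff_init_lls_x86()
 *     for (n = 0; n < nb_samples; n++)
 *         m.update_lls(&m, samples[n]);  // dispatched to SSE2/AVX/FMA3 above
 *     avpriv_solve_lls(&m, 0.001, 1);
 *     pred = m.evaluate_lls(&m, samples[0] + 1, 3);
 */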

View file

@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_PIXELUTILS_H
#define AVUTIL_X86_PIXELUTILS_H
#include "libavutil/pixelutils.h"
void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned);
#endif /* AVUTIL_X86_PIXELUTILS_H */

View file

@ -0,0 +1,94 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "pixelutils.h"
#include "cpu.h"
int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
sad[2] = ff_pixelutils_sad_8x8_mmx;
}
// The best way to use SSE2 would be to do 2 SADs in parallel,
// but we'd have to modify the pixelutils API to return SIMD functions.
// It's probably not faster to shuffle data around
// to get two lines of 8 pixels into a single 16-byte register,
// so just use the MMX 8x8 version even when SSE2 is available.
if (EXTERNAL_MMXEXT(cpu_flags)) {
sad[2] = ff_pixelutils_sad_8x8_mmxext;
sad[3] = ff_pixelutils_sad_16x16_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
switch (aligned) {
case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned
case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned
case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
switch (aligned) {
case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned
case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned
case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned
}
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
switch (aligned) {
case 0: sad[4] = ff_pixelutils_sad_32x32_avx2; break; // src1 unaligned, src2 unaligned
case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1 aligned, src2 unaligned
case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1 aligned, src2 aligned
}
}
}
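/* Caller-side sketch (assumes the public av_pixelutils_get_sad_fn() from
 * libavutil/pixelutils.h; the sad[] table is indexed by log2 of the block
 * size, so sad[2]/sad[3]/sad[4] serve 8x8/16x16/32x32):
 *
 *     av_pixelutils_sad_fn sad = av_pixelutils_get_sad_fn(4, 4, 1, NULL);
 *     int cost = sad(cur, cur_stride, ref, ref_stride); // 16x16, src1 aligned
 */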

View file

@ -0,0 +1,50 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_TIMER_H
#define AVUTIL_X86_TIMER_H
#include <stdint.h>
#if HAVE_INLINE_ASM
#define FF_TIMER_UNITS "decicycles"
#define AV_READ_TIME read_time
static inline uint64_t read_time(void)
{
uint32_t a, d;
__asm__ volatile(
#if ARCH_X86_64 || defined(__SSE2__)
"lfence \n\t"
#endif
"rdtsc \n\t"
: "=a" (a), "=d" (d));
return ((uint64_t)d << 32) + a;
}
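/* The lfence keeps rdtsc from being reordered ahead of earlier instructions,
 * so back-to-back calls bracket the measured code tightly; "d":"a" (edx:eax)
 * receive the high and low 32 bits of the timestamp counter. */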
#elif HAVE_RDTSC
#include <intrin.h>
#define AV_READ_TIME __rdtsc
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_TIMER_H */

View file

@ -0,0 +1,78 @@
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_W64XMMTEST_H
#define AVUTIL_X86_W64XMMTEST_H
#include <inttypes.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include "libavutil/bswap.h"
#define storexmmregs(mem) \
__asm__ volatile( \
"movups %%xmm6 , 0x00(%0)\n\t" \
"movups %%xmm7 , 0x10(%0)\n\t" \
"movups %%xmm8 , 0x20(%0)\n\t" \
"movups %%xmm9 , 0x30(%0)\n\t" \
"movups %%xmm10, 0x40(%0)\n\t" \
"movups %%xmm11, 0x50(%0)\n\t" \
"movups %%xmm12, 0x60(%0)\n\t" \
"movups %%xmm13, 0x70(%0)\n\t" \
"movups %%xmm14, 0x80(%0)\n\t" \
"movups %%xmm15, 0x90(%0)\n\t" \
:: "r"(mem) : "memory")
#define testxmmclobbers(func, ctx, ...) \
uint64_t xmm[2][10][2]; \
int ret; \
storexmmregs(xmm[0]); \
ret = __real_ ## func(ctx, __VA_ARGS__); \
storexmmregs(xmm[1]); \
if (memcmp(xmm[0], xmm[1], sizeof(xmm[0]))) { \
int i; \
av_log(ctx, AV_LOG_ERROR, \
"XMM REGS CLOBBERED IN %s!\n", #func); \
for (i = 0; i < 10; i ++) \
if (xmm[0][i][0] != xmm[1][i][0] || \
xmm[0][i][1] != xmm[1][i][1]) { \
av_log(ctx, AV_LOG_ERROR, \
"xmm%-2d = %016"PRIx64"%016"PRIx64"\n", \
6 + i, av_bswap64(xmm[0][i][0]), \
av_bswap64(xmm[0][i][1])); \
av_log(ctx, AV_LOG_ERROR, \
" -> %016"PRIx64"%016"PRIx64"\n", \
av_bswap64(xmm[1][i][0]), \
av_bswap64(xmm[1][i][1])); \
} \
abort(); \
} \
return ret
#define wrap(func) \
int __real_ ## func; \
int __wrap_ ## func; \
int __wrap_ ## func
#endif /* AVUTIL_X86_W64XMMTEST_H */

File diff suppressed because it is too large

File diff suppressed because it is too large