1
0
Fork 0
mirror of https://github.com/ossrs/srs.git synced 2025-03-09 15:49:59 +00:00

For #1659, #307, add ffmpeg-4.2-fit for rtc

This commit is contained in:
winlin 2020-03-22 16:34:54 +08:00
parent 634a14bfa6
commit 4308f238c0
690 changed files with 182077 additions and 1 deletions

View file

@ -0,0 +1,154 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_ASM_H
#define AVUTIL_X86_ASM_H
#include <stdint.h>
#include "config.h"
typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
#if ARCH_X86_64
# define FF_OPSIZE "q"
# define FF_REG_a "rax"
# define FF_REG_b "rbx"
# define FF_REG_c "rcx"
# define FF_REG_d "rdx"
# define FF_REG_D "rdi"
# define FF_REG_S "rsi"
# define FF_PTR_SIZE "8"
typedef int64_t x86_reg;
/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */
# define FF_REG_sp "rsp"
# define FF_REG_BP "rbp"
# define FF_REGBP rbp
# define FF_REGa rax
# define FF_REGb rbx
# define FF_REGc rcx
# define FF_REGd rdx
# define FF_REGSP rsp
#elif ARCH_X86_32
# define FF_OPSIZE "l"
# define FF_REG_a "eax"
# define FF_REG_b "ebx"
# define FF_REG_c "ecx"
# define FF_REG_d "edx"
# define FF_REG_D "edi"
# define FF_REG_S "esi"
# define FF_PTR_SIZE "4"
typedef int32_t x86_reg;
# define FF_REG_sp "esp"
# define FF_REG_BP "ebp"
# define FF_REGBP ebp
# define FF_REGa eax
# define FF_REGb ebx
# define FF_REGc ecx
# define FF_REGd edx
# define FF_REGSP esp
#else
typedef int x86_reg;
#endif
#define HAVE_7REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE && HAVE_EBP_AVAILABLE))
#define HAVE_6REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE || HAVE_EBP_AVAILABLE))
#if ARCH_X86_64 && defined(PIC)
# define BROKEN_RELOCATIONS 1
#endif
/*
* If gcc is not set to support sse (-msse) it will not accept xmm registers
* in the clobber list for inline asm. XMM_CLOBBERS takes a list of xmm
* registers to be marked as clobbered and evaluates to nothing if they are
* not supported, or to the list itself if they are supported. Since a clobber
* list may not be empty, XMM_CLOBBERS_ONLY should be used if the xmm
* registers are the only in the clobber list.
* For example a list with "eax" and "xmm0" as clobbers should become:
* : XMM_CLOBBERS("xmm0",) "eax"
* and a list with only "xmm0" should become:
* XMM_CLOBBERS_ONLY("xmm0")
*/
#if HAVE_XMM_CLOBBERS
# define XMM_CLOBBERS(...) __VA_ARGS__
# define XMM_CLOBBERS_ONLY(...) : __VA_ARGS__
#else
# define XMM_CLOBBERS(...)
# define XMM_CLOBBERS_ONLY(...)
#endif
/* Use to export labels from asm. */
#define LABEL_MANGLE(a) EXTERN_PREFIX #a
// Use rip-relative addressing if compiling PIC code on x86-64.
#if ARCH_X86_64 && defined(PIC)
# define LOCAL_MANGLE(a) #a "(%%rip)"
#else
# define LOCAL_MANGLE(a) #a
#endif
#if HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
# define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
# define NAMED_CONSTRAINTS_ADD(...)
# define NAMED_CONSTRAINTS(...)
# define NAMED_CONSTRAINTS_ARRAY_ADD(...)
# define NAMED_CONSTRAINTS_ARRAY(...)
#else
/* When direct symbol references are used in code passed to a compiler that does not support them
* then these references need to be converted to named asm constraints instead.
* Instead of returning a direct symbol MANGLE now returns a named constraint for that specific symbol.
* In order for this to work there must also be a corresponding entry in the asm-interface. To add this
* entry use the macro NAMED_CONSTRAINTS() and pass in a list of each symbol reference used in the
* corresponding block of code. (e.g. NAMED_CONSTRAINTS(var1,var2,var3) where var1 is the first symbol etc. ).
* If there are already existing constraints then use NAMED_CONSTRAINTS_ADD to add to the existing constraint list.
*/
# define MANGLE(a) "%["#a"]"
// Intel/MSVC does not correctly expand va-args so we need a rather ugly hack in order to get it to work
# define FE_0(P,X) P(X)
# define FE_1(P,X,X1) P(X), FE_0(P,X1)
# define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2)
# define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3)
# define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4)
# define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5)
# define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6)
# define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7)
# define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8)
# define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9)
# define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
# define GET_FE(A) GET_FE_IMPL A
# define GET_FE_GLUE(x, y) x y
# define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)), (P,__VA_ARGS__))
# define NAME_CONSTRAINT(x) [x] "m"(x)
// Parameters are a list of each symbol reference required
# define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
// Same but without comma for when there are no previously defined constraints
# define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
// Same as above NAMED_CONSTRAINTS except used for passing arrays/pointers instead of normal variables
# define NAME_CONSTRAINT_ARRAY(x) [x] "m"(*x)
# define NAMED_CONSTRAINTS_ARRAY_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
# define NAMED_CONSTRAINTS_ARRAY(...) FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
#endif
#endif /* AVUTIL_X86_ASM_H */

View file

@ -0,0 +1,87 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* byte swapping routines
*/
#ifndef AVUTIL_X86_BSWAP_H
#define AVUTIL_X86_BSWAP_H
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include "config.h"
#include "libavutil/attributes.h"
#if defined(_MSC_VER)
#define av_bswap16 av_bswap16
static av_always_inline av_const uint16_t av_bswap16(uint16_t x)
{
return _rotr16(x, 8);
}
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
return _byteswap_ulong(x);
}
#if ARCH_X86_64
#define av_bswap64 av_bswap64
static inline uint64_t av_const av_bswap64(uint64_t x)
{
return _byteswap_uint64(x);
}
#endif
#elif HAVE_INLINE_ASM
#if AV_GCC_VERSION_AT_MOST(4,0)
#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
{
__asm__("rorw $8, %w0" : "+r"(x));
return x;
}
#endif /* AV_GCC_VERSION_AT_MOST(4,0) */
#if AV_GCC_VERSION_AT_MOST(4,4) || defined(__INTEL_COMPILER)
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
__asm__("bswap %0" : "+r" (x));
return x;
}
#if ARCH_X86_64
#define av_bswap64 av_bswap64
static inline uint64_t av_const av_bswap64(uint64_t x)
{
__asm__("bswap %0": "=r" (x) : "0" (x));
return x;
}
#endif
#endif /* AV_GCC_VERSION_AT_MOST(4,4) */
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_BSWAP_H */

View file

@ -0,0 +1,272 @@
/*
* CPU detection code, extracted from mmx.h
* (c)1997-99 by H. Dietz and R. Fisher
* Converted to C and improved by Fabrice Bellard.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdlib.h>
#include <string.h>
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
#if HAVE_X86ASM
#define cpuid(index, eax, ebx, ecx, edx) \
ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx)
#define xgetbv(index, eax, edx) \
ff_cpu_xgetbv(index, &eax, &edx)
#elif HAVE_INLINE_ASM
/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
#define cpuid(index, eax, ebx, ecx, edx) \
__asm__ volatile ( \
"mov %%"FF_REG_b", %%"FF_REG_S" \n\t" \
"cpuid \n\t" \
"xchg %%"FF_REG_b", %%"FF_REG_S \
: "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \
: "0" (index), "2"(0))
#define xgetbv(index, eax, edx) \
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))
#define get_eflags(x) \
__asm__ volatile ("pushfl \n" \
"pop %0 \n" \
: "=r"(x))
#define set_eflags(x) \
__asm__ volatile ("push %0 \n" \
"popfl \n" \
:: "r"(x))
#endif /* HAVE_INLINE_ASM */
#if ARCH_X86_64
#define cpuid_test() 1
#elif HAVE_X86ASM
#define cpuid_test ff_cpu_cpuid_test
#elif HAVE_INLINE_ASM
static int cpuid_test(void)
{
x86_reg a, c;
/* Check if CPUID is supported by attempting to toggle the ID bit in
* the EFLAGS register. */
get_eflags(a);
set_eflags(a ^ 0x200000);
get_eflags(c);
return a != c;
}
#endif
/* Function to test if multimedia instructions are supported... */
int ff_get_cpu_flags_x86(void)
{
int rval = 0;
#ifdef cpuid
int eax, ebx, ecx, edx;
int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
int family = 0, model = 0;
union { int i[3]; char c[12]; } vendor;
int xcr0_lo = 0, xcr0_hi = 0;
if (!cpuid_test())
return 0; /* CPUID not supported */
cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);
if (max_std_level >= 1) {
cpuid(1, eax, ebx, ecx, std_caps);
family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
if (std_caps & (1 << 15))
rval |= AV_CPU_FLAG_CMOV;
if (std_caps & (1 << 23))
rval |= AV_CPU_FLAG_MMX;
if (std_caps & (1 << 25))
rval |= AV_CPU_FLAG_MMXEXT;
#if HAVE_SSE
if (std_caps & (1 << 25))
rval |= AV_CPU_FLAG_SSE;
if (std_caps & (1 << 26))
rval |= AV_CPU_FLAG_SSE2;
if (ecx & 1)
rval |= AV_CPU_FLAG_SSE3;
if (ecx & 0x00000200 )
rval |= AV_CPU_FLAG_SSSE3;
if (ecx & 0x00080000 )
rval |= AV_CPU_FLAG_SSE4;
if (ecx & 0x00100000 )
rval |= AV_CPU_FLAG_SSE42;
if (ecx & 0x02000000 )
rval |= AV_CPU_FLAG_AESNI;
#if HAVE_AVX
/* Check OXSAVE and AVX bits */
if ((ecx & 0x18000000) == 0x18000000) {
/* Check for OS support */
xgetbv(0, xcr0_lo, xcr0_hi);
if ((xcr0_lo & 0x6) == 0x6) {
rval |= AV_CPU_FLAG_AVX;
if (ecx & 0x00001000)
rval |= AV_CPU_FLAG_FMA3;
}
}
#endif /* HAVE_AVX */
#endif /* HAVE_SSE */
}
if (max_std_level >= 7) {
cpuid(7, eax, ebx, ecx, edx);
#if HAVE_AVX2
if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
rval |= AV_CPU_FLAG_AVX2;
#if HAVE_AVX512 /* F, CD, BW, DQ, VL */
if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
rval |= AV_CPU_FLAG_AVX512;
}
#endif /* HAVE_AVX512 */
#endif /* HAVE_AVX2 */
/* BMI1/2 don't need OS support */
if (ebx & 0x00000008) {
rval |= AV_CPU_FLAG_BMI1;
if (ebx & 0x00000100)
rval |= AV_CPU_FLAG_BMI2;
}
}
cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
if (max_ext_level >= 0x80000001) {
cpuid(0x80000001, eax, ebx, ecx, ext_caps);
if (ext_caps & (1U << 31))
rval |= AV_CPU_FLAG_3DNOW;
if (ext_caps & (1 << 30))
rval |= AV_CPU_FLAG_3DNOWEXT;
if (ext_caps & (1 << 23))
rval |= AV_CPU_FLAG_MMX;
if (ext_caps & (1 << 22))
rval |= AV_CPU_FLAG_MMXEXT;
if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
/* Allow for selectively disabling SSE2 functions on AMD processors
with SSE2 support but not SSE4a. This includes Athlon64, some
Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
than SSE2 often enough to utilize this special-case flag.
AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
so that SSE2 is used unless explicitly disabled by checking
AV_CPU_FLAG_SSE2SLOW. */
if (rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040))
rval |= AV_CPU_FLAG_SSE2SLOW;
/* Similar to the above but for AVX functions on AMD processors.
This is necessary only for functions using YMM registers on Bulldozer
and Jaguar based CPUs as they lack 256-bit execution units. SSE/AVX
functions using XMM registers are always faster on them.
AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
rval |= AV_CPU_FLAG_AVXSLOW;
}
/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
* used unless the OS has AVX support. */
if (rval & AV_CPU_FLAG_AVX) {
if (ecx & 0x00000800)
rval |= AV_CPU_FLAG_XOP;
if (ecx & 0x00010000)
rval |= AV_CPU_FLAG_FMA4;
}
}
if (!strncmp(vendor.c, "GenuineIntel", 12)) {
if (family == 6 && (model == 9 || model == 13 || model == 14)) {
/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
* 6/14 (core1 "yonah") theoretically support sse2, but it's
* usually slower than mmx, so let's just pretend they don't.
* AV_CPU_FLAG_SSE2 is disabled and AV_CPU_FLAG_SSE2SLOW is
* enabled so that SSE2 is not used unless explicitly enabled
* by checking AV_CPU_FLAG_SSE2SLOW. The same situation
* applies for AV_CPU_FLAG_SSE3 and AV_CPU_FLAG_SSE3SLOW. */
if (rval & AV_CPU_FLAG_SSE2)
rval ^= AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE2;
if (rval & AV_CPU_FLAG_SSE3)
rval ^= AV_CPU_FLAG_SSE3SLOW | AV_CPU_FLAG_SSE3;
}
/* The Atom processor has SSSE3 support, which is useful in many cases,
* but sometimes the SSSE3 version is slower than the SSE2 equivalent
* on the Atom, but is generally faster on other processors supporting
* SSSE3. This flag allows for selectively disabling certain SSSE3
* functions on the Atom. */
if (family == 6 && model == 28)
rval |= AV_CPU_FLAG_ATOM;
/* Conroe has a slow shuffle unit. Check the model number to ensure not
* to include crippled low-end Penryns and Nehalems that lack SSE4. */
if ((rval & AV_CPU_FLAG_SSSE3) && !(rval & AV_CPU_FLAG_SSE4) &&
family == 6 && model < 23)
rval |= AV_CPU_FLAG_SSSE3SLOW;
}
#endif /* cpuid */
return rval;
}
size_t ff_get_cpu_max_align_x86(void)
{
int flags = av_get_cpu_flags();
if (flags & AV_CPU_FLAG_AVX512)
return 64;
if (flags & (AV_CPU_FLAG_AVX2 |
AV_CPU_FLAG_AVX |
AV_CPU_FLAG_XOP |
AV_CPU_FLAG_FMA4 |
AV_CPU_FLAG_FMA3 |
AV_CPU_FLAG_AVXSLOW))
return 32;
if (flags & (AV_CPU_FLAG_AESNI |
AV_CPU_FLAG_SSE42 |
AV_CPU_FLAG_SSE4 |
AV_CPU_FLAG_SSSE3 |
AV_CPU_FLAG_SSE3 |
AV_CPU_FLAG_SSE2 |
AV_CPU_FLAG_SSE |
AV_CPU_FLAG_ATOM |
AV_CPU_FLAG_SSSE3SLOW |
AV_CPU_FLAG_SSE3SLOW |
AV_CPU_FLAG_SSE2SLOW))
return 16;
return 8;
}

View file

@ -0,0 +1,113 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_CPU_H
#define AVUTIL_X86_CPU_H
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
#define AV_CPU_FLAG_AMD3DNOW AV_CPU_FLAG_3DNOW
#define AV_CPU_FLAG_AMD3DNOWEXT AV_CPU_FLAG_3DNOWEXT
#define X86_AMD3DNOW(flags) CPUEXT(flags, AMD3DNOW)
#define X86_AMD3DNOWEXT(flags) CPUEXT(flags, AMD3DNOWEXT)
#define X86_MMX(flags) CPUEXT(flags, MMX)
#define X86_MMXEXT(flags) CPUEXT(flags, MMXEXT)
#define X86_SSE(flags) CPUEXT(flags, SSE)
#define X86_SSE2(flags) CPUEXT(flags, SSE2)
#define X86_SSE2_FAST(flags) CPUEXT_FAST(flags, SSE2)
#define X86_SSE2_SLOW(flags) CPUEXT_SLOW(flags, SSE2)
#define X86_SSE3(flags) CPUEXT(flags, SSE3)
#define X86_SSE3_FAST(flags) CPUEXT_FAST(flags, SSE3)
#define X86_SSE3_SLOW(flags) CPUEXT_SLOW(flags, SSE3)
#define X86_SSSE3(flags) CPUEXT(flags, SSSE3)
#define X86_SSSE3_FAST(flags) CPUEXT_FAST(flags, SSSE3)
#define X86_SSSE3_SLOW(flags) CPUEXT_SLOW(flags, SSSE3)
#define X86_SSE4(flags) CPUEXT(flags, SSE4)
#define X86_SSE42(flags) CPUEXT(flags, SSE42)
#define X86_AVX(flags) CPUEXT(flags, AVX)
#define X86_AVX_FAST(flags) CPUEXT_FAST(flags, AVX)
#define X86_AVX_SLOW(flags) CPUEXT_SLOW(flags, AVX)
#define X86_XOP(flags) CPUEXT(flags, XOP)
#define X86_FMA3(flags) CPUEXT(flags, FMA3)
#define X86_FMA4(flags) CPUEXT(flags, FMA4)
#define X86_AVX2(flags) CPUEXT(flags, AVX2)
#define X86_AESNI(flags) CPUEXT(flags, AESNI)
#define X86_AVX512(flags) CPUEXT(flags, AVX512)
#define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW)
#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT)
#define EXTERNAL_MMX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMX)
#define EXTERNAL_MMXEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMXEXT)
#define EXTERNAL_SSE(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE)
#define EXTERNAL_SSE2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE2)
#define EXTERNAL_SSE2_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE2)
#define EXTERNAL_SSE2_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE2)
#define EXTERNAL_SSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSE4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4)
#define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42)
#define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX)
#define EXTERNAL_AVX_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, AVX)
#define EXTERNAL_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, AVX)
#define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP)
#define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3)
#define EXTERNAL_FMA3_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, FMA3, AVX)
#define EXTERNAL_FMA3_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, FMA3, AVX)
#define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4)
#define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2)
#define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX)
#define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX)
#define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
#define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
#define INLINE_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW)
#define INLINE_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT)
#define INLINE_MMX(flags) CPUEXT_SUFFIX(flags, _INLINE, MMX)
#define INLINE_MMXEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, MMXEXT)
#define INLINE_SSE(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE)
#define INLINE_SSE2(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE2)
#define INLINE_SSE2_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE2)
#define INLINE_SSE2_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE2)
#define INLINE_SSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE3)
#define INLINE_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSE3)
#define INLINE_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSE3)
#define INLINE_SSSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSSE3)
#define INLINE_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, SSSE3)
#define INLINE_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, SSSE3)
#define INLINE_SSE4(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE4)
#define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42)
#define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX)
#define INLINE_AVX_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _INLINE, AVX)
#define INLINE_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _INLINE, AVX)
#define INLINE_XOP(flags) CPUEXT_SUFFIX(flags, _INLINE, XOP)
#define INLINE_FMA3(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA3)
#define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4)
#define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2)
#define INLINE_AESNI(flags) CPUEXT_SUFFIX(flags, _INLINE, AESNI)
void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx);
void ff_cpu_xgetbv(int op, int *eax, int *edx);
int ff_cpu_cpuid_test(void);
#endif /* AVUTIL_X86_CPU_H */

View file

@ -0,0 +1,55 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_EMMS_H
#define AVUTIL_X86_EMMS_H
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
void avpriv_emms_asm(void);
#if HAVE_MMX_INLINE
# define emms_c emms_c
/**
* Empty mmx state.
* this must be called between any dsp function and float/double code.
* for example sin(); dsp->idct_put(); emms_c(); cos()
* Note, *alloc() and *free() also use float code in some libc implementations
* thus this also applies to them or any function using them.
*/
static av_always_inline void emms_c(void)
{
/* Some inlined functions may also use mmx instructions regardless of
* runtime cpuflags. With that in mind, we unconditionally empty the
* mmx state if the target cpu chosen at configure time supports it.
*/
#if !defined(__MMX__)
if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)
#endif
__asm__ volatile ("emms" ::: "memory");
}
#elif HAVE_MMX && HAVE_MM_EMPTY
# include <mmintrin.h>
# define emms_c _mm_empty
#elif HAVE_MMX_EXTERNAL
# define emms_c avpriv_emms_asm
#endif /* HAVE_MMX_INLINE */
#endif /* AVUTIL_X86_EMMS_H */

View file

@ -0,0 +1,35 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/fixed_dsp.h"
#include "cpu.h"
void ff_butterflies_fixed_sse2(int *src0, int *src1, int len);
av_cold void ff_fixed_dsp_init_x86(AVFixedDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
fdsp->butterflies_fixed = ff_butterflies_fixed_sse2;
}
}

View file

@ -0,0 +1,121 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
#include "asm.h"
void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
int len);
void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
int len);
void ff_vector_dmul_sse2(double *dst, const double *src0, const double *src1,
int len);
void ff_vector_dmul_avx(double *dst, const double *src0, const double *src1,
int len);
void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
int len);
void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
int len);
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
int len);
void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul,
int len);
void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul,
int len);
void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul,
int len);
void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
double mul, int len);
void ff_vector_fmul_window_3dnowext(float *dst, const float *src0,
const float *src1, const float *win, int len);
void ff_vector_fmul_window_sse(float *dst, const float *src0,
const float *src1, const float *win, int len);
void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
const float *src1, int len);
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
fdsp->vector_fmul_window = ff_vector_fmul_window_3dnowext;
}
if (EXTERNAL_SSE(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_sse;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
fdsp->vector_fmul_window = ff_vector_fmul_window_sse;
fdsp->vector_fmul_add = ff_vector_fmul_add_sse;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
fdsp->butterflies_float = ff_butterflies_float_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
fdsp->vector_dmul = ff_vector_dmul_sse2;
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_avx;
fdsp->vector_dmul = ff_vector_dmul_avx;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
}
}

View file

@ -0,0 +1,49 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "libavutil/cpu.h"
#include "libavutil/error.h"
#include "libavutil/imgutils.h"
#include "libavutil/imgutils_internal.h"
#include "libavutil/internal.h"
#include "cpu.h"
void ff_image_copy_plane_uc_from_sse4(uint8_t *dst, ptrdiff_t dst_linesize,
const uint8_t *src, ptrdiff_t src_linesize,
ptrdiff_t bytewidth, int height);
int ff_image_copy_plane_uc_from_x86(uint8_t *dst, ptrdiff_t dst_linesize,
const uint8_t *src, ptrdiff_t src_linesize,
ptrdiff_t bytewidth, int height)
{
int cpu_flags = av_get_cpu_flags();
ptrdiff_t bw_aligned = FFALIGN(bytewidth, 64);
if (EXTERNAL_SSE4(cpu_flags) &&
bw_aligned <= dst_linesize && bw_aligned <= src_linesize)
ff_image_copy_plane_uc_from_sse4(dst, dst_linesize, src, src_linesize,
bw_aligned, height);
else
return AVERROR(ENOSYS);
return 0;
}

View file

@ -0,0 +1,139 @@
/*
* Copyright (c) 2015 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_INTMATH_H
#define AVUTIL_X86_INTMATH_H
#include <stdint.h>
#include <stdlib.h>
#if HAVE_FAST_CLZ
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__INTEL_COMPILER)
#include <immintrin.h>
#endif
#endif
#include "config.h"
#if HAVE_FAST_CLZ
#if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER>=1216)) || defined(_MSC_VER)
# if defined(__INTEL_COMPILER)
# define ff_log2(x) (_bit_scan_reverse((x)|1))
# else
# define ff_log2 ff_log2_x86
static av_always_inline av_const int ff_log2_x86(unsigned int v)
{
unsigned long n;
_BitScanReverse(&n, v|1);
return n;
}
# endif
# define ff_log2_16bit av_log2
#if defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1700) && \
(defined(__BMI__) || !defined(__clang__)))
# define ff_ctz(v) _tzcnt_u32(v)
# if ARCH_X86_64
# define ff_ctzll(v) _tzcnt_u64(v)
# else
# define ff_ctzll ff_ctzll_x86
static av_always_inline av_const int ff_ctzll_x86(long long v)
{
return ((uint32_t)v == 0) ? _tzcnt_u32((uint32_t)(v >> 32)) + 32 : _tzcnt_u32((uint32_t)v);
}
# endif
#endif /* _MSC_VER */
#endif /* __INTEL_COMPILER */
#endif /* HAVE_FAST_CLZ */
#if defined(__GNUC__)
/* Our generic version of av_popcount is faster than GCC's built-in on
* CPUs that don't support the popcnt instruction.
*/
#if defined(__POPCNT__)
#define av_popcount __builtin_popcount
#if ARCH_X86_64
#define av_popcount64 __builtin_popcountll
#endif
#endif /* __POPCNT__ */
#if defined(__BMI2__)
#if AV_GCC_VERSION_AT_LEAST(5,1)
#define av_mod_uintp2 __builtin_ia32_bzhi_si
#elif HAVE_INLINE_ASM
/* GCC releases before 5.1.0 have a broken bzhi builtin, so for those we
* implement it using inline assembly
*/
#define av_mod_uintp2 av_mod_uintp2_bmi2
static av_always_inline av_const unsigned av_mod_uintp2_bmi2(unsigned a, unsigned p)
{
if (av_builtin_constant_p(p))
return a & ((1 << p) - 1);
else {
unsigned x;
__asm__ ("bzhi %2, %1, %0 \n\t" : "=r"(x) : "rm"(a), "r"(p));
return x;
}
}
#endif /* AV_GCC_VERSION_AT_LEAST */
#endif /* __BMI2__ */
#if defined(__SSE2__) && !defined(__INTEL_COMPILER)
#define av_clipd av_clipd_sse2
static av_always_inline av_const double av_clipd_sse2(double a, double amin, double amax)
{
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
if (amin > amax) abort();
#endif
__asm__ ("minsd %2, %0 \n\t"
"maxsd %1, %0 \n\t"
: "+&x"(a) : "xm"(amin), "xm"(amax));
return a;
}
#endif /* __SSE2__ */
#if defined(__SSE__) && !defined(__INTEL_COMPILER)
#define av_clipf av_clipf_sse
static av_always_inline av_const float av_clipf_sse(float a, float amin, float amax)
{
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
if (amin > amax) abort();
#endif
__asm__ ("minss %2, %0 \n\t"
"maxss %1, %0 \n\t"
: "+&x"(a) : "xm"(amin), "xm"(amax));
return a;
}
#endif /* __SSE__ */
#endif /* __GNUC__ */
#endif /* AVUTIL_X86_INTMATH_H */

View file

@ -0,0 +1,97 @@
/*
* Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_INTREADWRITE_H
#define AVUTIL_X86_INTREADWRITE_H
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#if HAVE_MMX
#if !HAVE_FAST_64BIT && defined(__MMX__)
#define AV_COPY64 AV_COPY64
static av_always_inline void AV_COPY64(void *d, const void *s)
{
__asm__("movq %1, %%mm0 \n\t"
"movq %%mm0, %0 \n\t"
: "=m"(*(uint64_t*)d)
: "m" (*(const uint64_t*)s)
: "mm0");
}
#define AV_SWAP64 AV_SWAP64
static av_always_inline void AV_SWAP64(void *a, void *b)
{
__asm__("movq %1, %%mm0 \n\t"
"movq %0, %%mm1 \n\t"
"movq %%mm0, %0 \n\t"
"movq %%mm1, %1 \n\t"
: "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
::"mm0", "mm1");
}
#define AV_ZERO64 AV_ZERO64
static av_always_inline void AV_ZERO64(void *d)
{
__asm__("pxor %%mm0, %%mm0 \n\t"
"movq %%mm0, %0 \n\t"
: "=m"(*(uint64_t*)d)
:: "mm0");
}
#endif /* !HAVE_FAST_64BIT && defined(__MMX__) */
#ifdef __SSE__
#define AV_COPY128 AV_COPY128
static av_always_inline void AV_COPY128(void *d, const void *s)
{
struct v {uint64_t v[2];};
__asm__("movaps %1, %%xmm0 \n\t"
"movaps %%xmm0, %0 \n\t"
: "=m"(*(struct v*)d)
: "m" (*(const struct v*)s)
: "xmm0");
}
#endif /* __SSE__ */
#ifdef __SSE2__
#define AV_ZERO128 AV_ZERO128
static av_always_inline void AV_ZERO128(void *d)
{
struct v {uint64_t v[2];};
__asm__("pxor %%xmm0, %%xmm0 \n\t"
"movdqa %%xmm0, %0 \n\t"
: "=m"(*(struct v*)d)
:: "xmm0");
}
#endif /* __SSE2__ */
#endif /* HAVE_MMX */
#endif /* AVUTIL_X86_INTREADWRITE_H */

View file

@ -0,0 +1,45 @@
/*
* linear least squares model
*
* Copyright (c) 2013 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/lls.h"
#include "libavutil/x86/cpu.h"
void ff_update_lls_sse2(LLSModel *m, const double *var);
void ff_update_lls_avx(LLSModel *m, const double *var);
void ff_update_lls_fma3(LLSModel *m, const double *var);
double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
av_cold void ff_init_lls_x86(LLSModel *m)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
m->update_lls = ff_update_lls_sse2;
if (m->indep_count >= 4)
m->evaluate_lls = ff_evaluate_lls_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
m->update_lls = ff_update_lls_avx;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
m->update_lls = ff_update_lls_fma3;
}
}

View file

@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_PIXELUTILS_H
#define AVUTIL_X86_PIXELUTILS_H
#include "libavutil/pixelutils.h"
void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned);
#endif /* AVUTIL_X86_PIXELUTILS_H */

View file

@ -0,0 +1,94 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "pixelutils.h"
#include "cpu.h"
int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
const uint8_t *src2, ptrdiff_t stride2);
void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
sad[2] = ff_pixelutils_sad_8x8_mmx;
}
// The best way to use SSE2 would be to do 2 SADs in parallel,
// but we'd have to modify the pixelutils API to return SIMD functions.
// It's probably not faster to shuffle data around
// to get two lines of 8 pixels into a single 16byte register,
// so just use the MMX 8x8 version even when SSE2 is available.
if (EXTERNAL_MMXEXT(cpu_flags)) {
sad[2] = ff_pixelutils_sad_8x8_mmxext;
sad[3] = ff_pixelutils_sad_16x16_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
switch (aligned) {
case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned
case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned
case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
switch (aligned) {
case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned
case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned
case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned
}
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
switch (aligned) {
case 0: sad[4] = ff_pixelutils_sad_32x32_avx2; break; // src1 unaligned, src2 unaligned
case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1 aligned, src2 unaligned
case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1 aligned, src2 aligned
}
}
}

View file

@ -0,0 +1,50 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_TIMER_H
#define AVUTIL_X86_TIMER_H
#include <stdint.h>
#if HAVE_INLINE_ASM
#define FF_TIMER_UNITS "decicycles"
#define AV_READ_TIME read_time
static inline uint64_t read_time(void)
{
uint32_t a, d;
__asm__ volatile(
#if ARCH_X86_64 || defined(__SSE2__)
"lfence \n\t"
#endif
"rdtsc \n\t"
: "=a" (a), "=d" (d));
return ((uint64_t)d << 32) + a;
}
#elif HAVE_RDTSC
#include <intrin.h>
#define AV_READ_TIME __rdtsc
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_TIMER_H */

View file

@ -0,0 +1,78 @@
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_W64XMMTEST_H
#define AVUTIL_X86_W64XMMTEST_H
#include <inttypes.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include "libavutil/bswap.h"
#define storexmmregs(mem) \
__asm__ volatile( \
"movups %%xmm6 , 0x00(%0)\n\t" \
"movups %%xmm7 , 0x10(%0)\n\t" \
"movups %%xmm8 , 0x20(%0)\n\t" \
"movups %%xmm9 , 0x30(%0)\n\t" \
"movups %%xmm10, 0x40(%0)\n\t" \
"movups %%xmm11, 0x50(%0)\n\t" \
"movups %%xmm12, 0x60(%0)\n\t" \
"movups %%xmm13, 0x70(%0)\n\t" \
"movups %%xmm14, 0x80(%0)\n\t" \
"movups %%xmm15, 0x90(%0)\n\t" \
:: "r"(mem) : "memory")
#define testxmmclobbers(func, ctx, ...) \
uint64_t xmm[2][10][2]; \
int ret; \
storexmmregs(xmm[0]); \
ret = __real_ ## func(ctx, __VA_ARGS__); \
storexmmregs(xmm[1]); \
if (memcmp(xmm[0], xmm[1], sizeof(xmm[0]))) { \
int i; \
av_log(ctx, AV_LOG_ERROR, \
"XMM REGS CLOBBERED IN %s!\n", #func); \
for (i = 0; i < 10; i ++) \
if (xmm[0][i][0] != xmm[1][i][0] || \
xmm[0][i][1] != xmm[1][i][1]) { \
av_log(ctx, AV_LOG_ERROR, \
"xmm%-2d = %016"PRIx64"%016"PRIx64"\n", \
6 + i, av_bswap64(xmm[0][i][0]), \
av_bswap64(xmm[0][i][1])); \
av_log(ctx, AV_LOG_ERROR, \
" -> %016"PRIx64"%016"PRIx64"\n", \
av_bswap64(xmm[1][i][0]), \
av_bswap64(xmm[1][i][1])); \
} \
abort(); \
} \
return ret
#define wrap(func) \
int __real_ ## func; \
int __wrap_ ## func; \
int __wrap_ ## func
#endif /* AVUTIL_X86_W64XMMTEST_H */