Close #2065: Update libyuv to fix a linker error when building libyuv as a DLL with Visual Studio 2015.
git-svn-id: https://svn.pjsip.org/repos/pjproject/trunk@5699 74dad513-b988-da41-8d7b-12977e46ad98
This commit is contained in:
parent
c8847d0d12
commit
77545dfdac
|
@ -1,30 +1,4 @@
|
|||
Notes:
|
||||
|
||||
* Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 27 July 2017.
|
||||
* Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 17 November 2017.
|
||||
|
||||
* All code is compilable, except for compare_win.cc
|
||||
- Use older version (https://chromium.googlesource.com/libyuv/libyuv/+/baf6a3c1bd385e7ffe6b7634560e71fb49e4f589%5E%21/)
|
||||
Since there's a compiler error on:
|
||||
--------------------------------------------------------------------------------------
|
||||
pmulld xmm0,xmm6
|
||||
--------------------------------------------------------------------------------------
|
||||
|
||||
- On VS2015, error C2024: 'alignas' attribute applies to variables, data members and tag types only
|
||||
--------------------------------------------------------------------------------------
|
||||
__declspec(naked) __declspec(align(16))
|
||||
|
||||
Change to :
|
||||
|
||||
__declspec(naked)
|
||||
--------------------------------------------------------------------------------------
|
||||
|
||||
* Added these lines to file include/libyuv/basic_types.h:
|
||||
--
|
||||
#if _MSC_VER==1400
|
||||
# include <stdint.h> // for uint8_t
|
||||
#endif
|
||||
...
|
||||
#if defined(_MSC_VER)
|
||||
# pragma warning(disable:4996) // This function or variable may be unsafe.
|
||||
#endif
|
||||
--
|
||||
|
|
|
@ -14,18 +14,11 @@
|
|||
#include <stddef.h> // for NULL, size_t
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1600)
|
||||
#if _MSC_VER==1400
|
||||
# include <stdint.h> // for uint8_t
|
||||
#endif
|
||||
#include <sys/types.h> // for uintptr_t on x86
|
||||
#else
|
||||
#include <stdint.h> // for uintptr_t
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# pragma warning(disable:4996) // This function or variable may be unsafe.
|
||||
#endif
|
||||
|
||||
#ifndef GG_LONGLONG
|
||||
#ifndef INT_TYPES_DEFINED
|
||||
#define INT_TYPES_DEFINED
|
||||
|
|
|
@ -19,7 +19,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
|
||||
|
@ -42,6 +42,7 @@ extern "C" {
|
|||
#endif // clang >= 3.4
|
||||
#endif // __clang__
|
||||
|
||||
// The following are available for Visual C:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
|
||||
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
|
||||
#define HAS_HASHDJB2_AVX2
|
||||
|
@ -52,7 +53,7 @@ extern "C" {
|
|||
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
|
||||
#define HAS_HASHDJB2_SSE41
|
||||
#define HAS_SUMSQUAREERROR_SSE2
|
||||
#define HAS_HAMMINGDISTANCE_X86
|
||||
#define HAS_HAMMINGDISTANCE_SSE42
|
||||
#endif
|
||||
|
||||
// The following are available for Visual C and clangcl 32 bit:
|
||||
|
@ -62,6 +63,18 @@ extern "C" {
|
|||
#define HAS_SUMSQUAREERROR_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available for GCC and clangcl 64 bit:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
#define HAS_HAMMINGDISTANCE_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are available for GCC and clangcl 64 bit:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
#define HAS_HAMMINGDISTANCE_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available for Neon:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
|
@ -69,14 +82,23 @@ extern "C" {
|
|||
#define HAS_HAMMINGDISTANCE_NEON
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
||||
#define HAS_HAMMINGDISTANCE_MSA
|
||||
#define HAS_SUMSQUAREERROR_MSA
|
||||
#endif
|
||||
|
||||
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
|
||||
|
||||
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 SumSquareError_MSA(const uint8* src_a, const uint8* src_b, int count);
|
||||
|
||||
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
|
||||
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
|
||||
|
|
|
@ -188,6 +188,30 @@ int I420ToRAW(const uint8* src_y,
|
|||
int width,
|
||||
int height);
|
||||
|
||||
LIBYUV_API
|
||||
int H420ToRGB24(const uint8* src_y,
|
||||
int src_stride_y,
|
||||
const uint8* src_u,
|
||||
int src_stride_u,
|
||||
const uint8* src_v,
|
||||
int src_stride_v,
|
||||
uint8* dst_frame,
|
||||
int dst_stride_frame,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
LIBYUV_API
|
||||
int H420ToRAW(const uint8* src_y,
|
||||
int src_stride_y,
|
||||
const uint8* src_u,
|
||||
int src_stride_u,
|
||||
const uint8* src_v,
|
||||
int src_stride_v,
|
||||
uint8* dst_frame,
|
||||
int dst_stride_frame,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
LIBYUV_API
|
||||
int I420ToRGB565(const uint8* src_y,
|
||||
int src_stride_y,
|
||||
|
|
|
@ -36,15 +36,19 @@ static const int kCpuHasAVX = 0x200;
|
|||
static const int kCpuHasAVX2 = 0x400;
|
||||
static const int kCpuHasERMS = 0x800;
|
||||
static const int kCpuHasFMA3 = 0x1000;
|
||||
static const int kCpuHasAVX3 = 0x2000;
|
||||
static const int kCpuHasF16C = 0x4000;
|
||||
|
||||
// 0x8000 reserved for future X86 flags.
|
||||
static const int kCpuHasF16C = 0x2000;
|
||||
static const int kCpuHasGFNI = 0x4000;
|
||||
static const int kCpuHasAVX512BW = 0x8000;
|
||||
static const int kCpuHasAVX512VL = 0x10000;
|
||||
static const int kCpuHasAVX512VBMI = 0x20000;
|
||||
static const int kCpuHasAVX512VBMI2 = 0x40000;
|
||||
static const int kCpuHasAVX512VBITALG = 0x80000;
|
||||
static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
|
||||
|
||||
// These flags are only valid on MIPS processors.
|
||||
static const int kCpuHasMIPS = 0x10000;
|
||||
static const int kCpuHasDSPR2 = 0x20000;
|
||||
static const int kCpuHasMSA = 0x40000;
|
||||
static const int kCpuHasMIPS = 0x200000;
|
||||
static const int kCpuHasDSPR2 = 0x400000;
|
||||
static const int kCpuHasMSA = 0x800000;
|
||||
|
||||
// Optional init function. TestCpuFlag does an auto-init.
|
||||
// Returns cpu_info flags.
|
||||
|
|
|
@ -69,6 +69,32 @@ void MergeUVPlane(const uint8* src_u,
|
|||
int width,
|
||||
int height);
|
||||
|
||||
// Split interleaved RGB plane into separate R, G and B planes.
|
||||
LIBYUV_API
|
||||
void SplitRGBPlane(const uint8* src_rgb,
|
||||
int src_stride_rgb,
|
||||
uint8* dst_r,
|
||||
int dst_stride_r,
|
||||
uint8* dst_g,
|
||||
int dst_stride_g,
|
||||
uint8* dst_b,
|
||||
int dst_stride_b,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Merge separate R, G and B planes into one interleaved RGB plane.
|
||||
LIBYUV_API
|
||||
void MergeRGBPlane(const uint8* src_r,
|
||||
int src_stride_r,
|
||||
const uint8* src_g,
|
||||
int src_stride_g,
|
||||
const uint8* src_b,
|
||||
int src_stride_b,
|
||||
uint8* dst_rgb,
|
||||
int dst_stride_rgb,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Copy I400. Supports inverting.
|
||||
LIBYUV_API
|
||||
int I400ToI400(const uint8* src_y,
|
||||
|
@ -720,7 +746,7 @@ int I420Interpolate(const uint8* src0_y,
|
|||
int interpolation);
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
|
||||
|
|
|
@ -19,7 +19,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
|
||||
|
@ -29,7 +29,7 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
// The following are available for Visual C and clangcl 32 bit:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||
#define HAS_TRANSPOSEWX8_SSSE3
|
||||
#define HAS_TRANSPOSEUVWX8_SSE2
|
||||
#endif
|
||||
|
|
|
@ -31,7 +31,7 @@ extern "C" {
|
|||
var = 0
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
|
||||
|
@ -264,6 +264,23 @@ extern "C" {
|
|||
#define HAS_I422TOARGBROW_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are available for gcc/clang x86 platforms:
|
||||
// TODO(fbarchard): Port to Visual C
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
#define HAS_MERGERGBROW_SSSE3
|
||||
#define HAS_SPLITRGBROW_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are available for AVX2 gcc/clang x86 platforms:
|
||||
// TODO(fbarchard): Port to Visual C
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
|
||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||
#define HAS_MERGEUVROW_16_AVX2
|
||||
#define HAS_MULTIPLYROW_16_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && \
|
||||
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
|
||||
|
@ -323,6 +340,7 @@ extern "C" {
|
|||
#define HAS_RGBATOUVROW_NEON
|
||||
#define HAS_RGBATOYROW_NEON
|
||||
#define HAS_SETROW_NEON
|
||||
#define HAS_SPLITRGBROW_NEON
|
||||
#define HAS_SPLITUVROW_NEON
|
||||
#define HAS_UYVYTOARGBROW_NEON
|
||||
#define HAS_UYVYTOUV422ROW_NEON
|
||||
|
@ -354,6 +372,11 @@ extern "C" {
|
|||
#define HAS_SOBELYROW_NEON
|
||||
#endif
|
||||
|
||||
// The following are available on AArch64 platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
#define HAS_SCALESUMSAMPLES_NEON
|
||||
#endif
|
||||
|
||||
// The following are available on Mips platforms:
|
||||
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
|
||||
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
|
||||
|
@ -385,72 +408,82 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
||||
#define HAS_ARGBMIRRORROW_MSA
|
||||
#define HAS_I422TOUYVYROW_MSA
|
||||
#define HAS_I422TOYUY2ROW_MSA
|
||||
#define HAS_MIRRORROW_MSA
|
||||
#define HAS_UYVYTOUVROW_MSA
|
||||
#define HAS_UYVYTOYROW_MSA
|
||||
#define HAS_YUY2TOUV422ROW_MSA
|
||||
#define HAS_YUY2TOUVROW_MSA
|
||||
#define HAS_YUY2TOYROW_MSA
|
||||
#define HAS_ABGRTOUVROW_MSA
|
||||
#define HAS_ABGRTOYROW_MSA
|
||||
#define HAS_ARGB1555TOARGBROW_MSA
|
||||
#define HAS_ARGB1555TOUVROW_MSA
|
||||
#define HAS_ARGB1555TOYROW_MSA
|
||||
#define HAS_ARGB4444TOARGBROW_MSA
|
||||
#define HAS_ARGBTOYROW_MSA
|
||||
#define HAS_ARGBTOUVROW_MSA
|
||||
#define HAS_I422TOARGBROW_MSA
|
||||
#define HAS_I422TORGBAROW_MSA
|
||||
#define HAS_I422ALPHATOARGBROW_MSA
|
||||
#define HAS_I422TORGB24ROW_MSA
|
||||
#define HAS_ARGBTORGB24ROW_MSA
|
||||
#define HAS_ARGBTORAWROW_MSA
|
||||
#define HAS_ARGBTORGB565ROW_MSA
|
||||
#define HAS_ARGBADDROW_MSA
|
||||
#define HAS_ARGBATTENUATEROW_MSA
|
||||
#define HAS_ARGBBLENDROW_MSA
|
||||
#define HAS_ARGBCOLORMATRIXROW_MSA
|
||||
#define HAS_ARGBEXTRACTALPHAROW_MSA
|
||||
#define HAS_ARGBGRAYROW_MSA
|
||||
#define HAS_ARGBMIRRORROW_MSA
|
||||
#define HAS_ARGBMULTIPLYROW_MSA
|
||||
#define HAS_ARGBQUANTIZEROW_MSA
|
||||
#define HAS_ARGBSEPIAROW_MSA
|
||||
#define HAS_ARGBSETROW_MSA
|
||||
#define HAS_ARGBSHADEROW_MSA
|
||||
#define HAS_ARGBSHUFFLEROW_MSA
|
||||
#define HAS_ARGBSUBTRACTROW_MSA
|
||||
#define HAS_ARGBTOARGB1555ROW_MSA
|
||||
#define HAS_ARGBTOARGB4444ROW_MSA
|
||||
#define HAS_ARGBTOUV444ROW_MSA
|
||||
#define HAS_ARGBMULTIPLYROW_MSA
|
||||
#define HAS_ARGBADDROW_MSA
|
||||
#define HAS_ARGBSUBTRACTROW_MSA
|
||||
#define HAS_ARGBATTENUATEROW_MSA
|
||||
#define HAS_ARGBTORAWROW_MSA
|
||||
#define HAS_ARGBTORGB24ROW_MSA
|
||||
#define HAS_ARGBTORGB565DITHERROW_MSA
|
||||
#define HAS_ARGBSHUFFLEROW_MSA
|
||||
#define HAS_ARGBSHADEROW_MSA
|
||||
#define HAS_ARGBGRAYROW_MSA
|
||||
#define HAS_ARGBSEPIAROW_MSA
|
||||
#define HAS_ARGB1555TOARGBROW_MSA
|
||||
#define HAS_RGB565TOARGBROW_MSA
|
||||
#define HAS_RGB24TOARGBROW_MSA
|
||||
#define HAS_RAWTOARGBROW_MSA
|
||||
#define HAS_ARGB1555TOYROW_MSA
|
||||
#define HAS_RGB565TOYROW_MSA
|
||||
#define HAS_RGB24TOYROW_MSA
|
||||
#define HAS_RAWTOYROW_MSA
|
||||
#define HAS_ARGB1555TOUVROW_MSA
|
||||
#define HAS_RGB565TOUVROW_MSA
|
||||
#define HAS_RGB24TOUVROW_MSA
|
||||
#define HAS_RAWTOUVROW_MSA
|
||||
#define HAS_ARGBTORGB565ROW_MSA
|
||||
#define HAS_ARGBTOUV444ROW_MSA
|
||||
#define HAS_ARGBTOUVJROW_MSA
|
||||
#define HAS_ARGBTOUVROW_MSA
|
||||
#define HAS_ARGBTOYJROW_MSA
|
||||
#define HAS_ARGBTOYROW_MSA
|
||||
#define HAS_BGRATOUVROW_MSA
|
||||
#define HAS_BGRATOYROW_MSA
|
||||
#define HAS_HALFFLOATROW_MSA
|
||||
#define HAS_I400TOARGBROW_MSA
|
||||
#define HAS_I422ALPHATOARGBROW_MSA
|
||||
#define HAS_I422TOARGBROW_MSA
|
||||
#define HAS_I422TORGB24ROW_MSA
|
||||
#define HAS_I422TORGBAROW_MSA
|
||||
#define HAS_I422TOUYVYROW_MSA
|
||||
#define HAS_I422TOYUY2ROW_MSA
|
||||
#define HAS_I444TOARGBROW_MSA
|
||||
#define HAS_INTERPOLATEROW_MSA
|
||||
#define HAS_J400TOARGBROW_MSA
|
||||
#define HAS_MERGEUVROW_MSA
|
||||
#define HAS_MIRRORROW_MSA
|
||||
#define HAS_MIRRORUVROW_MSA
|
||||
#define HAS_NV12TOARGBROW_MSA
|
||||
#define HAS_NV12TORGB565ROW_MSA
|
||||
#define HAS_NV21TOARGBROW_MSA
|
||||
#define HAS_RAWTOARGBROW_MSA
|
||||
#define HAS_RAWTORGB24ROW_MSA
|
||||
#define HAS_RAWTOUVROW_MSA
|
||||
#define HAS_RAWTOYROW_MSA
|
||||
#define HAS_RGB24TOARGBROW_MSA
|
||||
#define HAS_RGB24TOUVROW_MSA
|
||||
#define HAS_RGB24TOYROW_MSA
|
||||
#define HAS_RGB565TOARGBROW_MSA
|
||||
#define HAS_RGB565TOUVROW_MSA
|
||||
#define HAS_RGB565TOYROW_MSA
|
||||
#define HAS_RGBATOUVROW_MSA
|
||||
#define HAS_RGBATOYROW_MSA
|
||||
#define HAS_SETROW_MSA
|
||||
#define HAS_SOBELROW_MSA
|
||||
#define HAS_SOBELTOPLANEROW_MSA
|
||||
#define HAS_SOBELXROW_MSA
|
||||
#define HAS_SOBELXYROW_MSA
|
||||
#define HAS_ARGBTOYJROW_MSA
|
||||
#define HAS_BGRATOYROW_MSA
|
||||
#define HAS_ABGRTOYROW_MSA
|
||||
#define HAS_RGBATOYROW_MSA
|
||||
#define HAS_ARGBTOUVJROW_MSA
|
||||
#define HAS_BGRATOUVROW_MSA
|
||||
#define HAS_ABGRTOUVROW_MSA
|
||||
#define HAS_RGBATOUVROW_MSA
|
||||
#define HAS_I444TOARGBROW_MSA
|
||||
#define HAS_I400TOARGBROW_MSA
|
||||
#define HAS_J400TOARGBROW_MSA
|
||||
#define HAS_YUY2TOARGBROW_MSA
|
||||
#define HAS_SOBELYROW_MSA
|
||||
#define HAS_SPLITUVROW_MSA
|
||||
#define HAS_UYVYTOARGBROW_MSA
|
||||
#define HAS_INTERPOLATEROW_MSA
|
||||
#define HAS_ARGBSETROW_MSA
|
||||
#define HAS_RAWTORGB24ROW_MSA
|
||||
#define HAS_MERGEUVROW_MSA
|
||||
#define HAS_UYVYTOUVROW_MSA
|
||||
#define HAS_UYVYTOYROW_MSA
|
||||
#define HAS_YUY2TOARGBROW_MSA
|
||||
#define HAS_YUY2TOUV422ROW_MSA
|
||||
#define HAS_YUY2TOUVROW_MSA
|
||||
#define HAS_YUY2TOYROW_MSA
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
|
||||
|
@ -1345,6 +1378,10 @@ void MirrorUVRow_DSPR2(const uint8* src_uv,
|
|||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
int width);
|
||||
void MirrorUVRow_MSA(const uint8* src_uv,
|
||||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
int width);
|
||||
void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
|
||||
|
@ -1374,6 +1411,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv,
|
|||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
int width);
|
||||
void SplitUVRow_MSA(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
|
||||
void SplitUVRow_Any_SSE2(const uint8* src_uv,
|
||||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
|
@ -1390,6 +1428,10 @@ void SplitUVRow_Any_DSPR2(const uint8* src_uv,
|
|||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
int width);
|
||||
void SplitUVRow_Any_MSA(const uint8* src_uv,
|
||||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
int width);
|
||||
|
||||
void MergeUVRow_C(const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
|
@ -1428,6 +1470,75 @@ void MergeUVRow_Any_MSA(const uint8* src_u,
|
|||
uint8* dst_uv,
|
||||
int width);
|
||||
|
||||
void SplitRGBRow_C(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_SSSE3(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_NEON(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_Any_SSSE3(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_Any_NEON(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width);
|
||||
|
||||
void MergeRGBRow_C(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width);
|
||||
void MergeRGBRow_SSSE3(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width);
|
||||
void MergeRGBRow_NEON(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width);
|
||||
void MergeRGBRow_Any_SSSE3(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width);
|
||||
void MergeRGBRow_Any_NEON(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width);
|
||||
|
||||
void MergeUVRow_16_C(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int scale, /* 64 for 10 bit */
|
||||
int width);
|
||||
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int scale,
|
||||
int width);
|
||||
|
||||
void MultiplyRow_16_AVX2(const uint16* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
|
||||
|
||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
|
||||
|
@ -1454,6 +1565,7 @@ void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
|
|||
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
|
||||
void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width);
|
||||
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
|
||||
void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width);
|
||||
void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb,
|
||||
uint8* dst_a,
|
||||
int width);
|
||||
|
@ -1463,6 +1575,9 @@ void ARGBExtractAlphaRow_Any_AVX2(const uint8* src_argb,
|
|||
void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb,
|
||||
uint8* dst_a,
|
||||
int width);
|
||||
void ARGBExtractAlphaRow_Any_MSA(const uint8* src_argb,
|
||||
uint8* dst_a,
|
||||
int width);
|
||||
|
||||
void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
|
||||
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
|
||||
|
@ -1475,6 +1590,7 @@ void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y,
|
|||
int width);
|
||||
|
||||
void SetRow_C(uint8* dst, uint8 v8, int count);
|
||||
void SetRow_MSA(uint8* dst, uint8 v8, int count);
|
||||
void SetRow_X86(uint8* dst, uint8 v8, int count);
|
||||
void SetRow_ERMS(uint8* dst, uint8 v8, int count);
|
||||
void SetRow_NEON(uint8* dst, uint8 v8, int count);
|
||||
|
@ -2122,6 +2238,10 @@ void ARGBBlendRow_NEON(const uint8* src_argb,
|
|||
const uint8* src_argb1,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void ARGBBlendRow_MSA(const uint8* src_argb,
|
||||
const uint8* src_argb1,
|
||||
uint8* dst_argb,
|
||||
int width);
|
||||
void ARGBBlendRow_C(const uint8* src_argb,
|
||||
const uint8* src_argb1,
|
||||
uint8* dst_argb,
|
||||
|
@ -2835,6 +2955,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
|
|||
uint8* dst_argb,
|
||||
const int8* matrix_argb,
|
||||
int width);
|
||||
void ARGBColorMatrixRow_MSA(const uint8* src_argb,
|
||||
uint8* dst_argb,
|
||||
const int8* matrix_argb,
|
||||
int width);
|
||||
|
||||
void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
|
||||
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
|
||||
|
@ -2857,6 +2981,11 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
|
|||
int interval_size,
|
||||
int interval_offset,
|
||||
int width);
|
||||
void ARGBQuantizeRow_MSA(uint8* dst_argb,
|
||||
int scale,
|
||||
int interval_size,
|
||||
int interval_offset,
|
||||
int width);
|
||||
|
||||
void ARGBShadeRow_C(const uint8* src_argb,
|
||||
uint8* dst_argb,
|
||||
|
@ -2990,6 +3119,11 @@ void SobelXRow_NEON(const uint8* src_y0,
|
|||
const uint8* src_y2,
|
||||
uint8* dst_sobelx,
|
||||
int width);
|
||||
void SobelXRow_MSA(const uint8* src_y0,
|
||||
const uint8* src_y1,
|
||||
const uint8* src_y2,
|
||||
uint8* dst_sobelx,
|
||||
int width);
|
||||
void SobelYRow_C(const uint8* src_y0,
|
||||
const uint8* src_y1,
|
||||
uint8* dst_sobely,
|
||||
|
@ -3002,6 +3136,10 @@ void SobelYRow_NEON(const uint8* src_y0,
|
|||
const uint8* src_y1,
|
||||
uint8* dst_sobely,
|
||||
int width);
|
||||
void SobelYRow_MSA(const uint8* src_y0,
|
||||
const uint8* src_y1,
|
||||
uint8* dst_sobely,
|
||||
int width);
|
||||
void SobelRow_C(const uint8* src_sobelx,
|
||||
const uint8* src_sobely,
|
||||
uint8* dst_argb,
|
||||
|
@ -3132,6 +3270,11 @@ void HalfFloat1Row_Any_NEON(const uint16* src,
|
|||
uint16* dst,
|
||||
float scale,
|
||||
int width);
|
||||
void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_MSA(const uint16* src,
|
||||
uint16* dst,
|
||||
float scale,
|
||||
int width);
|
||||
|
||||
void ARGBLumaColorTableRow_C(const uint8* src_argb,
|
||||
uint8* dst_argb,
|
||||
|
@ -3144,6 +3287,19 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
|
|||
const uint8* luma,
|
||||
uint32 lumacoeff);
|
||||
|
||||
float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width);
|
||||
float ScaleMaxSamples_NEON(const float* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width);
|
||||
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
|
||||
float ScaleSumSamples_NEON(const float* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width);
|
||||
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
|
||||
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
|
|
@ -20,7 +20,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
#if defined(__pnacl__) || defined(__CLR_VER) || \
|
||||
(defined(__i386__) && !defined(__SSE2__))
|
||||
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
|
||||
#define LIBYUV_DISABLE_X86
|
||||
#endif
|
||||
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
|
||||
|
@ -105,12 +105,16 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
||||
#define HAS_SCALEADDROW_MSA
|
||||
#define HAS_SCALEARGBCOLS_MSA
|
||||
#define HAS_SCALEARGBFILTERCOLS_MSA
|
||||
#define HAS_SCALEARGBROWDOWN2_MSA
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_MSA
|
||||
#define HAS_SCALEFILTERCOLS_MSA
|
||||
#define HAS_SCALEROWDOWN2_MSA
|
||||
#define HAS_SCALEROWDOWN4_MSA
|
||||
#define HAS_SCALEROWDOWN34_MSA
|
||||
#define HAS_SCALEROWDOWN38_MSA
|
||||
#define HAS_SCALEADDROW_MSA
|
||||
#define HAS_SCALEROWDOWN4_MSA
|
||||
#endif
|
||||
|
||||
// Scale ARGB vertically with bilinear interpolation.
|
||||
|
@ -546,6 +550,26 @@ void ScaleARGBCols_Any_NEON(uint8* dst_argb,
|
|||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
void ScaleARGBFilterCols_MSA(uint8* dst_argb,
|
||||
const uint8* src_argb,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
void ScaleARGBCols_MSA(uint8* dst_argb,
|
||||
const uint8* src_argb,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
void ScaleARGBFilterCols_Any_MSA(uint8* dst_argb,
|
||||
const uint8* src_argb,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
void ScaleARGBCols_Any_MSA(uint8* dst_argb,
|
||||
const uint8* src_argb,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
|
||||
// ARGB Row functions
|
||||
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
||||
|
@ -885,6 +909,24 @@ void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
|
|||
uint8_t* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
|
||||
void ScaleFilterCols_MSA(uint8* dst_ptr,
|
||||
const uint8* src_ptr,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
void ScaleRowDown34_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleRowDown34_0_Box_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleRowDown34_1_Box_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr,
|
||||
int dst_width);
|
||||
|
||||
void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
|
@ -920,6 +962,23 @@ void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
|
|||
void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width);
|
||||
void ScaleFilterCols_Any_MSA(uint8* dst_ptr,
|
||||
const uint8* src_ptr,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
void ScaleRowDown34_Any_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleRowDown34_0_Box_Any_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleRowDown34_1_Box_Any_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr,
|
||||
int dst_width);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
|
|
@ -11,6 +11,6 @@
|
|||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1662
|
||||
#define LIBYUV_VERSION 1678
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
|
|
@ -110,12 +110,17 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
|
|||
return fourcc;
|
||||
}
|
||||
|
||||
// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes.
|
||||
// So actual maximum is 1 less loop, which is 65536 - 32 bytes.
|
||||
|
||||
LIBYUV_API
|
||||
uint64 ComputeHammingDistance(const uint8* src_a,
|
||||
const uint8* src_b,
|
||||
int count) {
|
||||
const int kBlockSize = 65536;
|
||||
int remainder = count & (kBlockSize - 1) & ~31;
|
||||
const int kBlockSize = 1 << 15; // 32768;
|
||||
const int kSimdSize = 64;
|
||||
// Use SIMD for the largest multiple of 64 bytes, and C for the remainder.
|
||||
int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
|
||||
uint64 diff = 0;
|
||||
int i;
|
||||
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
|
||||
|
@ -125,9 +130,14 @@ uint64 ComputeHammingDistance(const uint8* src_a,
|
|||
HammingDistance = HammingDistance_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_X86)
|
||||
if (TestCpuFlag(kCpuHasX86)) {
|
||||
HammingDistance = HammingDistance_X86;
|
||||
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
HammingDistance = HammingDistance_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_SSE42)
|
||||
if (TestCpuFlag(kCpuHasSSE42)) {
|
||||
HammingDistance = HammingDistance_SSE42;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_AVX2)
|
||||
|
@ -135,6 +145,11 @@ uint64 ComputeHammingDistance(const uint8* src_a,
|
|||
HammingDistance = HammingDistance_AVX2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
HammingDistance = HammingDistance_MSA;
|
||||
}
|
||||
#endif
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+ : diff)
|
||||
#endif
|
||||
|
@ -148,7 +163,7 @@ uint64 ComputeHammingDistance(const uint8* src_a,
|
|||
src_a += remainder;
|
||||
src_b += remainder;
|
||||
}
|
||||
remainder = count & 31;
|
||||
remainder = count & (kSimdSize - 1);
|
||||
if (remainder) {
|
||||
diff += HammingDistance_C(src_a, src_b, remainder);
|
||||
}
|
||||
|
@ -186,6 +201,11 @@ uint64 ComputeSumSquareError(const uint8* src_a,
|
|||
SumSquareError = SumSquareError_AVX2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
SumSquareError = SumSquareError_MSA;
|
||||
}
|
||||
#endif
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+ : sse)
|
||||
#endif
|
||||
|
|
|
@ -18,7 +18,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
#if ORIGINAL_OPT
|
||||
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 HammingDistance_C1(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff = 0u;
|
||||
|
||||
int i;
|
||||
|
@ -58,6 +58,16 @@ uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
|
|||
src_a += 4;
|
||||
src_b += 4;
|
||||
}
|
||||
|
||||
for (; i < count; ++i) {
|
||||
uint32 x = *src_a ^ *src_b;
|
||||
uint32 u = x - ((x >> 1) & 0x55);
|
||||
u = ((u >> 2) & 0x33) + (u & 0x33);
|
||||
diff += (u + (u >> 4)) & 0x0f;
|
||||
src_a += 1;
|
||||
src_b += 1;
|
||||
}
|
||||
|
||||
return diff;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,18 +22,210 @@ extern "C" {
|
|||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
|
||||
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
|
||||
#if defined(__x86_64__)
|
||||
// Counts the bits that differ between src_a and src_b (x86-64 inline asm).
// Each loop pass XORs four 64-bit words (32 bytes) of the two buffers and adds
// the popcnt of each word to its own accumulator (%3, r8, r9, r10); the four
// accumulators are summed after the loop.
// The loop is bottom-tested (sub 32 from count, repeat while > 0), so one
// 32-byte pass always executes regardless of count.
// NOTE(review): presumably callers only pass a positive multiple of 32 bytes -
// confirm against the caller.
// Returns the 64-bit total truncated to uint32.
uint32 HammingDistance_SSE42(const uint8* src_a,
|
||||
const uint8* src_b,
|
||||
int count) {
|
||||
uint64 diff = 0u;
|
||||
|
||||
asm volatile(
|
||||
// Zero the four partial sums.
"xor %3,%3 \n"
|
||||
"xor %%r8,%%r8 \n"
|
||||
"xor %%r9,%%r9 \n"
|
||||
"xor %%r10,%%r10 \n"
|
||||
|
||||
// Process 32 bytes per loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
// Bytes 0-15: load from src_a, XOR with src_b, count set bits.
"mov (%0),%%rcx \n"
|
||||
"mov 0x8(%0),%%rdx \n"
|
||||
"xor (%1),%%rcx \n"
|
||||
"xor 0x8(%1),%%rdx \n"
|
||||
"popcnt %%rcx,%%rcx \n"
|
||||
"popcnt %%rdx,%%rdx \n"
|
||||
// Bytes 16-31: same sequence using rsi/rdi as scratch.
"mov 0x10(%0),%%rsi \n"
|
||||
"mov 0x18(%0),%%rdi \n"
|
||||
"xor 0x10(%1),%%rsi \n"
|
||||
"xor 0x18(%1),%%rdi \n"
|
||||
"popcnt %%rsi,%%rsi \n"
|
||||
"popcnt %%rdi,%%rdi \n"
|
||||
// Advance both pointers and accumulate into four separate sums.
"add $0x20,%0 \n"
|
||||
"add $0x20,%1 \n"
|
||||
"add %%rcx,%3 \n"
|
||||
"add %%rdx,%%r8 \n"
|
||||
"add %%rsi,%%r9 \n"
|
||||
"add %%rdi,%%r10 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
// Fold the partial sums into the output operand.
"add %%r8, %3 \n"
|
||||
"add %%r9, %3 \n"
|
||||
"add %%r10, %3 \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=r"(diff) // %3
|
||||
:
|
||||
: "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
|
||||
|
||||
return static_cast<uint32>(diff);
|
||||
}
|
||||
#else
|
||||
uint32 HammingDistance_SSE42(const uint8* src_a,
|
||||
const uint8* src_b,
|
||||
int count) {
|
||||
uint32 diff = 0u;
|
||||
|
||||
int i;
|
||||
for (i = 0; i < count - 7; i += 8) {
|
||||
uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b);
|
||||
src_a += 8;
|
||||
src_b += 8;
|
||||
diff += __builtin_popcountll(x);
|
||||
}
|
||||
asm volatile(
|
||||
// Process 16 bytes per loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"mov (%0),%%ecx \n"
|
||||
"mov 0x4(%0),%%edx \n"
|
||||
"xor (%1),%%ecx \n"
|
||||
"xor 0x4(%1),%%edx \n"
|
||||
"popcnt %%ecx,%%ecx \n"
|
||||
"add %%ecx,%3 \n"
|
||||
"popcnt %%edx,%%edx \n"
|
||||
"add %%edx,%3 \n"
|
||||
"mov 0x8(%0),%%ecx \n"
|
||||
"mov 0xc(%0),%%edx \n"
|
||||
"xor 0x8(%1),%%ecx \n"
|
||||
"xor 0xc(%1),%%edx \n"
|
||||
"popcnt %%ecx,%%ecx \n"
|
||||
"add %%ecx,%3 \n"
|
||||
"popcnt %%edx,%%edx \n"
|
||||
"add %%edx,%3 \n"
|
||||
"add $0x10,%0 \n"
|
||||
"add $0x10,%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"+r"(diff) // %3
|
||||
:
|
||||
: "memory", "cc", "ecx", "edx");
|
||||
|
||||
return diff;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Sixteen byte lanes of 0x0f: used to isolate the low 4 bits of every byte.
static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15};
|
||||
// Lookup table: entry i is the number of set bits in the 4-bit value i.
static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
|
||||
|
||||
// Counts the bits that differ between src_a and src_b using a pshufb nibble
// lookup. Each loop pass XORs 32 bytes of the two buffers, splits every byte
// into its low and high nibble, looks both up in kBitCount, and sums the
// per-byte counts with psadbw into the xmm0 accumulator.
// The loop is bottom-tested (sub 32 from count, repeat while > 0), so one
// 32-byte pass always executes.
// NOTE(review): movdqa loads and the pxor memory operands are the
// alignment-requiring SSE forms, so both buffers presumably must be 16-byte
// aligned - confirm that callers guarantee this.
uint32 HammingDistance_SSSE3(const uint8* src_a,
|
||||
const uint8* src_b,
|
||||
int count) {
|
||||
uint32 diff = 0u;
|
||||
|
||||
asm volatile(
|
||||
// xmm2 = nibble mask, xmm3 = bit-count table, xmm0 = running sum,
// xmm1 = zero (second operand for psadbw).
"movdqa %4,%%xmm2 \n"
|
||||
"movdqa %5,%%xmm3 \n"
|
||||
"pxor %%xmm0,%%xmm0 \n"
|
||||
"pxor %%xmm1,%%xmm1 \n"
|
||||
// %1 becomes (src_b - src_a) so (%0,%1) addresses src_b and only %0 needs
// to be advanced inside the loop.
"sub %0,%1 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
// First 16 bytes: xmm4 = a ^ b.
"movdqa (%0),%%xmm4 \n"
|
||||
"movdqa 0x10(%0), %%xmm5 \n"
|
||||
"pxor (%0,%1), %%xmm4 \n"
|
||||
// Low nibbles -> xmm7 counts, high nibbles -> xmm6 counts, then add.
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"pand %%xmm2,%%xmm6 \n"
|
||||
"psrlw $0x4,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"pshufb %%xmm6,%%xmm7 \n"
|
||||
"pand %%xmm2,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm6 \n"
|
||||
"pshufb %%xmm4,%%xmm6 \n"
|
||||
"paddb %%xmm7,%%xmm6 \n"
|
||||
// Second 16 bytes: xmm5 = a ^ b, same nibble lookup into xmm5.
"pxor 0x10(%0,%1),%%xmm5 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"movdqa %%xmm5,%%xmm4 \n"
|
||||
"pand %%xmm2,%%xmm5 \n"
|
||||
"psrlw $0x4,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"pshufb %%xmm5,%%xmm7 \n"
|
||||
"pand %%xmm2,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm5 \n"
|
||||
"pshufb %%xmm4,%%xmm5 \n"
|
||||
"paddb %%xmm7,%%xmm5 \n"
|
||||
// Combine both halves, sum the byte counts and accumulate.
"paddb %%xmm5,%%xmm6 \n"
|
||||
"psadbw %%xmm1,%%xmm6 \n"
|
||||
"paddd %%xmm6,%%xmm0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
// Add the upper 64-bit lane's sum (dword 2) to the lower one and return it.
"pshufd $0xaa,%%xmm0,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"movd %%xmm0, %3 \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=r"(diff) // %3
|
||||
: "m"(kNibbleMask), // %4
|
||||
"m"(kBitCount) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
|
||||
return diff;
|
||||
}
|
||||
|
||||
#ifdef HAS_HAMMINGDISTANCE_AVX2
|
||||
// AVX2 variant of the nibble-lookup Hamming distance. Each loop pass XORs
// 64 bytes of src_a and src_b, looks up the bit count of every low and high
// nibble in kBitCount via vpshufb, and sums the per-byte counts with vpsadbw
// into the ymm0 accumulator.
// The loop is bottom-tested (sub 64 from count, repeat while > 0), so one
// 64-byte pass always executes.
// NOTE(review): vmovdqa is the aligned load form, so src_a presumably must be
// 32-byte aligned - confirm that callers guarantee this.
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff = 0u;
|
||||
|
||||
asm volatile(
|
||||
// Broadcast the 16-byte nibble mask and bit-count table to both 128-bit
// lanes; ymm0 = running sum, ymm1 = zero (second operand for vpsadbw).
"vbroadcastf128 %4,%%ymm2 \n"
|
||||
"vbroadcastf128 %5,%%ymm3 \n"
|
||||
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
|
||||
// %1 becomes (src_b - src_a) so (%0,%1) addresses src_b.
"sub %0,%1 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
// First 32 bytes: ymm4 = a ^ b, then low/high nibble lookups into ymm6.
"vmovdqa (%0),%%ymm4 \n"
|
||||
"vmovdqa 0x20(%0), %%ymm5 \n"
|
||||
"vpxor (%0,%1), %%ymm4, %%ymm4 \n"
|
||||
"vpand %%ymm2,%%ymm4,%%ymm6 \n"
|
||||
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
|
||||
"vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
|
||||
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
|
||||
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
|
||||
"vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
|
||||
// Second 32 bytes: ymm4 = a ^ b, nibble lookups summed into ymm4.
"vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
|
||||
"add $0x40,%0 \n"
|
||||
"vpand %%ymm2,%%ymm4,%%ymm5 \n"
|
||||
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
|
||||
"vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
|
||||
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
|
||||
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
|
||||
"vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
|
||||
// Combine both halves, sum the byte counts and accumulate.
"vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
|
||||
"vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
|
||||
"vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
|
||||
"sub $0x40,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
// Horizontal add of the four 64-bit partial sums into the low lane.
"vpermq $0xb1,%%ymm0,%%ymm1 \n"
|
||||
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vpermq $0xaa,%%ymm0,%%ymm1 \n"
|
||||
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vmovd %%xmm0, %3 \n"
|
||||
// Clear the upper ymm halves before returning to non-AVX code.
"vzeroupper \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=r"(diff) // %3
|
||||
: "m"(kNibbleMask), // %4
|
||||
"m"(kBitCount) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
|
||||
return diff;
|
||||
}
|
||||
#endif // HAS_HAMMINGDISTANCE_AVX2
|
||||
|
||||
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 sse;
|
||||
|
|
|
@ -26,67 +26,61 @@ extern "C" {
|
|||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff;
|
||||
|
||||
asm volatile (
|
||||
"vmov.u16 q4, #0 \n" // accumulator
|
||||
asm volatile(
|
||||
"vmov.u16 q4, #0 \n" // accumulator
|
||||
|
||||
"1: \n"
|
||||
"vld1.8 {q0, q1}, [%0]! \n"
|
||||
"vld1.8 {q2, q3}, [%1]! \n"
|
||||
"veor.32 q0, q0, q2 \n"
|
||||
"veor.32 q1, q1, q3 \n"
|
||||
"vcnt.i8 q0, q0 \n"
|
||||
"vcnt.i8 q1, q1 \n"
|
||||
"subs %2, %2, #32 \n"
|
||||
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
|
||||
"vpadal.u8 q4, q0 \n" // 8 shorts
|
||||
"bgt 1b \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0, q1}, [%0]! \n"
|
||||
"vld1.8 {q2, q3}, [%1]! \n"
|
||||
"veor.32 q0, q0, q2 \n"
|
||||
"veor.32 q1, q1, q3 \n"
|
||||
"vcnt.i8 q0, q0 \n"
|
||||
"vcnt.i8 q1, q1 \n"
|
||||
"subs %2, %2, #32 \n"
|
||||
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
|
||||
"vpadal.u8 q4, q0 \n" // 8 shorts
|
||||
"bgt 1b \n"
|
||||
|
||||
"vpaddl.u16 q0, q4 \n" // 4 ints
|
||||
"vpadd.u32 d0, d0, d1 \n"
|
||||
"vpadd.u32 d0, d0, d0 \n"
|
||||
"vmov.32 %3, d0[0] \n"
|
||||
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(diff)
|
||||
:
|
||||
: "cc", "q0", "q1", "q2", "q3", "q4");
|
||||
"vpaddl.u16 q0, q4 \n" // 4 ints
|
||||
"vpadd.u32 d0, d0, d1 \n"
|
||||
"vpadd.u32 d0, d0, d0 \n"
|
||||
"vmov.32 %3, d0[0] \n"
|
||||
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
|
||||
:
|
||||
: "cc", "q0", "q1", "q2", "q3", "q4");
|
||||
return diff;
|
||||
}
|
||||
|
||||
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 sse;
|
||||
asm volatile (
|
||||
"vmov.u8 q8, #0 \n"
|
||||
"vmov.u8 q10, #0 \n"
|
||||
"vmov.u8 q9, #0 \n"
|
||||
"vmov.u8 q11, #0 \n"
|
||||
asm volatile(
|
||||
"vmov.u8 q8, #0 \n"
|
||||
"vmov.u8 q10, #0 \n"
|
||||
"vmov.u8 q9, #0 \n"
|
||||
"vmov.u8 q11, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n"
|
||||
"vld1.8 {q1}, [%1]! \n"
|
||||
"subs %2, %2, #16 \n"
|
||||
"vsubl.u8 q2, d0, d2 \n"
|
||||
"vsubl.u8 q3, d1, d3 \n"
|
||||
"vmlal.s16 q8, d4, d4 \n"
|
||||
"vmlal.s16 q9, d6, d6 \n"
|
||||
"vmlal.s16 q10, d5, d5 \n"
|
||||
"vmlal.s16 q11, d7, d7 \n"
|
||||
"bgt 1b \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n"
|
||||
"vld1.8 {q1}, [%1]! \n"
|
||||
"subs %2, %2, #16 \n"
|
||||
"vsubl.u8 q2, d0, d2 \n"
|
||||
"vsubl.u8 q3, d1, d3 \n"
|
||||
"vmlal.s16 q8, d4, d4 \n"
|
||||
"vmlal.s16 q9, d6, d6 \n"
|
||||
"vmlal.s16 q10, d5, d5 \n"
|
||||
"vmlal.s16 q11, d7, d7 \n"
|
||||
"bgt 1b \n"
|
||||
|
||||
"vadd.u32 q8, q8, q9 \n"
|
||||
"vadd.u32 q10, q10, q11 \n"
|
||||
"vadd.u32 q11, q8, q10 \n"
|
||||
"vpaddl.u32 q1, q11 \n"
|
||||
"vadd.u64 d0, d2, d3 \n"
|
||||
"vmov.32 %3, d0[0] \n"
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(sse)
|
||||
:
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
||||
"vadd.u32 q8, q8, q9 \n"
|
||||
"vadd.u32 q10, q10, q11 \n"
|
||||
"vadd.u32 q11, q8, q10 \n"
|
||||
"vpaddl.u32 q1, q11 \n"
|
||||
"vadd.u64 d0, d2, d3 \n"
|
||||
"vmov.32 %3, d0[0] \n"
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
|
||||
:
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
||||
return sse;
|
||||
}
|
||||
|
||||
|
|
|
@ -24,63 +24,57 @@ extern "C" {
|
|||
// uses short accumulator which restricts count to 131 KB
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff;
|
||||
asm volatile (
|
||||
"movi v4.8h, #0 \n"
|
||||
asm volatile(
|
||||
"movi v4.8h, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
|
||||
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
|
||||
"eor v0.16b, v0.16b, v2.16b \n"
|
||||
"eor v1.16b, v1.16b, v3.16b \n"
|
||||
"cnt v0.16b, v0.16b \n"
|
||||
"cnt v1.16b, v1.16b \n"
|
||||
"subs %w2, %w2, #32 \n"
|
||||
"add v0.16b, v0.16b, v1.16b \n"
|
||||
"uadalp v4.8h, v0.16b \n"
|
||||
"b.gt 1b \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
|
||||
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
|
||||
"eor v0.16b, v0.16b, v2.16b \n"
|
||||
"eor v1.16b, v1.16b, v3.16b \n"
|
||||
"cnt v0.16b, v0.16b \n"
|
||||
"cnt v1.16b, v1.16b \n"
|
||||
"subs %w2, %w2, #32 \n"
|
||||
"add v0.16b, v0.16b, v1.16b \n"
|
||||
"uadalp v4.8h, v0.16b \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"uaddlv s4, v4.8h \n"
|
||||
"fmov %w3, s4 \n"
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(diff)
|
||||
:
|
||||
: "cc", "v0", "v1", "v2", "v3", "v4");
|
||||
"uaddlv s4, v4.8h \n"
|
||||
"fmov %w3, s4 \n"
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
|
||||
:
|
||||
: "cc", "v0", "v1", "v2", "v3", "v4");
|
||||
return diff;
|
||||
}
|
||||
|
||||
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 sse;
|
||||
asm volatile (
|
||||
"eor v16.16b, v16.16b, v16.16b \n"
|
||||
"eor v18.16b, v18.16b, v18.16b \n"
|
||||
"eor v17.16b, v17.16b, v17.16b \n"
|
||||
"eor v19.16b, v19.16b, v19.16b \n"
|
||||
asm volatile(
|
||||
"eor v16.16b, v16.16b, v16.16b \n"
|
||||
"eor v18.16b, v18.16b, v18.16b \n"
|
||||
"eor v17.16b, v17.16b, v17.16b \n"
|
||||
"eor v19.16b, v19.16b, v19.16b \n"
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n"
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
"subs %w2, %w2, #16 \n"
|
||||
"usubl v2.8h, v0.8b, v1.8b \n"
|
||||
"usubl2 v3.8h, v0.16b, v1.16b \n"
|
||||
"smlal v16.4s, v2.4h, v2.4h \n"
|
||||
"smlal v17.4s, v3.4h, v3.4h \n"
|
||||
"smlal2 v18.4s, v2.8h, v2.8h \n"
|
||||
"smlal2 v19.4s, v3.8h, v3.8h \n"
|
||||
"b.gt 1b \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n"
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
"subs %w2, %w2, #16 \n"
|
||||
"usubl v2.8h, v0.8b, v1.8b \n"
|
||||
"usubl2 v3.8h, v0.16b, v1.16b \n"
|
||||
"smlal v16.4s, v2.4h, v2.4h \n"
|
||||
"smlal v17.4s, v3.4h, v3.4h \n"
|
||||
"smlal2 v18.4s, v2.8h, v2.8h \n"
|
||||
"smlal2 v19.4s, v3.8h, v3.8h \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"add v16.4s, v16.4s, v17.4s \n"
|
||||
"add v18.4s, v18.4s, v19.4s \n"
|
||||
"add v19.4s, v16.4s, v18.4s \n"
|
||||
"addv s0, v19.4s \n"
|
||||
"fmov %w3, s0 \n"
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(sse)
|
||||
:
|
||||
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
|
||||
"add v16.4s, v16.4s, v17.4s \n"
|
||||
"add v18.4s, v18.4s, v19.4s \n"
|
||||
"add v19.4s, v16.4s, v18.4s \n"
|
||||
"addv s0, v19.4s \n"
|
||||
"fmov %w3, s0 \n"
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
|
||||
:
|
||||
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
|
||||
return sse;
|
||||
}
|
||||
|
||||
|
|
|
@ -9,34 +9,51 @@
|
|||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h> // For __popcnt
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for 32 bit Visual C x86 and clangcl
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||
#if (_MSC_VER >= 1900)
|
||||
__declspec(naked)
|
||||
#else
|
||||
__declspec(naked) __declspec(align(16))
|
||||
#endif
|
||||
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
|
||||
// Hamming distance for 32-bit Visual C: XORs the buffers one 32-bit word at a
// time and sums the __popcnt of each word. Only whole 4-byte words are
// counted; any trailing (count % 4) bytes are not examined here.
uint32 HammingDistance_SSE42(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint32 total = 0u;
  int remaining;

  // Consume one word per pass for as long as a full word is left.
  for (remaining = count; remaining >= 4; remaining -= 4) {
    const uint32 delta = *((uint32*)src_a) ^ *((uint32*)src_b);
    total += __popcnt(delta);
    src_a += 4;
    src_b += 4;
  }
  return total;
}
|
||||
|
||||
__declspec(naked) uint32
|
||||
SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
pxor xmm0, xmm0
|
||||
pxor xmm5, xmm5
|
||||
|
||||
align 4
|
||||
wloop:
|
||||
movdqa xmm1, [eax]
|
||||
movdqu xmm1, [eax]
|
||||
lea eax, [eax + 16]
|
||||
movdqa xmm2, [edx]
|
||||
movdqu xmm2, [edx]
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
movdqa xmm3, xmm1 // abs trick
|
||||
psubusb xmm1, xmm2
|
||||
psubusb xmm2, xmm3
|
||||
|
@ -48,6 +65,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
|||
pmaddwd xmm2, xmm2
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm0, xmm2
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
pshufd xmm1, xmm0, 0xee
|
||||
|
@ -62,27 +80,21 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
|||
// Visual C 2012 required for AVX2.
|
||||
#if _MSC_VER >= 1700
|
||||
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
|
||||
#pragma warning(disable: 4752)
|
||||
#if (_MSC_VER >= 1900)
|
||||
__declspec(naked)
|
||||
#else
|
||||
__declspec(naked) __declspec(align(16))
|
||||
#endif
|
||||
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
#pragma warning(disable : 4752)
|
||||
__declspec(naked) uint32
|
||||
SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
vpxor ymm0, ymm0, ymm0 // sum
|
||||
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
|
||||
sub edx, eax
|
||||
|
||||
align 4
|
||||
wloop:
|
||||
vmovdqu ymm1, [eax]
|
||||
vmovdqu ymm2, [eax + edx]
|
||||
lea eax, [eax + 32]
|
||||
sub ecx, 32
|
||||
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
|
||||
vpsubusb ymm2, ymm2, ymm1
|
||||
vpor ymm1, ymm2, ymm3
|
||||
|
@ -92,6 +104,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
|||
vpmaddwd ymm1, ymm1, ymm1
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpaddd ymm0, ymm0, ymm2
|
||||
sub ecx, 32
|
||||
jg wloop
|
||||
|
||||
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
|
||||
|
@ -107,81 +120,66 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
|||
}
|
||||
#endif // _MSC_VER >= 1700
|
||||
|
||||
#define HAS_HASHDJB2_SSE41
|
||||
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
|
||||
static uvec32 kHashMul0 = {
|
||||
0x0c3525e1, // 33 ^ 15
|
||||
0xa3476dc1, // 33 ^ 14
|
||||
0x3b4039a1, // 33 ^ 13
|
||||
0x4f5f0981, // 33 ^ 12
|
||||
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
|
||||
uvec32 kHashMul0 = {
|
||||
0x0c3525e1, // 33 ^ 15
|
||||
0xa3476dc1, // 33 ^ 14
|
||||
0x3b4039a1, // 33 ^ 13
|
||||
0x4f5f0981, // 33 ^ 12
|
||||
};
|
||||
static uvec32 kHashMul1 = {
|
||||
0x30f35d61, // 33 ^ 11
|
||||
0x855cb541, // 33 ^ 10
|
||||
0x040a9121, // 33 ^ 9
|
||||
0x747c7101, // 33 ^ 8
|
||||
uvec32 kHashMul1 = {
|
||||
0x30f35d61, // 33 ^ 11
|
||||
0x855cb541, // 33 ^ 10
|
||||
0x040a9121, // 33 ^ 9
|
||||
0x747c7101, // 33 ^ 8
|
||||
};
|
||||
static uvec32 kHashMul2 = {
|
||||
0xec41d4e1, // 33 ^ 7
|
||||
0x4cfa3cc1, // 33 ^ 6
|
||||
0x025528a1, // 33 ^ 5
|
||||
0x00121881, // 33 ^ 4
|
||||
uvec32 kHashMul2 = {
|
||||
0xec41d4e1, // 33 ^ 7
|
||||
0x4cfa3cc1, // 33 ^ 6
|
||||
0x025528a1, // 33 ^ 5
|
||||
0x00121881, // 33 ^ 4
|
||||
};
|
||||
static uvec32 kHashMul3 = {
|
||||
0x00008c61, // 33 ^ 3
|
||||
0x00000441, // 33 ^ 2
|
||||
0x00000021, // 33 ^ 1
|
||||
0x00000001, // 33 ^ 0
|
||||
uvec32 kHashMul3 = {
|
||||
0x00008c61, // 33 ^ 3
|
||||
0x00000441, // 33 ^ 2
|
||||
0x00000021, // 33 ^ 1
|
||||
0x00000001, // 33 ^ 0
|
||||
};
|
||||
|
||||
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
|
||||
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
|
||||
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
|
||||
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
|
||||
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
|
||||
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
|
||||
_asm _emit 0x40 _asm _emit reg
|
||||
|
||||
#if (_MSC_VER >= 1900)
|
||||
__declspec(naked)
|
||||
#else
|
||||
__declspec(naked) __declspec(align(16))
|
||||
#endif
|
||||
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||
__declspec(naked) uint32
|
||||
HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src
|
||||
mov ecx, [esp + 8] // count
|
||||
mov eax, [esp + 4] // src
|
||||
mov ecx, [esp + 8] // count
|
||||
movd xmm0, [esp + 12] // seed
|
||||
|
||||
pxor xmm7, xmm7 // constant 0 for unpck
|
||||
movdqa xmm6, kHash16x33
|
||||
pxor xmm7, xmm7 // constant 0 for unpck
|
||||
movdqa xmm6, xmmword ptr kHash16x33
|
||||
|
||||
align 4
|
||||
wloop:
|
||||
movdqu xmm1, [eax] // src[0-15]
|
||||
movdqu xmm1, [eax] // src[0-15]
|
||||
lea eax, [eax + 16]
|
||||
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
|
||||
movdqa xmm5, kHashMul0
|
||||
pmulld xmm0, xmm6 // hash *= 33 ^ 16
|
||||
movdqa xmm5, xmmword ptr kHashMul0
|
||||
movdqa xmm2, xmm1
|
||||
punpcklbw xmm2, xmm7 // src[0-7]
|
||||
punpcklbw xmm2, xmm7 // src[0-7]
|
||||
movdqa xmm3, xmm2
|
||||
punpcklwd xmm3, xmm7 // src[0-3]
|
||||
pmulld(0xdd) // pmulld xmm3, xmm5
|
||||
movdqa xmm5, kHashMul1
|
||||
punpcklwd xmm3, xmm7 // src[0-3]
|
||||
pmulld xmm3, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul1
|
||||
movdqa xmm4, xmm2
|
||||
punpckhwd xmm4, xmm7 // src[4-7]
|
||||
pmulld(0xe5) // pmulld xmm4, xmm5
|
||||
movdqa xmm5, kHashMul2
|
||||
punpckhbw xmm1, xmm7 // src[8-15]
|
||||
punpckhwd xmm4, xmm7 // src[4-7]
|
||||
pmulld xmm4, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul2
|
||||
punpckhbw xmm1, xmm7 // src[8-15]
|
||||
movdqa xmm2, xmm1
|
||||
punpcklwd xmm2, xmm7 // src[8-11]
|
||||
pmulld(0xd5) // pmulld xmm2, xmm5
|
||||
movdqa xmm5, kHashMul3
|
||||
punpckhwd xmm1, xmm7 // src[12-15]
|
||||
pmulld(0xcd) // pmulld xmm1, xmm5
|
||||
paddd xmm3, xmm4 // add 16 results
|
||||
punpcklwd xmm2, xmm7 // src[8-11]
|
||||
pmulld xmm2, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul3
|
||||
punpckhwd xmm1, xmm7 // src[12-15]
|
||||
pmulld xmm1, xmm5
|
||||
paddd xmm3, xmm4 // add 16 results
|
||||
paddd xmm1, xmm2
|
||||
sub ecx, 16
|
||||
paddd xmm1, xmm3
|
||||
|
||||
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
|
@ -189,59 +187,55 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
|||
pshufd xmm2, xmm1, 0x01
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm0, xmm1
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
movd eax, xmm0 // return hash
|
||||
movd eax, xmm0 // return hash
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// Visual C 2012 required for AVX2.
|
||||
#if _MSC_VER >= 1700
|
||||
#if (_MSC_VER >= 1900)
|
||||
__declspec(naked)
|
||||
#else
|
||||
__declspec(naked) __declspec(align(16))
|
||||
#endif
|
||||
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
|
||||
__declspec(naked) uint32
|
||||
HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src
|
||||
mov ecx, [esp + 8] // count
|
||||
movd xmm0, [esp + 12] // seed
|
||||
movdqa xmm6, kHash16x33
|
||||
mov eax, [esp + 4] // src
|
||||
mov ecx, [esp + 8] // count
|
||||
vmovd xmm0, [esp + 12] // seed
|
||||
|
||||
align 4
|
||||
wloop:
|
||||
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
|
||||
pmulld xmm0, xmm6 // hash *= 33 ^ 16
|
||||
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
|
||||
pmulld xmm3, kHashMul0
|
||||
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
|
||||
pmulld xmm4, kHashMul1
|
||||
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
|
||||
pmulld xmm2, kHashMul2
|
||||
vpmovzxbd xmm3, [eax] // src[0-3]
|
||||
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
|
||||
vpmovzxbd xmm4, [eax + 4] // src[4-7]
|
||||
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
|
||||
vpmovzxbd xmm2, [eax + 8] // src[8-11]
|
||||
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
|
||||
vpmovzxbd xmm1, [eax + 12] // src[12-15]
|
||||
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
|
||||
lea eax, [eax + 16]
|
||||
pmulld xmm1, kHashMul3
|
||||
paddd xmm3, xmm4 // add 16 results
|
||||
paddd xmm1, xmm2
|
||||
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
|
||||
vpaddd xmm3, xmm3, xmm4 // add 16 results
|
||||
vpaddd xmm1, xmm1, xmm2
|
||||
vpaddd xmm1, xmm1, xmm3
|
||||
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
vpaddd xmm1, xmm1,xmm2
|
||||
vpshufd xmm2, xmm1, 0x01
|
||||
vpaddd xmm1, xmm1, xmm2
|
||||
vpaddd xmm0, xmm0, xmm1
|
||||
sub ecx, 16
|
||||
paddd xmm1, xmm3
|
||||
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
paddd xmm1, xmm2
|
||||
pshufd xmm2, xmm1, 0x01
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm0, xmm1
|
||||
jg wloop
|
||||
|
||||
movd eax, xmm0 // return hash
|
||||
vmovd eax, xmm0 // return hash
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // _MSC_VER >= 1700
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -657,6 +657,42 @@ int I420ToRAW(const uint8* src_y,
|
|||
width, height);
|
||||
}
|
||||
|
||||
// Convert H420 to RGB24.
// Thin wrapper: forwards every argument unchanged to I420ToRGB24Matrix with
// the kYuvH709Constants conversion matrix and returns its result.
|
||||
LIBYUV_API
|
||||
int H420ToRGB24(const uint8* src_y,
|
||||
int src_stride_y,
|
||||
const uint8* src_u,
|
||||
int src_stride_u,
|
||||
const uint8* src_v,
|
||||
int src_stride_v,
|
||||
uint8* dst_rgb24,
|
||||
int dst_stride_rgb24,
|
||||
int width,
|
||||
int height) {
|
||||
return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
|
||||
src_stride_v, dst_rgb24, dst_stride_rgb24,
|
||||
&kYuvH709Constants, width, height);
|
||||
}
|
||||
|
||||
// Convert H420 to RAW.
// Reuses I420ToRGB24Matrix: the U and V planes are passed in each other's
// argument slots and the kYvuH709Constants matrix is used instead of the Yuv
// one. Returns whatever I420ToRGB24Matrix returns.
|
||||
LIBYUV_API
|
||||
int H420ToRAW(const uint8* src_y,
|
||||
int src_stride_y,
|
||||
const uint8* src_u,
|
||||
int src_stride_u,
|
||||
const uint8* src_v,
|
||||
int src_stride_v,
|
||||
uint8* dst_raw,
|
||||
int dst_stride_raw,
|
||||
int width,
|
||||
int height) {
|
||||
return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
|
||||
src_stride_v, // Swap U and V
|
||||
src_u, src_stride_u, dst_raw, dst_stride_raw,
|
||||
&kYvuH709Constants, // Use Yvu matrix
|
||||
width, height);
|
||||
}
|
||||
|
||||
// Convert I420 to ARGB1555.
|
||||
LIBYUV_API
|
||||
int I420ToARGB1555(const uint8* src_y,
|
||||
|
@ -1075,8 +1111,8 @@ int I420ToRGB565Dither(const uint8* src_y,
|
|||
for (y = 0; y < height; ++y) {
|
||||
I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
|
||||
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
|
||||
*(uint32*)(dither4x4 + ((y & 3) << 2)),
|
||||
width); // NOLINT
|
||||
*(uint32*)(dither4x4 + ((y & 3) << 2)), // NOLINT
|
||||
width); // NOLINT
|
||||
dst_rgb565 += dst_stride_rgb565;
|
||||
src_y += src_stride_y;
|
||||
if (y & 1) {
|
||||
|
|
|
@ -124,7 +124,7 @@ void CpuId(int eax, int ecx, int* cpu_info) {
|
|||
int GetXCR0() {
|
||||
int xcr0 = 0;
|
||||
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
|
||||
xcr0 = _xgetbv(0); // VS2010 SP1 required.
|
||||
xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT
|
||||
#elif defined(__i386__) || defined(__x86_64__)
|
||||
asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
|
||||
#endif // defined(__i386__) || defined(__x86_64__)
|
||||
|
@ -242,10 +242,17 @@ static SAFEBUFFERS int GetCpuFlags(void) {
|
|||
|
||||
// Detect AVX512bw
|
||||
if ((GetXCR0() & 0xe0) == 0xe0) {
|
||||
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
|
||||
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
|
||||
cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
|
||||
cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
|
||||
cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
|
||||
cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
|
||||
cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
|
||||
cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Consider moving these to gtest
|
||||
// Environment variable overrides for testing.
|
||||
if (TestEnv("LIBYUV_DISABLE_X86")) {
|
||||
cpu_info &= ~kCpuHasX86;
|
||||
|
@ -274,12 +281,12 @@ static SAFEBUFFERS int GetCpuFlags(void) {
|
|||
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
|
||||
cpu_info &= ~kCpuHasFMA3;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
|
||||
cpu_info &= ~kCpuHasAVX3;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_F16C")) {
|
||||
cpu_info &= ~kCpuHasF16C;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_AVX512BW")) {
|
||||
cpu_info &= ~kCpuHasAVX512BW;
|
||||
}
|
||||
|
||||
#endif
|
||||
#if defined(__mips__) && defined(__linux__)
|
||||
|
|
|
@ -13,10 +13,6 @@
|
|||
#ifdef HAVE_JPEG
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <new>
|
||||
#endif
|
||||
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
|
||||
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
|
||||
// Must be included before jpeglib.
|
||||
|
|
|
@ -24,7 +24,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
|
|||
const uint8* it = sample;
|
||||
while (it < end) {
|
||||
// TODO(fbarchard): scan for 0xd9 instead.
|
||||
it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
|
||||
it = (const uint8*)(memchr(it, 0xff, end - it));
|
||||
if (it == NULL) {
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -321,6 +321,14 @@ void SplitUVPlane(const uint8* src_uv,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
SplitUVRow = SplitUVRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
SplitUVRow = SplitUVRow_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
// Copy a row of UV.
|
||||
|
@ -399,6 +407,122 @@ void MergeUVPlane(const uint8* src_u,
|
|||
}
|
||||
}
|
||||
|
||||
// Support function for NV12 etc RGB channels.
|
||||
// Width and height are plane sizes (typically half pixel width).
|
||||
LIBYUV_API
|
||||
void SplitRGBPlane(const uint8* src_rgb,
|
||||
int src_stride_rgb,
|
||||
uint8* dst_r,
|
||||
int dst_stride_r,
|
||||
uint8* dst_g,
|
||||
int dst_stride_g,
|
||||
uint8* dst_b,
|
||||
int dst_stride_b,
|
||||
int width,
|
||||
int height) {
|
||||
int y;
|
||||
void (*SplitRGBRow)(const uint8* src_rgb, uint8* dst_r, uint8* dst_g,
|
||||
uint8* dst_b, int width) = SplitRGBRow_C;
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
dst_r = dst_r + (height - 1) * dst_stride_r;
|
||||
dst_g = dst_g + (height - 1) * dst_stride_g;
|
||||
dst_b = dst_b + (height - 1) * dst_stride_b;
|
||||
dst_stride_r = -dst_stride_r;
|
||||
dst_stride_g = -dst_stride_g;
|
||||
dst_stride_b = -dst_stride_b;
|
||||
}
|
||||
// Coalesce rows.
|
||||
if (src_stride_rgb == width * 3 && dst_stride_r == width &&
|
||||
dst_stride_g == width && dst_stride_b == width) {
|
||||
width *= height;
|
||||
height = 1;
|
||||
src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
|
||||
}
|
||||
#if defined(HAS_SPLITRGBROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
SplitRGBRow = SplitRGBRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SplitRGBRow = SplitRGBRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITRGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SplitRGBRow = SplitRGBRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SplitRGBRow = SplitRGBRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
// Copy a row of RGB.
|
||||
SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
|
||||
dst_r += dst_stride_r;
|
||||
dst_g += dst_stride_g;
|
||||
dst_b += dst_stride_b;
|
||||
src_rgb += src_stride_rgb;
|
||||
}
|
||||
}
|
||||
|
||||
// Merge separate R, G and B planes into one packed RGB plane.
// The row-coalescing check treats each source row as width bytes and a
// destination row as width * 3 bytes. A negative height inverts the image:
// the destination is filled starting from its last row.
LIBYUV_API
void MergeRGBPlane(const uint8* src_r,
                   int src_stride_r,
                   const uint8* src_g,
                   int src_stride_g,
                   const uint8* src_b,
                   int src_stride_b,
                   uint8* dst_rgb,
                   int dst_stride_rgb,
                   int width,
                   int height) {
  void (*MergeRGBRow)(const uint8* src_r, const uint8* src_g,
                      const uint8* src_b, uint8* dst_rgb, int width) =
      MergeRGBRow_C;
  int row;

  if (height < 0) {
    // Inverted output: point at the last destination row and walk backwards.
    height = -height;
    dst_rgb += (height - 1) * dst_stride_rgb;
    dst_stride_rgb = -dst_stride_rgb;
  }

  if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
      dst_stride_rgb == width * 3) {
    // Every plane is contiguous, so handle the whole image as one long row.
    width *= height;
    height = 1;
    src_stride_r = 0;
    src_stride_g = 0;
    src_stride_b = 0;
    dst_stride_rgb = 0;
  }

  // Pick the fastest row function the CPU supports; the exact-width version
  // is only used when the (possibly coalesced) width is a multiple of 16.
#if defined(HAS_MERGERGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    MergeRGBRow =
        IS_ALIGNED(width, 16) ? MergeRGBRow_SSSE3 : MergeRGBRow_Any_SSSE3;
  }
#endif
#if defined(HAS_MERGERGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeRGBRow =
        IS_ALIGNED(width, 16) ? MergeRGBRow_NEON : MergeRGBRow_Any_NEON;
  }
#endif

  for (row = 0; row < height; ++row) {
    // Interleave one row each of R, G and B into a packed RGB row.
    MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
    src_r += src_stride_r;
    src_g += src_stride_g;
    src_b += src_stride_b;
    dst_rgb += dst_stride_rgb;
  }
}
|
||||
|
||||
// Mirror a plane of data.
|
||||
void MirrorPlane(const uint8* src_y,
|
||||
int src_stride_y,
|
||||
|
@ -845,6 +969,11 @@ ARGBBlendRow GetARGBBlend() {
|
|||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBBlendRow = ARGBBlendRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBBLENDROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
ARGBBlendRow = ARGBBlendRow_MSA;
|
||||
}
|
||||
#endif
|
||||
return ARGBBlendRow;
|
||||
}
|
||||
|
@ -1574,6 +1703,11 @@ void SetPlane(uint8* dst_y,
|
|||
SetRow = SetRow_ERMS;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SETROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
|
||||
SetRow = SetRow_MSA;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Set plane
|
||||
for (y = 0; y < height; ++y) {
|
||||
|
@ -1973,6 +2107,11 @@ int ARGBColorMatrix(const uint8* src_argb,
|
|||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
|
||||
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
|
||||
ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
|
||||
}
|
||||
#endif
|
||||
for (y = 0; y < height; ++y) {
|
||||
ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
|
||||
|
@ -2133,6 +2272,11 @@ int ARGBQuantize(uint8* dst_argb,
|
|||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
|
||||
ARGBQuantizeRow = ARGBQuantizeRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBQUANTIZEROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
|
||||
ARGBQuantizeRow = ARGBQuantizeRow_MSA;
|
||||
}
|
||||
#endif
|
||||
for (y = 0; y < height; ++y) {
|
||||
ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
|
||||
|
@ -2619,6 +2763,11 @@ static int ARGBSobelize(const uint8* src_argb,
|
|||
SobelYRow = SobelYRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SOBELYROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
SobelYRow = SobelYRow_MSA;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SOBELXROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
SobelXRow = SobelXRow_SSE2;
|
||||
|
@ -2628,6 +2777,11 @@ static int ARGBSobelize(const uint8* src_argb,
|
|||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SobelXRow = SobelXRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SOBELXROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
SobelXRow = SobelXRow_MSA;
|
||||
}
|
||||
#endif
|
||||
{
|
||||
// 3 rows with edges before/after.
|
||||
|
@ -2903,6 +3057,14 @@ int HalfFloatPlane(const uint16* src_y,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HALFFLOATROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
HalfFloatRow = HalfFloatRow_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
HalfFloatRow(src_y, dst_y, scale, width);
|
||||
|
@ -3048,6 +3210,12 @@ int ARGBExtractAlpha(const uint8* src_argb,
|
|||
: ARGBExtractAlphaRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
|
||||
: ARGBExtractAlphaRow_Any_MSA;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int y = 0; y < height; ++y) {
|
||||
ARGBExtractAlphaRow(src_argb, dst_a, width);
|
||||
|
@ -3160,6 +3328,14 @@ int YUY2ToNV12(const uint8* src_yuy2,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
SplitUVRow = SplitUVRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
SplitUVRow = SplitUVRow_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||
|
@ -3268,6 +3444,14 @@ int UYVYToNV12(const uint8* src_uyvy,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
SplitUVRow = SplitUVRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
SplitUVRow = SplitUVRow_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||
|
|
|
@ -361,6 +361,11 @@ void RotateUV180(const uint8* src,
|
|||
MirrorUVRow = MirrorUVRow_DSPR2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
|
||||
MirrorUVRow = MirrorUVRow_MSA;
|
||||
}
|
||||
#endif
|
||||
|
||||
dst_a += dst_stride_a * (height - 1);
|
||||
dst_b += dst_stride_b * (height - 1);
|
||||
|
|
|
@ -30,14 +30,14 @@ void TransposeWx8_NEON(const uint8* src,
|
|||
int dst_stride,
|
||||
int width) {
|
||||
const uint8* src_temp;
|
||||
asm volatile (
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %w3, %w3, #8 \n"
|
||||
asm volatile(
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %w3, %w3, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
"ld1 {v0.8b}, [%0], %5 \n"
|
||||
|
@ -92,109 +92,108 @@ void TransposeWx8_NEON(const uint8* src,
|
|||
"subs %w3, %w3, #8 \n" // w -= 8
|
||||
"b.ge 1b \n"
|
||||
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %w3, %w3, #8 \n"
|
||||
"b.eq 4f \n"
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %w3, %w3, #8 \n"
|
||||
"b.eq 4f \n"
|
||||
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %w3, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %w3, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
|
||||
"cmp %w3, #4 \n"
|
||||
"b.lt 2f \n"
|
||||
"cmp %w3, #4 \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
// 4x8 block
|
||||
"mov %0, %1 \n"
|
||||
"ld1 {v0.s}[0], [%0], %5 \n"
|
||||
"ld1 {v0.s}[1], [%0], %5 \n"
|
||||
"ld1 {v0.s}[2], [%0], %5 \n"
|
||||
"ld1 {v0.s}[3], [%0], %5 \n"
|
||||
"ld1 {v1.s}[0], [%0], %5 \n"
|
||||
"ld1 {v1.s}[1], [%0], %5 \n"
|
||||
"ld1 {v1.s}[2], [%0], %5 \n"
|
||||
"ld1 {v1.s}[3], [%0] \n"
|
||||
// 4x8 block
|
||||
"mov %0, %1 \n"
|
||||
"ld1 {v0.s}[0], [%0], %5 \n"
|
||||
"ld1 {v0.s}[1], [%0], %5 \n"
|
||||
"ld1 {v0.s}[2], [%0], %5 \n"
|
||||
"ld1 {v0.s}[3], [%0], %5 \n"
|
||||
"ld1 {v1.s}[0], [%0], %5 \n"
|
||||
"ld1 {v1.s}[1], [%0], %5 \n"
|
||||
"ld1 {v1.s}[2], [%0], %5 \n"
|
||||
"ld1 {v1.s}[3], [%0] \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
"mov %0, %2 \n"
|
||||
|
||||
"ld1 {v2.16b}, [%4] \n"
|
||||
"ld1 {v2.16b}, [%4] \n"
|
||||
|
||||
"tbl v3.16b, {v0.16b}, v2.16b \n"
|
||||
"tbl v0.16b, {v1.16b}, v2.16b \n"
|
||||
"tbl v3.16b, {v0.16b}, v2.16b \n"
|
||||
"tbl v0.16b, {v1.16b}, v2.16b \n"
|
||||
|
||||
// TODO(frkoenig): Rework shuffle above to
|
||||
// write out with 4 instead of 8 writes.
|
||||
"st1 {v3.s}[0], [%0], %6 \n"
|
||||
"st1 {v3.s}[1], [%0], %6 \n"
|
||||
"st1 {v3.s}[2], [%0], %6 \n"
|
||||
"st1 {v3.s}[3], [%0] \n"
|
||||
// TODO(frkoenig): Rework shuffle above to
|
||||
// write out with 4 instead of 8 writes.
|
||||
"st1 {v3.s}[0], [%0], %6 \n"
|
||||
"st1 {v3.s}[1], [%0], %6 \n"
|
||||
"st1 {v3.s}[2], [%0], %6 \n"
|
||||
"st1 {v3.s}[3], [%0] \n"
|
||||
|
||||
"add %0, %2, #4 \n"
|
||||
"st1 {v0.s}[0], [%0], %6 \n"
|
||||
"st1 {v0.s}[1], [%0], %6 \n"
|
||||
"st1 {v0.s}[2], [%0], %6 \n"
|
||||
"st1 {v0.s}[3], [%0] \n"
|
||||
"add %0, %2, #4 \n"
|
||||
"st1 {v0.s}[0], [%0], %6 \n"
|
||||
"st1 {v0.s}[1], [%0], %6 \n"
|
||||
"st1 {v0.s}[2], [%0], %6 \n"
|
||||
"st1 {v0.s}[3], [%0] \n"
|
||||
|
||||
"add %1, %1, #4 \n" // src += 4
|
||||
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
|
||||
"subs %w3, %w3, #4 \n" // w -= 4
|
||||
"b.eq 4f \n"
|
||||
"add %1, %1, #4 \n" // src += 4
|
||||
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
|
||||
"subs %w3, %w3, #4 \n" // w -= 4
|
||||
"b.eq 4f \n"
|
||||
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %w3, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %w3, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
|
||||
// 2x8 block
|
||||
"2: \n"
|
||||
"mov %0, %1 \n"
|
||||
"ld1 {v0.h}[0], [%0], %5 \n"
|
||||
"ld1 {v1.h}[0], [%0], %5 \n"
|
||||
"ld1 {v0.h}[1], [%0], %5 \n"
|
||||
"ld1 {v1.h}[1], [%0], %5 \n"
|
||||
"ld1 {v0.h}[2], [%0], %5 \n"
|
||||
"ld1 {v1.h}[2], [%0], %5 \n"
|
||||
"ld1 {v0.h}[3], [%0], %5 \n"
|
||||
"ld1 {v1.h}[3], [%0] \n"
|
||||
// 2x8 block
|
||||
"2: \n"
|
||||
"mov %0, %1 \n"
|
||||
"ld1 {v0.h}[0], [%0], %5 \n"
|
||||
"ld1 {v1.h}[0], [%0], %5 \n"
|
||||
"ld1 {v0.h}[1], [%0], %5 \n"
|
||||
"ld1 {v1.h}[1], [%0], %5 \n"
|
||||
"ld1 {v0.h}[2], [%0], %5 \n"
|
||||
"ld1 {v1.h}[2], [%0], %5 \n"
|
||||
"ld1 {v0.h}[3], [%0], %5 \n"
|
||||
"ld1 {v1.h}[3], [%0] \n"
|
||||
|
||||
"trn2 v2.8b, v0.8b, v1.8b \n"
|
||||
"trn1 v3.8b, v0.8b, v1.8b \n"
|
||||
"trn2 v2.8b, v0.8b, v1.8b \n"
|
||||
"trn1 v3.8b, v0.8b, v1.8b \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
"mov %0, %2 \n"
|
||||
|
||||
"st1 {v3.8b}, [%0], %6 \n"
|
||||
"st1 {v2.8b}, [%0] \n"
|
||||
"st1 {v3.8b}, [%0], %6 \n"
|
||||
"st1 {v2.8b}, [%0] \n"
|
||||
|
||||
"add %1, %1, #2 \n" // src += 2
|
||||
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
|
||||
"subs %w3, %w3, #2 \n" // w -= 2
|
||||
"b.eq 4f \n"
|
||||
"add %1, %1, #2 \n" // src += 2
|
||||
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
|
||||
"subs %w3, %w3, #2 \n" // w -= 2
|
||||
"b.eq 4f \n"
|
||||
|
||||
// 1x8 block
|
||||
"3: \n"
|
||||
"ld1 {v0.b}[0], [%1], %5 \n"
|
||||
"ld1 {v0.b}[1], [%1], %5 \n"
|
||||
"ld1 {v0.b}[2], [%1], %5 \n"
|
||||
"ld1 {v0.b}[3], [%1], %5 \n"
|
||||
"ld1 {v0.b}[4], [%1], %5 \n"
|
||||
"ld1 {v0.b}[5], [%1], %5 \n"
|
||||
"ld1 {v0.b}[6], [%1], %5 \n"
|
||||
"ld1 {v0.b}[7], [%1] \n"
|
||||
// 1x8 block
|
||||
"3: \n"
|
||||
"ld1 {v0.b}[0], [%1], %5 \n"
|
||||
"ld1 {v0.b}[1], [%1], %5 \n"
|
||||
"ld1 {v0.b}[2], [%1], %5 \n"
|
||||
"ld1 {v0.b}[3], [%1], %5 \n"
|
||||
"ld1 {v0.b}[4], [%1], %5 \n"
|
||||
"ld1 {v0.b}[5], [%1], %5 \n"
|
||||
"ld1 {v0.b}[6], [%1], %5 \n"
|
||||
"ld1 {v0.b}[7], [%1] \n"
|
||||
|
||||
"st1 {v0.8b}, [%2] \n"
|
||||
"st1 {v0.8b}, [%2] \n"
|
||||
|
||||
"4: \n"
|
||||
"4: \n"
|
||||
|
||||
: "=&r"(src_temp), // %0
|
||||
"+r"(src), // %1
|
||||
"+r"(dst), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"(&kVTbl4x4Transpose), // %4
|
||||
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
);
|
||||
: "=&r"(src_temp), // %0
|
||||
"+r"(src), // %1
|
||||
"+r"(dst), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"(&kVTbl4x4Transpose), // %4
|
||||
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17", "v18", "v19", "v20", "v21", "v22", "v23");
|
||||
}
|
||||
|
||||
static uint8 kVTbl4x4TransposeDi[32] = {
|
||||
|
@ -209,212 +208,215 @@ void TransposeUVWx8_NEON(const uint8* src,
|
|||
int dst_stride_b,
|
||||
int width) {
|
||||
const uint8* src_temp;
|
||||
asm volatile (
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %w4, %w4, #8 \n"
|
||||
asm volatile(
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %w4, %w4, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
"ld1 {v0.16b}, [%0], %5 \n"
|
||||
"ld1 {v1.16b}, [%0], %5 \n"
|
||||
"ld1 {v2.16b}, [%0], %5 \n"
|
||||
"ld1 {v3.16b}, [%0], %5 \n"
|
||||
"ld1 {v4.16b}, [%0], %5 \n"
|
||||
"ld1 {v5.16b}, [%0], %5 \n"
|
||||
"ld1 {v6.16b}, [%0], %5 \n"
|
||||
"ld1 {v7.16b}, [%0] \n"
|
||||
"ld1 {v0.16b}, [%0], %5 \n"
|
||||
"ld1 {v1.16b}, [%0], %5 \n"
|
||||
"ld1 {v2.16b}, [%0], %5 \n"
|
||||
"ld1 {v3.16b}, [%0], %5 \n"
|
||||
"ld1 {v4.16b}, [%0], %5 \n"
|
||||
"ld1 {v5.16b}, [%0], %5 \n"
|
||||
"ld1 {v6.16b}, [%0], %5 \n"
|
||||
"ld1 {v7.16b}, [%0] \n"
|
||||
|
||||
"trn1 v16.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v17.16b, v0.16b, v1.16b \n"
|
||||
"trn1 v18.16b, v2.16b, v3.16b \n"
|
||||
"trn2 v19.16b, v2.16b, v3.16b \n"
|
||||
"trn1 v20.16b, v4.16b, v5.16b \n"
|
||||
"trn2 v21.16b, v4.16b, v5.16b \n"
|
||||
"trn1 v22.16b, v6.16b, v7.16b \n"
|
||||
"trn2 v23.16b, v6.16b, v7.16b \n"
|
||||
"trn1 v16.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v17.16b, v0.16b, v1.16b \n"
|
||||
"trn1 v18.16b, v2.16b, v3.16b \n"
|
||||
"trn2 v19.16b, v2.16b, v3.16b \n"
|
||||
"trn1 v20.16b, v4.16b, v5.16b \n"
|
||||
"trn2 v21.16b, v4.16b, v5.16b \n"
|
||||
"trn1 v22.16b, v6.16b, v7.16b \n"
|
||||
"trn2 v23.16b, v6.16b, v7.16b \n"
|
||||
|
||||
"trn1 v0.8h, v16.8h, v18.8h \n"
|
||||
"trn2 v1.8h, v16.8h, v18.8h \n"
|
||||
"trn1 v2.8h, v20.8h, v22.8h \n"
|
||||
"trn2 v3.8h, v20.8h, v22.8h \n"
|
||||
"trn1 v4.8h, v17.8h, v19.8h \n"
|
||||
"trn2 v5.8h, v17.8h, v19.8h \n"
|
||||
"trn1 v6.8h, v21.8h, v23.8h \n"
|
||||
"trn2 v7.8h, v21.8h, v23.8h \n"
|
||||
"trn1 v0.8h, v16.8h, v18.8h \n"
|
||||
"trn2 v1.8h, v16.8h, v18.8h \n"
|
||||
"trn1 v2.8h, v20.8h, v22.8h \n"
|
||||
"trn2 v3.8h, v20.8h, v22.8h \n"
|
||||
"trn1 v4.8h, v17.8h, v19.8h \n"
|
||||
"trn2 v5.8h, v17.8h, v19.8h \n"
|
||||
"trn1 v6.8h, v21.8h, v23.8h \n"
|
||||
"trn2 v7.8h, v21.8h, v23.8h \n"
|
||||
|
||||
"trn1 v16.4s, v0.4s, v2.4s \n"
|
||||
"trn2 v17.4s, v0.4s, v2.4s \n"
|
||||
"trn1 v18.4s, v1.4s, v3.4s \n"
|
||||
"trn2 v19.4s, v1.4s, v3.4s \n"
|
||||
"trn1 v20.4s, v4.4s, v6.4s \n"
|
||||
"trn2 v21.4s, v4.4s, v6.4s \n"
|
||||
"trn1 v22.4s, v5.4s, v7.4s \n"
|
||||
"trn2 v23.4s, v5.4s, v7.4s \n"
|
||||
"trn1 v16.4s, v0.4s, v2.4s \n"
|
||||
"trn2 v17.4s, v0.4s, v2.4s \n"
|
||||
"trn1 v18.4s, v1.4s, v3.4s \n"
|
||||
"trn2 v19.4s, v1.4s, v3.4s \n"
|
||||
"trn1 v20.4s, v4.4s, v6.4s \n"
|
||||
"trn2 v21.4s, v4.4s, v6.4s \n"
|
||||
"trn1 v22.4s, v5.4s, v7.4s \n"
|
||||
"trn2 v23.4s, v5.4s, v7.4s \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
"mov %0, %2 \n"
|
||||
|
||||
"st1 {v16.d}[0], [%0], %6 \n"
|
||||
"st1 {v18.d}[0], [%0], %6 \n"
|
||||
"st1 {v17.d}[0], [%0], %6 \n"
|
||||
"st1 {v19.d}[0], [%0], %6 \n"
|
||||
"st1 {v16.d}[1], [%0], %6 \n"
|
||||
"st1 {v18.d}[1], [%0], %6 \n"
|
||||
"st1 {v17.d}[1], [%0], %6 \n"
|
||||
"st1 {v19.d}[1], [%0] \n"
|
||||
"st1 {v16.d}[0], [%0], %6 \n"
|
||||
"st1 {v18.d}[0], [%0], %6 \n"
|
||||
"st1 {v17.d}[0], [%0], %6 \n"
|
||||
"st1 {v19.d}[0], [%0], %6 \n"
|
||||
"st1 {v16.d}[1], [%0], %6 \n"
|
||||
"st1 {v18.d}[1], [%0], %6 \n"
|
||||
"st1 {v17.d}[1], [%0], %6 \n"
|
||||
"st1 {v19.d}[1], [%0] \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
"mov %0, %3 \n"
|
||||
|
||||
"st1 {v20.d}[0], [%0], %7 \n"
|
||||
"st1 {v22.d}[0], [%0], %7 \n"
|
||||
"st1 {v21.d}[0], [%0], %7 \n"
|
||||
"st1 {v23.d}[0], [%0], %7 \n"
|
||||
"st1 {v20.d}[1], [%0], %7 \n"
|
||||
"st1 {v22.d}[1], [%0], %7 \n"
|
||||
"st1 {v21.d}[1], [%0], %7 \n"
|
||||
"st1 {v23.d}[1], [%0] \n"
|
||||
"st1 {v20.d}[0], [%0], %7 \n"
|
||||
"st1 {v22.d}[0], [%0], %7 \n"
|
||||
"st1 {v21.d}[0], [%0], %7 \n"
|
||||
"st1 {v23.d}[0], [%0], %7 \n"
|
||||
"st1 {v20.d}[1], [%0], %7 \n"
|
||||
"st1 {v22.d}[1], [%0], %7 \n"
|
||||
"st1 {v21.d}[1], [%0], %7 \n"
|
||||
"st1 {v23.d}[1], [%0] \n"
|
||||
|
||||
"add %1, %1, #16 \n" // src += 8*2
|
||||
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
|
||||
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
|
||||
"subs %w4, %w4, #8 \n" // w -= 8
|
||||
"b.ge 1b \n"
|
||||
"add %1, %1, #16 \n" // src += 8*2
|
||||
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
|
||||
// dst_stride_a
|
||||
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
|
||||
// dst_stride_b
|
||||
"subs %w4, %w4, #8 \n" // w -= 8
|
||||
"b.ge 1b \n"
|
||||
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %w4, %w4, #8 \n"
|
||||
"b.eq 4f \n"
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %w4, %w4, #8 \n"
|
||||
"b.eq 4f \n"
|
||||
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %w4, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %w4, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
|
||||
"cmp %w4, #4 \n"
|
||||
"b.lt 2f \n"
|
||||
"cmp %w4, #4 \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
// TODO(frkoenig): Clean this up
|
||||
// 4x8 block
|
||||
"mov %0, %1 \n"
|
||||
"ld1 {v0.8b}, [%0], %5 \n"
|
||||
"ld1 {v1.8b}, [%0], %5 \n"
|
||||
"ld1 {v2.8b}, [%0], %5 \n"
|
||||
"ld1 {v3.8b}, [%0], %5 \n"
|
||||
"ld1 {v4.8b}, [%0], %5 \n"
|
||||
"ld1 {v5.8b}, [%0], %5 \n"
|
||||
"ld1 {v6.8b}, [%0], %5 \n"
|
||||
"ld1 {v7.8b}, [%0] \n"
|
||||
// TODO(frkoenig): Clean this up
|
||||
// 4x8 block
|
||||
"mov %0, %1 \n"
|
||||
"ld1 {v0.8b}, [%0], %5 \n"
|
||||
"ld1 {v1.8b}, [%0], %5 \n"
|
||||
"ld1 {v2.8b}, [%0], %5 \n"
|
||||
"ld1 {v3.8b}, [%0], %5 \n"
|
||||
"ld1 {v4.8b}, [%0], %5 \n"
|
||||
"ld1 {v5.8b}, [%0], %5 \n"
|
||||
"ld1 {v6.8b}, [%0], %5 \n"
|
||||
"ld1 {v7.8b}, [%0] \n"
|
||||
|
||||
"ld1 {v30.16b}, [%8], #16 \n"
|
||||
"ld1 {v31.16b}, [%8] \n"
|
||||
"ld1 {v30.16b}, [%8], #16 \n"
|
||||
"ld1 {v31.16b}, [%8] \n"
|
||||
|
||||
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
|
||||
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
|
||||
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
|
||||
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
|
||||
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
|
||||
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
|
||||
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
|
||||
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
"mov %0, %2 \n"
|
||||
|
||||
"st1 {v16.s}[0], [%0], %6 \n"
|
||||
"st1 {v16.s}[1], [%0], %6 \n"
|
||||
"st1 {v16.s}[2], [%0], %6 \n"
|
||||
"st1 {v16.s}[3], [%0], %6 \n"
|
||||
"st1 {v16.s}[0], [%0], %6 \n"
|
||||
"st1 {v16.s}[1], [%0], %6 \n"
|
||||
"st1 {v16.s}[2], [%0], %6 \n"
|
||||
"st1 {v16.s}[3], [%0], %6 \n"
|
||||
|
||||
"add %0, %2, #4 \n"
|
||||
"st1 {v18.s}[0], [%0], %6 \n"
|
||||
"st1 {v18.s}[1], [%0], %6 \n"
|
||||
"st1 {v18.s}[2], [%0], %6 \n"
|
||||
"st1 {v18.s}[3], [%0] \n"
|
||||
"add %0, %2, #4 \n"
|
||||
"st1 {v18.s}[0], [%0], %6 \n"
|
||||
"st1 {v18.s}[1], [%0], %6 \n"
|
||||
"st1 {v18.s}[2], [%0], %6 \n"
|
||||
"st1 {v18.s}[3], [%0] \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
"mov %0, %3 \n"
|
||||
|
||||
"st1 {v17.s}[0], [%0], %7 \n"
|
||||
"st1 {v17.s}[1], [%0], %7 \n"
|
||||
"st1 {v17.s}[2], [%0], %7 \n"
|
||||
"st1 {v17.s}[3], [%0], %7 \n"
|
||||
"st1 {v17.s}[0], [%0], %7 \n"
|
||||
"st1 {v17.s}[1], [%0], %7 \n"
|
||||
"st1 {v17.s}[2], [%0], %7 \n"
|
||||
"st1 {v17.s}[3], [%0], %7 \n"
|
||||
|
||||
"add %0, %3, #4 \n"
|
||||
"st1 {v19.s}[0], [%0], %7 \n"
|
||||
"st1 {v19.s}[1], [%0], %7 \n"
|
||||
"st1 {v19.s}[2], [%0], %7 \n"
|
||||
"st1 {v19.s}[3], [%0] \n"
|
||||
"add %0, %3, #4 \n"
|
||||
"st1 {v19.s}[0], [%0], %7 \n"
|
||||
"st1 {v19.s}[1], [%0], %7 \n"
|
||||
"st1 {v19.s}[2], [%0], %7 \n"
|
||||
"st1 {v19.s}[3], [%0] \n"
|
||||
|
||||
"add %1, %1, #8 \n" // src += 4 * 2
|
||||
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
|
||||
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
|
||||
"subs %w4, %w4, #4 \n" // w -= 4
|
||||
"b.eq 4f \n"
|
||||
"add %1, %1, #8 \n" // src += 4 * 2
|
||||
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 *
|
||||
// dst_stride_a
|
||||
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 *
|
||||
// dst_stride_b
|
||||
"subs %w4, %w4, #4 \n" // w -= 4
|
||||
"b.eq 4f \n"
|
||||
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %w4, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %w4, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
|
||||
// 2x8 block
|
||||
"2: \n"
|
||||
"mov %0, %1 \n"
|
||||
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
|
||||
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
|
||||
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
|
||||
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
|
||||
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
|
||||
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
|
||||
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
|
||||
"ld2 {v2.h, v3.h}[3], [%0] \n"
|
||||
// 2x8 block
|
||||
"2: \n"
|
||||
"mov %0, %1 \n"
|
||||
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
|
||||
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
|
||||
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
|
||||
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
|
||||
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
|
||||
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
|
||||
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
|
||||
"ld2 {v2.h, v3.h}[3], [%0] \n"
|
||||
|
||||
"trn1 v4.8b, v0.8b, v2.8b \n"
|
||||
"trn2 v5.8b, v0.8b, v2.8b \n"
|
||||
"trn1 v6.8b, v1.8b, v3.8b \n"
|
||||
"trn2 v7.8b, v1.8b, v3.8b \n"
|
||||
"trn1 v4.8b, v0.8b, v2.8b \n"
|
||||
"trn2 v5.8b, v0.8b, v2.8b \n"
|
||||
"trn1 v6.8b, v1.8b, v3.8b \n"
|
||||
"trn2 v7.8b, v1.8b, v3.8b \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
"mov %0, %2 \n"
|
||||
|
||||
"st1 {v4.d}[0], [%0], %6 \n"
|
||||
"st1 {v6.d}[0], [%0] \n"
|
||||
"st1 {v4.d}[0], [%0], %6 \n"
|
||||
"st1 {v6.d}[0], [%0] \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
"mov %0, %3 \n"
|
||||
|
||||
"st1 {v5.d}[0], [%0], %7 \n"
|
||||
"st1 {v7.d}[0], [%0] \n"
|
||||
"st1 {v5.d}[0], [%0], %7 \n"
|
||||
"st1 {v7.d}[0], [%0] \n"
|
||||
|
||||
"add %1, %1, #4 \n" // src += 2 * 2
|
||||
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
|
||||
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
|
||||
"subs %w4, %w4, #2 \n" // w -= 2
|
||||
"b.eq 4f \n"
|
||||
"add %1, %1, #4 \n" // src += 2 * 2
|
||||
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 *
|
||||
// dst_stride_a
|
||||
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 *
|
||||
// dst_stride_b
|
||||
"subs %w4, %w4, #2 \n" // w -= 2
|
||||
"b.eq 4f \n"
|
||||
|
||||
// 1x8 block
|
||||
"3: \n"
|
||||
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[7], [%1] \n"
|
||||
// 1x8 block
|
||||
"3: \n"
|
||||
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
|
||||
"ld2 {v0.b, v1.b}[7], [%1] \n"
|
||||
|
||||
"st1 {v0.d}[0], [%2] \n"
|
||||
"st1 {v1.d}[0], [%3] \n"
|
||||
"st1 {v0.d}[0], [%2] \n"
|
||||
"st1 {v1.d}[0], [%3] \n"
|
||||
|
||||
"4: \n"
|
||||
"4: \n"
|
||||
|
||||
: "=&r"(src_temp), // %0
|
||||
"+r"(src), // %1
|
||||
"+r"(dst_a), // %2
|
||||
"+r"(dst_b), // %3
|
||||
"+r"(width) // %4
|
||||
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
|
||||
"r"(&kVTbl4x4TransposeDi) // %8
|
||||
: "memory", "cc",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
"v30", "v31"
|
||||
);
|
||||
: "=&r"(src_temp), // %0
|
||||
"+r"(src), // %1
|
||||
"+r"(dst_a), // %2
|
||||
"+r"(dst_b), // %3
|
||||
"+r"(width) // %4
|
||||
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
|
||||
"r"(&kVTbl4x4TransposeDi) // %8
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
|
||||
}
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
// This module is for 32 bit Visual C x86 and clangcl
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||
|
||||
__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
|
||||
int src_stride,
|
||||
|
@ -172,7 +172,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
|
|||
movdqa xmm7, xmm5
|
||||
lea eax, [eax + 8 * edi + 16]
|
||||
neg edi
|
||||
// Second round of bit swap.
|
||||
// Second round of bit swap.
|
||||
movdqa xmm5, xmm0
|
||||
punpcklwd xmm0, xmm2
|
||||
punpckhwd xmm5, xmm2
|
||||
|
@ -192,8 +192,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
|
|||
punpckhwd xmm6, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
movdqa xmm6, xmm0
|
||||
punpckldq xmm0, xmm4
|
||||
punpckhdq xmm6, xmm4
|
||||
|
|
|
@ -84,6 +84,14 @@ ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
|
|||
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
|
||||
SS(r, DUVSHIFT) * BPP); \
|
||||
}
|
||||
|
||||
// Merge functions.
|
||||
#ifdef HAS_MERGERGBROW_SSSE3
|
||||
ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
|
||||
#endif
|
||||
#ifdef HAS_MERGERGBROW_NEON
|
||||
ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
|
||||
#endif
|
||||
#ifdef HAS_I422TOYUY2ROW_SSE2
|
||||
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
|
||||
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
|
||||
|
@ -621,6 +629,9 @@ ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32)
|
|||
#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
|
||||
ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
|
||||
ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
|
||||
#endif
|
||||
#undef ANY11
|
||||
|
||||
// Any 1 to 1 blended. Destination is read, modify, write.
|
||||
|
@ -746,6 +757,9 @@ ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
|
|||
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
|
||||
ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_MSA
|
||||
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31)
|
||||
#endif
|
||||
#undef ANY11P16
|
||||
|
||||
// Any 1 to 1 with yuvconstants
|
||||
|
@ -911,6 +925,9 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
|
|||
#ifdef HAS_SPLITUVROW_DSPR2
|
||||
ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
|
||||
#endif
|
||||
#ifdef HAS_SPLITUVROW_MSA
|
||||
ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUV444ROW_SSSE3
|
||||
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
|
||||
#endif
|
||||
|
@ -934,6 +951,31 @@ ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
|
|||
#endif
|
||||
#undef ANY12
|
||||
|
||||
// Any 1 to 3. Outputs RGB planes.
// The SIMD kernel is run directly on the largest multiple of (MASK + 1)
// pixels; the leftover pixels are staged through a scratch buffer so the
// kernel can still be called with a full (MASK + 1)-pixel block.
#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK)                                  \
  void NAMEANY(const uint8* src_ptr, uint8* dst_r, uint8* dst_g,             \
               uint8* dst_b, int width) {                                    \
    /* Scratch: 16 * 3 bytes of packed input, then three 16-byte outputs. */ \
    SIMD_ALIGNED(uint8 temp[16 * 6]);                                        \
    uint8* const tmp_r = temp + 16 * 3;                                      \
    uint8* const tmp_g = temp + 16 * 4;                                      \
    uint8* const tmp_b = temp + 16 * 5;                                      \
    const int tail = width & MASK;                                           \
    const int body = width & ~MASK;                                          \
    memset(temp, 0, 16 * 3); /* for msan */                                  \
    if (body > 0) {                                                          \
      ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, body);                          \
    }                                                                        \
    memcpy(temp, src_ptr + body * BPP, tail * BPP);                          \
    ANY_SIMD(temp, tmp_r, tmp_g, tmp_b, MASK + 1);                           \
    memcpy(dst_r + body, tmp_r, tail);                                       \
    memcpy(dst_g + body, tmp_g, tail);                                       \
    memcpy(dst_b + body, tmp_b, tail);                                       \
  }
|
||||
|
||||
#ifdef HAS_SPLITRGBROW_SSSE3
|
||||
ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
|
||||
#endif
|
||||
#ifdef HAS_SPLITRGBROW_NEON
|
||||
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
|
||||
#endif
|
||||
|
||||
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
|
||||
// 128 byte row allows for 32 avx ARGB pixels.
|
||||
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
|
||||
|
|
|
@ -1770,6 +1770,63 @@ void MergeUVRow_C(const uint8* src_u,
|
|||
}
|
||||
}
|
||||
|
||||
void SplitRGBRow_C(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
dst_r[x] = src_rgb[0];
|
||||
dst_g[x] = src_rgb[1];
|
||||
dst_b[x] = src_rgb[2];
|
||||
src_rgb += 3;
|
||||
}
|
||||
}
|
||||
|
||||
void MergeRGBRow_C(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
dst_rgb[0] = src_r[x];
|
||||
dst_rgb[1] = src_g[x];
|
||||
dst_rgb[2] = src_b[x];
|
||||
dst_rgb += 3;
|
||||
}
|
||||
}
|
||||
|
||||
void MergeUVRow_16_C(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int scale,
|
||||
int width) {
|
||||
int x;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
dst_uv[0] = src_u[x] * scale;
|
||||
dst_uv[1] = src_v[x] * scale;
|
||||
dst_uv[2] = src_u[x + 1] * scale;
|
||||
dst_uv[3] = src_v[x + 1] * scale;
|
||||
dst_uv += 4;
|
||||
}
|
||||
if (width & 1) {
|
||||
dst_uv[0] = src_u[width - 1] * scale;
|
||||
dst_uv[1] = src_v[width - 1] * scale;
|
||||
}
|
||||
}
|
||||
|
||||
void MultiplyRow_16_C(const uint16* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
dst_y[x] = src_y[x] * scale;
|
||||
}
|
||||
}
|
||||
|
||||
// Portable reference: copies `count` bytes from src to dst via memcpy.
void CopyRow_C(const uint8* src, uint8* dst, int count) {
  const size_t bytes = count;  // same int -> size_t conversion memcpy applies
  memcpy(dst, src, bytes);
}
|
||||
|
@ -2639,6 +2696,62 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
|
|||
}
|
||||
#endif
|
||||
|
||||
// Portable reference: writes src[i] * scale to dst[i] for `width` samples and
// returns the sum of the squares of the unscaled input samples.
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
  float sum_sq = 0.f;
  int x;
#if defined(__clang__)
#pragma clang loop vectorize_width(4)
#endif
  for (x = 0; x < width; ++x) {
    const float sample = src[x];
    sum_sq += sample * sample;
    dst[x] = sample * scale;
  }
  return sum_sq;
}
|
||||
|
||||
// Portable reference: writes src[i] * scale to dst[i] for `width` samples and
// returns the largest unscaled input sample. The running maximum starts at
// 0, so the result is never below 0.
float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
  float peak = 0.f;
  int x;
  for (x = 0; x < width; ++x) {
    const float sample = src[x];
    const float scaled = sample * scale;
    if (sample > peak) {
      peak = sample;
    }
    dst[x] = scaled;
  }
  return peak;
}
|
||||
|
||||
// Portable reference: writes src[i] * scale to dst[i] for `width` samples.
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
  int x = 0;
  while (x < width) {
    dst[x] = src[x] * scale;
    ++x;
  }
}
|
||||
|
||||
void GaussRow_C(const uint32* src, uint16* dst, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
*dst++ =
|
||||
(src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
|
||||
++src;
|
||||
}
|
||||
}
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
void GaussCol_C(const uint16* src0,
|
||||
const uint16* src1,
|
||||
const uint16* src2,
|
||||
const uint16* src3,
|
||||
const uint16* src4,
|
||||
uint32* dst,
|
||||
int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
*dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
|
|
@ -38,9 +38,8 @@ static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
|
|||
static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
|
||||
127, -84, -43, 0, 127, -84, -43, 0};
|
||||
|
||||
static vec8 kARGBToV = {
|
||||
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
|
||||
};
|
||||
static vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
|
||||
-18, -94, 112, 0, -18, -94, 112, 0};
|
||||
|
||||
static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
|
||||
-20, -107, 127, 0, -20, -107, 127, 0};
|
||||
|
@ -2754,6 +2753,280 @@ void MergeUVRow_SSE2(const uint8* src_u,
|
|||
}
|
||||
#endif // HAS_MERGEUVROW_SSE2
|
||||
|
||||
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MERGEUVROW_16_AVX2
// Interleaves a row of 16-bit U samples with a row of 16-bit V samples into
// |dst_uv| (U first), multiplying every sample by |scale|. Only the low 16
// bits of |scale| are broadcast, and vpmullw keeps the low 16 bits of each
// product. One loop iteration consumes 16 U and 16 V samples (0x20 bytes
// each) and writes 0x40 bytes of output; the loop runs until the remaining
// width is <= 0, so whole 16-sample groups are always processed.
void MergeUVRow_16_AVX2(const uint16* src_u,
                        const uint16* src_v,
                        uint16* dst_uv,
                        int scale,
                        int width) {
  // clang-format off
  asm volatile (
    // Broadcast the low word of scale to all 16 word lanes of ymm3.
    "vmovd %4,%%xmm3 \n"
    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
    "vbroadcastss %%xmm3,%%ymm3 \n"
    // %1 becomes (src_v - src_u) so one pointer increment advances both rows.
    "sub %0,%1 \n"

    // 16 pixels per loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"       // 16 U samples
    "vmovdqu (%0,%1,1),%%ymm1 \n"  // 16 V samples
    "add $0x20,%0 \n"

    "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
    "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n"  // mutates
    "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
    // The unpacks work per 128-bit lane, so the four halves are stored in
    // low/high order of lane 0 followed by low/high order of lane 1.
    "vextractf128 $0x0,%%ymm2,(%2) \n"
    "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
    "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
    "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
    "add $0x40,%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  : "r"(scale)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
  // clang-format on
}
#endif  // HAS_MERGEUVROW_16_AVX2
|
||||
|
||||
#ifdef HAS_MULTIPLYROW_16_AVX2
// Multiplies every 16-bit sample of |src_y| by |scale| and stores the low 16
// bits of each product in |dst_y|. Only the low 16 bits of |scale| are
// broadcast. One loop iteration handles 0x40 bytes, i.e. 32 samples, and the
// loop runs until the remaining width is <= 0, so whole 32-sample groups are
// always processed.
void MultiplyRow_16_AVX2(const uint16* src_y,
                         uint16* dst_y,
                         int scale,
                         int width) {
  // clang-format off
  asm volatile (
    // Broadcast the low word of scale to all 16 word lanes of ymm3.
    "vmovd %3,%%xmm3 \n"
    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
    "vbroadcastss %%xmm3,%%ymm3 \n"
    // %1 becomes (dst_y - src_y); stores then address dst as (%0,%1).
    "sub %0,%1 \n"

    // 32 pixels per loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vmovdqu 0x20(%0),%%ymm1 \n"
    "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
    "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0,(%0,%1) \n"
    "vmovdqu %%ymm1,0x20(%0,%1) \n"
    "add $0x40,%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_y),  // %0
    "+r"(dst_y),  // %1
    "+r"(width)   // %2
  : "r"(scale)    // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm3");
  // clang-format on
}
#endif  // HAS_MULTIPLYROW_16_AVX2
|
||||
|
||||
#ifdef HAS_SPLITRGBROW_SSSE3
|
||||
|
||||
// Shuffle table for converting RGB to Planar.
|
||||
static uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
|
||||
128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
|
||||
2u, 5u, 8u, 11u, 14u, 128u,
|
||||
128u, 128u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 1u,
|
||||
4u, 7u, 10u, 13u};
|
||||
|
||||
static uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
|
||||
3u, 6u, 9u, 12u, 15u, 128u,
|
||||
128u, 128u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 2u,
|
||||
5u, 8u, 11u, 14u};
|
||||
|
||||
static uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
|
||||
4u, 7u, 10u, 13u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u, 0u, 3u,
|
||||
6u, 9u, 12u, 15u};
|
||||
|
||||
void SplitRGBRow_SSSE3(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width) {
|
||||
asm volatile (
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
|
||||
"pshufb %5, %%xmm0 \n"
|
||||
"pshufb %6, %%xmm1 \n"
|
||||
"pshufb %7, %%xmm2 \n"
|
||||
"por %%xmm1,%%xmm0 \n"
|
||||
"por %%xmm2,%%xmm0 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
|
||||
"pshufb %8, %%xmm0 \n"
|
||||
"pshufb %9, %%xmm1 \n"
|
||||
"pshufb %10, %%xmm2 \n"
|
||||
"por %%xmm1,%%xmm0 \n"
|
||||
"por %%xmm2,%%xmm0 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
|
||||
"pshufb %11, %%xmm0 \n"
|
||||
"pshufb %12, %%xmm1 \n"
|
||||
"pshufb %13, %%xmm2 \n"
|
||||
"por %%xmm1,%%xmm0 \n"
|
||||
"por %%xmm2,%%xmm0 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(3) " \n"
|
||||
"lea " MEMLEA(0x10,3) ",%3 \n"
|
||||
"lea " MEMLEA(0x30,0) ",%0 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_rgb), // %0
|
||||
"+r"(dst_r), // %1
|
||||
"+r"(dst_g), // %2
|
||||
"+r"(dst_b), // %3
|
||||
"+r"(width) // %4
|
||||
: "m"(kShuffleMaskRGBToR0), // %5
|
||||
"m"(kShuffleMaskRGBToR1), // %6
|
||||
"m"(kShuffleMaskRGBToR2), // %7
|
||||
"m"(kShuffleMaskRGBToG0), // %8
|
||||
"m"(kShuffleMaskRGBToG1), // %9
|
||||
"m"(kShuffleMaskRGBToG2), // %10
|
||||
"m"(kShuffleMaskRGBToB0), // %11
|
||||
"m"(kShuffleMaskRGBToB1), // %12
|
||||
"m"(kShuffleMaskRGBToB2) // %13
|
||||
: "memory", "cc", NACL_R14
|
||||
"xmm0", "xmm1", "xmm2"
|
||||
);
|
||||
}
|
||||
#endif // HAS_SPLITRGBROW_SSSE3
|
||||
|
||||
#ifdef HAS_MERGERGBROW_SSSE3
|
||||
|
||||
// Shuffle table for converting RGB to Planar.
|
||||
static uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
|
||||
2u, 128u, 128u, 3u, 128u, 128u,
|
||||
4u, 128u, 128u, 5u};
|
||||
static uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
|
||||
128u, 2u, 128u, 128u, 3u, 128u,
|
||||
128u, 4u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
|
||||
128u, 128u, 2u, 128u, 128u, 3u,
|
||||
128u, 128u, 4u, 128u};
|
||||
|
||||
static uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
|
||||
7u, 128u, 128u, 8u, 128u, 128u,
|
||||
9u, 128u, 128u, 10u};
|
||||
static uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
|
||||
128u, 7u, 128u, 128u, 8u, 128u,
|
||||
128u, 9u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
|
||||
128u, 128u, 8u, 128u, 128u, 9u,
|
||||
128u, 128u, 10u, 128u};
|
||||
|
||||
static uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
|
||||
12u, 128u, 128u, 13u, 128u, 128u,
|
||||
14u, 128u, 128u, 15u};
|
||||
static uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
|
||||
128u, 13u, 128u, 128u, 14u, 128u,
|
||||
128u, 15u, 128u, 128u};
|
||||
static uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
|
||||
128u, 128u, 13u, 128u, 128u, 14u,
|
||||
128u, 128u, 15u, 128u};
|
||||
|
||||
void MergeRGBRow_SSSE3(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
|
||||
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
|
||||
"pshufb %5, %%xmm0 \n"
|
||||
"pshufb %6, %%xmm1 \n"
|
||||
"pshufb %7, %%xmm2 \n"
|
||||
"por %%xmm1,%%xmm0 \n"
|
||||
"por %%xmm2,%%xmm0 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(3) " \n"
|
||||
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
|
||||
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
|
||||
"pshufb %8, %%xmm0 \n"
|
||||
"pshufb %9, %%xmm1 \n"
|
||||
"pshufb %10, %%xmm2 \n"
|
||||
"por %%xmm1,%%xmm0 \n"
|
||||
"por %%xmm2,%%xmm0 \n"
|
||||
"movdqu %%xmm0," MEMACCESS2(16, 3) " \n"
|
||||
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
|
||||
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
|
||||
"pshufb %11, %%xmm0 \n"
|
||||
"pshufb %12, %%xmm1 \n"
|
||||
"pshufb %13, %%xmm2 \n"
|
||||
"por %%xmm1,%%xmm0 \n"
|
||||
"por %%xmm2,%%xmm0 \n"
|
||||
"movdqu %%xmm0," MEMACCESS2(32, 3) " \n"
|
||||
|
||||
"lea " MEMLEA(0x10,0) ",%0 \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"lea " MEMLEA(0x30,3) ",%3 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
"+r"(src_g), // %1
|
||||
"+r"(src_b), // %2
|
||||
"+r"(dst_rgb), // %3
|
||||
"+r"(width) // %4
|
||||
: "m"(kShuffleMaskRToRGB0), // %5
|
||||
"m"(kShuffleMaskGToRGB0), // %6
|
||||
"m"(kShuffleMaskBToRGB0), // %7
|
||||
"m"(kShuffleMaskRToRGB1), // %8
|
||||
"m"(kShuffleMaskGToRGB1), // %9
|
||||
"m"(kShuffleMaskBToRGB1), // %10
|
||||
"m"(kShuffleMaskRToRGB2), // %11
|
||||
"m"(kShuffleMaskGToRGB2), // %12
|
||||
"m"(kShuffleMaskBToRGB2) // %13
|
||||
: "memory", "cc", NACL_R14
|
||||
"xmm0", "xmm1", "xmm2"
|
||||
);
|
||||
}
|
||||
#endif // HAS_MERGERGBROW_SSSE3
|
||||
|
||||
#ifdef HAS_COPYROW_SSE2
|
||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
||||
asm volatile (
|
||||
|
@ -5453,6 +5726,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
|||
#ifdef HAS_HALFFLOATROW_SSE2
|
||||
static float kScaleBias = 1.9259299444e-34f;
|
||||
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
scale *= kScaleBias;
|
||||
asm volatile (
|
||||
"pshufd $0x0,%3,%%xmm4 \n"
|
||||
"pxor %%xmm5,%%xmm5 \n"
|
||||
|
@ -5479,7 +5753,11 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
|
|||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "x"(scale * kScaleBias) // %3
|
||||
#if defined(__x86_64__)
|
||||
: "x"(scale) // %3
|
||||
#else
|
||||
: "m"(scale) // %3
|
||||
#endif
|
||||
: "memory", "cc",
|
||||
"xmm2", "xmm3", "xmm4", "xmm5"
|
||||
);
|
||||
|
@ -5488,6 +5766,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
|
|||
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
scale *= kScaleBias;
|
||||
asm volatile (
|
||||
"vbroadcastss %3, %%ymm4 \n"
|
||||
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
|
@ -5515,7 +5794,11 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
|||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "x"(scale * kScaleBias) // %3
|
||||
#if defined(__x86_64__)
|
||||
: "x"(scale) // %3
|
||||
#else
|
||||
: "m"(scale) // %3
|
||||
#endif
|
||||
: "memory", "cc",
|
||||
"xmm2", "xmm3", "xmm4", "xmm5"
|
||||
);
|
||||
|
@ -5548,7 +5831,11 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
|
|||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
#if defined(__x86_64__)
|
||||
: "x"(scale) // %3
|
||||
#else
|
||||
: "m"(scale) // %3
|
||||
#endif
|
||||
: "memory", "cc",
|
||||
"xmm2", "xmm3", "xmm4"
|
||||
);
|
||||
|
|
|
@ -2917,7 +2917,7 @@ void InterpolateRow_MSA(uint8* dst_ptr,
|
|||
|
||||
void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
|
||||
int x;
|
||||
v16u8 dst0 = (v16u8)__msa_fill_w(v32);
|
||||
v4i32 dst0 = __builtin_msa_fill_w(v32);
|
||||
|
||||
for (x = 0; x < width; x += 4) {
|
||||
ST_UB(dst0, dst_argb);
|
||||
|
@ -2969,6 +2969,524 @@ void MergeUVRow_MSA(const uint8* src_u,
|
|||
}
|
||||
}
|
||||
|
||||
void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width) {
|
||||
int i;
|
||||
v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
|
||||
|
||||
for (i = 0; i < width; i += 16) {
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
|
||||
src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
|
||||
vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
|
||||
vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
|
||||
dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
|
||||
ST_UB(dst0, dst_a);
|
||||
src_argb += 64;
|
||||
dst_a += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void ARGBBlendRow_MSA(const uint8* src_argb0,
|
||||
const uint8* src_argb1,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
int x;
|
||||
v16u8 src0, src1, src2, src3, dst0, dst1;
|
||||
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
|
||||
v8u16 const_256 = (v8u16)__msa_ldi_h(256);
|
||||
v16u8 const_255 = (v16u8)__msa_ldi_b(255);
|
||||
v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
|
||||
v16i8 zero = {0};
|
||||
|
||||
for (x = 0; x < width; x += 8) {
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
|
||||
src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
|
||||
vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
|
||||
vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
|
||||
vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
|
||||
vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
|
||||
vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
|
||||
vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
|
||||
vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3);
|
||||
vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3);
|
||||
vec8 = (v8u16)__msa_fill_h(vec0[3]);
|
||||
vec9 = (v8u16)__msa_fill_h(vec0[7]);
|
||||
vec10 = (v8u16)__msa_fill_h(vec1[3]);
|
||||
vec11 = (v8u16)__msa_fill_h(vec1[7]);
|
||||
vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
|
||||
vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
|
||||
vec10 = (v8u16)__msa_fill_h(vec2[3]);
|
||||
vec11 = (v8u16)__msa_fill_h(vec2[7]);
|
||||
vec12 = (v8u16)__msa_fill_h(vec3[3]);
|
||||
vec13 = (v8u16)__msa_fill_h(vec3[7]);
|
||||
vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
|
||||
vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12);
|
||||
vec8 = const_256 - vec8;
|
||||
vec9 = const_256 - vec9;
|
||||
vec10 = const_256 - vec10;
|
||||
vec11 = const_256 - vec11;
|
||||
vec8 *= vec4;
|
||||
vec9 *= vec5;
|
||||
vec10 *= vec6;
|
||||
vec11 *= vec7;
|
||||
vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8);
|
||||
vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
|
||||
vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
|
||||
vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
|
||||
vec0 += vec8;
|
||||
vec1 += vec9;
|
||||
vec2 += vec10;
|
||||
vec3 += vec11;
|
||||
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
|
||||
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
|
||||
dst0 = __msa_bmnz_v(dst0, const_255, mask);
|
||||
dst1 = __msa_bmnz_v(dst1, const_255, mask);
|
||||
ST_UB2(dst0, dst1, dst_argb, 16);
|
||||
src_argb0 += 32;
|
||||
src_argb1 += 32;
|
||||
dst_argb += 32;
|
||||
}
|
||||
}
|
||||
|
||||
// Quantizes a row of 4-byte ARGB pixels in place. For bytes 0..2 of every
// pixel the new value is
//   ((v * scale) >> 16) * interval_size + interval_offset
// (byte arithmetic for the final multiply/add); byte 3 of every pixel is
// restored from the original data.
//   dst_argb:        pixels to quantize, modified in place.
//   scale:           32-bit multiplier applied before the >> 16.
//   interval_size:   byte multiplier applied after the shift.
//   interval_offset: byte offset added last.
//   width:           number of pixels.
// Each iteration loads, stores and advances 64 bytes, i.e. 16 pixels, so the
// pixel counter must step by 16. Stepping by 8 ran the loop twice as often as
// needed and re-quantized (and overran) up to |width| extra pixels past the
// row. Whole 16-pixel groups are still always processed, so callers must
// provide a buffer padded to a multiple of 16 pixels.
void ARGBQuantizeRow_MSA(uint8* dst_argb,
                         int scale,
                         int interval_size,
                         int interval_offset,
                         int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  v4i32 vec_scale = __msa_fill_w(scale);
  v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size);
  v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset);
  // Indices 19, 23, 27, 31 pick byte 3 of each pixel from the original
  // source vector; all other bytes come from the quantized vector.
  v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48);

    // Widen bytes to 16 bits...
    vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0);
    vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0);
    vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
    vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
    vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
    vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
    vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3);
    vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3);

    // ...and then to 32 bits so the multiply by scale cannot lose bits.
    tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
    tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
    tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
    tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
    tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2);
    tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2);
    tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3);
    tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3);
    tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4);
    tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4);
    tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5);
    tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5);
    tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6);
    tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6);
    tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7);
    tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7);

    // (v * scale) >> 16
    tmp0 *= vec_scale;
    tmp1 *= vec_scale;
    tmp2 *= vec_scale;
    tmp3 *= vec_scale;
    tmp4 *= vec_scale;
    tmp5 *= vec_scale;
    tmp6 *= vec_scale;
    tmp7 *= vec_scale;
    tmp8 *= vec_scale;
    tmp9 *= vec_scale;
    tmp10 *= vec_scale;
    tmp11 *= vec_scale;
    tmp12 *= vec_scale;
    tmp13 *= vec_scale;
    tmp14 *= vec_scale;
    tmp15 *= vec_scale;
    tmp0 >>= 16;
    tmp1 >>= 16;
    tmp2 >>= 16;
    tmp3 >>= 16;
    tmp4 >>= 16;
    tmp5 >>= 16;
    tmp6 >>= 16;
    tmp7 >>= 16;
    tmp8 >>= 16;
    tmp9 >>= 16;
    tmp10 >>= 16;
    tmp11 >>= 16;
    tmp12 >>= 16;
    tmp13 >>= 16;
    tmp14 >>= 16;
    tmp15 >>= 16;

    // Narrow 32 -> 16 -> 8 bits.
    vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
    vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
    vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
    vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
    vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
    vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);

    // * interval_size + interval_offset, in byte lanes.
    dst0 *= vec_int_sz;
    dst1 *= vec_int_sz;
    dst2 *= vec_int_sz;
    dst3 *= vec_int_sz;
    dst0 += vec_int_ofst;
    dst1 += vec_int_ofst;
    dst2 += vec_int_ofst;
    dst3 += vec_int_ofst;

    // Put the untouched byte 3 of every pixel back.
    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0);
    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1);
    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2);
    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    dst_argb += 64;
  }
}
|
||||
|
||||
void ARGBColorMatrixRow_MSA(const uint8* src_argb,
|
||||
uint8* dst_argb,
|
||||
const int8* matrix_argb,
|
||||
int width) {
|
||||
int32 x;
|
||||
v16i8 src0;
|
||||
v16u8 src1, src2, dst0, dst1;
|
||||
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
|
||||
v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
|
||||
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
|
||||
v16i8 zero = {0};
|
||||
v8i16 max = __msa_ldi_h(255);
|
||||
|
||||
src0 = __msa_ld_b((v16i8*)matrix_argb, 0);
|
||||
vec0 = (v8i16)__msa_ilvr_b(zero, src0);
|
||||
vec1 = (v8i16)__msa_ilvl_b(zero, src0);
|
||||
|
||||
for (x = 0; x < width; x += 8) {
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
|
||||
vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
|
||||
vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
|
||||
vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
|
||||
vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
|
||||
vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2);
|
||||
vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3);
|
||||
vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4);
|
||||
vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5);
|
||||
vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2);
|
||||
vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3);
|
||||
vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4);
|
||||
vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5);
|
||||
vec10 = vec2 * vec0;
|
||||
vec11 = vec2 * vec1;
|
||||
vec12 = vec6 * vec0;
|
||||
vec13 = vec6 * vec1;
|
||||
tmp0 = __msa_hadd_s_w(vec10, vec10);
|
||||
tmp1 = __msa_hadd_s_w(vec11, vec11);
|
||||
tmp2 = __msa_hadd_s_w(vec12, vec12);
|
||||
tmp3 = __msa_hadd_s_w(vec13, vec13);
|
||||
vec14 = vec3 * vec0;
|
||||
vec15 = vec3 * vec1;
|
||||
vec16 = vec7 * vec0;
|
||||
vec17 = vec7 * vec1;
|
||||
tmp4 = __msa_hadd_s_w(vec14, vec14);
|
||||
tmp5 = __msa_hadd_s_w(vec15, vec15);
|
||||
tmp6 = __msa_hadd_s_w(vec16, vec16);
|
||||
tmp7 = __msa_hadd_s_w(vec17, vec17);
|
||||
vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
|
||||
vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
|
||||
vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
|
||||
vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
|
||||
tmp0 = __msa_hadd_s_w(vec10, vec10);
|
||||
tmp1 = __msa_hadd_s_w(vec11, vec11);
|
||||
tmp2 = __msa_hadd_s_w(vec12, vec12);
|
||||
tmp3 = __msa_hadd_s_w(vec13, vec13);
|
||||
tmp0 = __msa_srai_w(tmp0, 6);
|
||||
tmp1 = __msa_srai_w(tmp1, 6);
|
||||
tmp2 = __msa_srai_w(tmp2, 6);
|
||||
tmp3 = __msa_srai_w(tmp3, 6);
|
||||
vec2 = vec4 * vec0;
|
||||
vec6 = vec4 * vec1;
|
||||
vec3 = vec8 * vec0;
|
||||
vec7 = vec8 * vec1;
|
||||
tmp8 = __msa_hadd_s_w(vec2, vec2);
|
||||
tmp9 = __msa_hadd_s_w(vec6, vec6);
|
||||
tmp10 = __msa_hadd_s_w(vec3, vec3);
|
||||
tmp11 = __msa_hadd_s_w(vec7, vec7);
|
||||
vec4 = vec5 * vec0;
|
||||
vec8 = vec5 * vec1;
|
||||
vec5 = vec9 * vec0;
|
||||
vec9 = vec9 * vec1;
|
||||
tmp12 = __msa_hadd_s_w(vec4, vec4);
|
||||
tmp13 = __msa_hadd_s_w(vec8, vec8);
|
||||
tmp14 = __msa_hadd_s_w(vec5, vec5);
|
||||
tmp15 = __msa_hadd_s_w(vec9, vec9);
|
||||
vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
|
||||
vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
|
||||
vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
|
||||
vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
|
||||
tmp4 = __msa_hadd_s_w(vec14, vec14);
|
||||
tmp5 = __msa_hadd_s_w(vec15, vec15);
|
||||
tmp6 = __msa_hadd_s_w(vec16, vec16);
|
||||
tmp7 = __msa_hadd_s_w(vec17, vec17);
|
||||
tmp4 = __msa_srai_w(tmp4, 6);
|
||||
tmp5 = __msa_srai_w(tmp5, 6);
|
||||
tmp6 = __msa_srai_w(tmp6, 6);
|
||||
tmp7 = __msa_srai_w(tmp7, 6);
|
||||
vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
|
||||
vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
|
||||
vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
|
||||
vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
|
||||
vec10 = __msa_maxi_s_h(vec10, 0);
|
||||
vec11 = __msa_maxi_s_h(vec11, 0);
|
||||
vec12 = __msa_maxi_s_h(vec12, 0);
|
||||
vec13 = __msa_maxi_s_h(vec13, 0);
|
||||
vec10 = __msa_min_s_h(vec10, max);
|
||||
vec11 = __msa_min_s_h(vec11, max);
|
||||
vec12 = __msa_min_s_h(vec12, max);
|
||||
vec13 = __msa_min_s_h(vec13, max);
|
||||
dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
|
||||
dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12);
|
||||
ST_UB2(dst0, dst1, dst_argb, 16);
|
||||
src_argb += 32;
|
||||
dst_argb += 32;
|
||||
}
|
||||
}
|
||||
|
||||
void SplitUVRow_MSA(const uint8* src_uv,
|
||||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
int width) {
|
||||
int x;
|
||||
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
|
||||
|
||||
for (x = 0; x < width; x += 32) {
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
|
||||
src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
|
||||
dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
|
||||
dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
|
||||
dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
|
||||
dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
|
||||
ST_UB2(dst0, dst1, dst_u, 16);
|
||||
ST_UB2(dst2, dst3, dst_v, 16);
|
||||
src_uv += 64;
|
||||
dst_u += 32;
|
||||
dst_v += 32;
|
||||
}
|
||||
}
|
||||
|
||||
// Fills |dst| with the byte |v8|, 16 bytes per store. The loop steps by 16
// until the counter reaches |width|, so whole 16-byte groups are always
// written.
void SetRow_MSA(uint8* dst, uint8 v8, int width) {
  int x;
  v16u8 fill = (v16u8)__msa_fill_b(v8);

  for (x = 0; x < width; x += 16) {
    ST_UB(fill, dst);
    dst += 16;
  }
}
|
||||
|
||||
void MirrorUVRow_MSA(const uint8* src_uv,
|
||||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
int width) {
|
||||
int x;
|
||||
v16u8 src0, src1, src2, src3;
|
||||
v16u8 dst0, dst1, dst2, dst3;
|
||||
v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
|
||||
v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
|
||||
|
||||
src_uv += (2 * width);
|
||||
|
||||
for (x = 0; x < width; x += 32) {
|
||||
src_uv -= 64;
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
|
||||
src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
|
||||
dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
|
||||
dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
|
||||
dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
|
||||
dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
|
||||
ST_UB2(dst0, dst1, dst_v, 16);
|
||||
ST_UB2(dst2, dst3, dst_u, 16);
|
||||
dst_u += 32;
|
||||
dst_v += 32;
|
||||
}
|
||||
}
|
||||
|
||||
void SobelXRow_MSA(const uint8* src_y0,
|
||||
const uint8* src_y1,
|
||||
const uint8* src_y2,
|
||||
uint8* dst_sobelx,
|
||||
int32 width) {
|
||||
int x;
|
||||
v16u8 src0, src1, src2, src3, src4, src5, dst0;
|
||||
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
|
||||
v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
|
||||
v16i8 tmp = __msa_ldi_b(8);
|
||||
v16i8 mask1 = mask0 + tmp;
|
||||
v8i16 zero = {0};
|
||||
v8i16 max = __msa_ldi_h(255);
|
||||
|
||||
for (x = 0; x < width; x += 16) {
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)src_y0, 16);
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
|
||||
src3 = (v16u8)__msa_ld_b((v16i8*)src_y1, 16);
|
||||
src4 = (v16u8)__msa_ld_b((v16i8*)src_y2, 0);
|
||||
src5 = (v16u8)__msa_ld_b((v16i8*)src_y2, 16);
|
||||
vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
|
||||
vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
|
||||
vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
|
||||
vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
|
||||
vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
|
||||
vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
|
||||
vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
|
||||
vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
|
||||
vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
|
||||
vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
|
||||
vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
|
||||
vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
|
||||
vec0 += vec2;
|
||||
vec1 += vec3;
|
||||
vec4 += vec2;
|
||||
vec5 += vec3;
|
||||
vec0 += vec4;
|
||||
vec1 += vec5;
|
||||
vec0 = __msa_add_a_h(zero, vec0);
|
||||
vec1 = __msa_add_a_h(zero, vec1);
|
||||
vec0 = __msa_maxi_s_h(vec0, 0);
|
||||
vec1 = __msa_maxi_s_h(vec1, 0);
|
||||
vec0 = __msa_min_s_h(max, vec0);
|
||||
vec1 = __msa_min_s_h(max, vec1);
|
||||
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
|
||||
ST_UB(dst0, dst_sobelx);
|
||||
src_y0 += 16;
|
||||
src_y1 += 16;
|
||||
src_y2 += 16;
|
||||
dst_sobelx += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void SobelYRow_MSA(const uint8* src_y0,
|
||||
const uint8* src_y1,
|
||||
uint8* dst_sobely,
|
||||
int32 width) {
|
||||
int x;
|
||||
v16u8 src0, src1, dst0;
|
||||
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
|
||||
v8i16 zero = {0};
|
||||
v8i16 max = __msa_ldi_h(255);
|
||||
|
||||
for (x = 0; x < width; x += 16) {
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
|
||||
vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
|
||||
vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
|
||||
vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
|
||||
vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
|
||||
vec0 -= vec2;
|
||||
vec1 -= vec3;
|
||||
vec6[0] = src_y0[16] - src_y1[16];
|
||||
vec6[1] = src_y0[17] - src_y1[17];
|
||||
vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
|
||||
vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
|
||||
vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
|
||||
vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
|
||||
vec0 += vec2;
|
||||
vec1 += vec3;
|
||||
vec4 += vec2;
|
||||
vec5 += vec3;
|
||||
vec0 += vec4;
|
||||
vec1 += vec5;
|
||||
vec0 = __msa_add_a_h(zero, vec0);
|
||||
vec1 = __msa_add_a_h(zero, vec1);
|
||||
vec0 = __msa_maxi_s_h(vec0, 0);
|
||||
vec1 = __msa_maxi_s_h(vec1, 0);
|
||||
vec0 = __msa_min_s_h(max, vec0);
|
||||
vec1 = __msa_min_s_h(max, vec1);
|
||||
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
|
||||
ST_UB(dst0, dst_sobely);
|
||||
src_y0 += 16;
|
||||
src_y1 += 16;
|
||||
dst_sobely += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width) {
|
||||
int i;
|
||||
v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
|
||||
v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
|
||||
v4f32 mult_vec;
|
||||
v8i16 zero = {0};
|
||||
mult_vec[0] = 1.9259299444e-34f * scale;
|
||||
mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);
|
||||
|
||||
for (i = 0; i < width; i += 32) {
|
||||
src0 = (v8u16)__msa_ld_h((v8i16*)src, 0);
|
||||
src1 = (v8u16)__msa_ld_h((v8i16*)src, 16);
|
||||
src2 = (v8u16)__msa_ld_h((v8i16*)src, 32);
|
||||
src3 = (v8u16)__msa_ld_h((v8i16*)src, 48);
|
||||
vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
|
||||
vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
|
||||
vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
|
||||
vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
|
||||
vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
|
||||
vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
|
||||
vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
|
||||
vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
|
||||
fvec0 = __msa_ffint_u_w(vec0);
|
||||
fvec1 = __msa_ffint_u_w(vec1);
|
||||
fvec2 = __msa_ffint_u_w(vec2);
|
||||
fvec3 = __msa_ffint_u_w(vec3);
|
||||
fvec4 = __msa_ffint_u_w(vec4);
|
||||
fvec5 = __msa_ffint_u_w(vec5);
|
||||
fvec6 = __msa_ffint_u_w(vec6);
|
||||
fvec7 = __msa_ffint_u_w(vec7);
|
||||
fvec0 *= mult_vec;
|
||||
fvec1 *= mult_vec;
|
||||
fvec2 *= mult_vec;
|
||||
fvec3 *= mult_vec;
|
||||
fvec4 *= mult_vec;
|
||||
fvec5 *= mult_vec;
|
||||
fvec6 *= mult_vec;
|
||||
fvec7 *= mult_vec;
|
||||
vec0 = ((v4u32)fvec0) >> 13;
|
||||
vec1 = ((v4u32)fvec1) >> 13;
|
||||
vec2 = ((v4u32)fvec2) >> 13;
|
||||
vec3 = ((v4u32)fvec3) >> 13;
|
||||
vec4 = ((v4u32)fvec4) >> 13;
|
||||
vec5 = ((v4u32)fvec5) >> 13;
|
||||
vec6 = ((v4u32)fvec6) >> 13;
|
||||
vec7 = ((v4u32)fvec7) >> 13;
|
||||
dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
|
||||
dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
|
||||
dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
|
||||
dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
|
||||
ST_UH2(dst0, dst1, dst, 8);
|
||||
ST_UH2(dst2, dst3, dst + 16, 8);
|
||||
src += 32;
|
||||
dst += 32;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
|
|
@ -115,7 +115,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
|
|||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READYUV444 YUVTORGB
|
||||
"1: \n" READYUV444 YUVTORGB
|
||||
"subs %4, %4, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
|
||||
"bgt 1b \n"
|
||||
|
@ -141,7 +141,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
|
|||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"subs %4, %4, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
|
||||
"bgt 1b \n"
|
||||
|
@ -167,7 +167,7 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y,
|
|||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"subs %5, %5, #8 \n"
|
||||
"vld1.8 {d23}, [%3]! \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%4]! \n"
|
||||
|
@ -194,7 +194,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
|
|||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"subs %4, %4, #8 \n"
|
||||
"vmov.u8 d19, #255 \n" // d19 modified by
|
||||
// YUVTORGB
|
||||
|
@ -221,7 +221,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
|
|||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"subs %4, %4, #8 \n"
|
||||
"vst3.8 {d20, d21, d22}, [%3]! \n"
|
||||
"bgt 1b \n"
|
||||
|
@ -253,7 +253,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
|
|||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"subs %4, %4, #8 \n" ARGBTORGB565
|
||||
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
|
||||
"bgt 1b \n"
|
||||
|
@ -287,7 +287,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
|
|||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"subs %4, %4, #8 \n"
|
||||
"vmov.u8 d23, #255 \n" ARGBTOARGB1555
|
||||
"vst1.8 {q0}, [%3]! \n" // store 8 pixels
|
||||
|
@ -325,7 +325,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
|
|||
YUVTORGB_SETUP
|
||||
"vmov.u8 d4, #0x0f \n" // bits to clear with
|
||||
// vbic.
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
"subs %4, %4, #8 \n"
|
||||
"vmov.u8 d23, #255 \n" ARGBTOARGB4444
|
||||
"vst1.8 {q0}, [%3]! \n" // store 8 pixels
|
||||
|
@ -348,7 +348,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
|
|||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READYUV400 YUVTORGB
|
||||
"1: \n" READYUV400 YUVTORGB
|
||||
"subs %2, %2, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
|
@ -366,7 +366,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
|
|||
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d20}, [%0]! \n"
|
||||
"vmov d21, d20 \n"
|
||||
"vmov d22, d20 \n"
|
||||
|
@ -385,23 +385,22 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
|
|||
uint8* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READNV12 YUVTORGB
|
||||
"subs %3, %3, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_uv), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
"+r"(width) // %3
|
||||
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
|
||||
[kUVToG] "r"(&yuvconstants->kUVToG),
|
||||
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
|
||||
[kYToRgb] "r"(&yuvconstants->kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
|
||||
"q12", "q13", "q14", "q15");
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READNV12 YUVTORGB
|
||||
"subs %3, %3, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_uv), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
"+r"(width) // %3
|
||||
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
|
||||
[kUVToG] "r"(&yuvconstants->kUVToG),
|
||||
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
|
||||
[kYToRgb] "r"(&yuvconstants->kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
|
||||
"q10", "q11", "q12", "q13", "q14", "q15");
|
||||
}
|
||||
|
||||
void NV21ToARGBRow_NEON(const uint8* src_y,
|
||||
|
@ -409,23 +408,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
|
|||
uint8* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READNV21 YUVTORGB
|
||||
"subs %3, %3, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_vu), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
"+r"(width) // %3
|
||||
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
|
||||
[kUVToG] "r"(&yuvconstants->kUVToG),
|
||||
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
|
||||
[kYToRgb] "r"(&yuvconstants->kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
|
||||
"q12", "q13", "q14", "q15");
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READNV21 YUVTORGB
|
||||
"subs %3, %3, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_vu), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
"+r"(width) // %3
|
||||
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
|
||||
[kUVToG] "r"(&yuvconstants->kUVToG),
|
||||
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
|
||||
[kYToRgb] "r"(&yuvconstants->kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
|
||||
"q10", "q11", "q12", "q13", "q14", "q15");
|
||||
}
|
||||
|
||||
void NV12ToRGB565Row_NEON(const uint8* src_y,
|
||||
|
@ -435,7 +433,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
|
|||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READNV12 YUVTORGB
|
||||
"1: \n" READNV12 YUVTORGB
|
||||
"subs %3, %3, #8 \n" ARGBTORGB565
|
||||
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
|
||||
"bgt 1b \n"
|
||||
|
@ -455,44 +453,42 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
|
|||
uint8* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READYUY2 YUVTORGB
|
||||
"subs %2, %2, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
|
||||
[kUVToG] "r"(&yuvconstants->kUVToG),
|
||||
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
|
||||
[kYToRgb] "r"(&yuvconstants->kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
|
||||
"q12", "q13", "q14", "q15");
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READYUY2 YUVTORGB
|
||||
"subs %2, %2, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
|
||||
[kUVToG] "r"(&yuvconstants->kUVToG),
|
||||
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
|
||||
[kYToRgb] "r"(&yuvconstants->kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
|
||||
"q10", "q11", "q12", "q13", "q14", "q15");
|
||||
}
|
||||
|
||||
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
|
||||
uint8* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READUYVY YUVTORGB
|
||||
"subs %2, %2, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
|
||||
[kUVToG] "r"(&yuvconstants->kUVToG),
|
||||
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
|
||||
[kYToRgb] "r"(&yuvconstants->kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
|
||||
"q12", "q13", "q14", "q15");
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n" READUYVY YUVTORGB
|
||||
"subs %2, %2, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
|
||||
[kUVToG] "r"(&yuvconstants->kUVToG),
|
||||
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
|
||||
[kYToRgb] "r"(&yuvconstants->kYToRgb)
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
|
||||
"q10", "q11", "q12", "q13", "q14", "q15");
|
||||
}
|
||||
|
||||
// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
|
||||
|
@ -501,7 +497,7 @@ void SplitUVRow_NEON(const uint8* src_uv,
|
|||
uint8* dst_v,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
"vst1.8 {q0}, [%1]! \n" // store U
|
||||
|
@ -522,11 +518,11 @@ void MergeUVRow_NEON(const uint8* src_u,
|
|||
uint8* dst_uv,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load U
|
||||
"vld1.8 {q1}, [%1]! \n" // load V
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
|
||||
"vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
"+r"(src_v), // %1
|
||||
|
@ -537,10 +533,60 @@ void MergeUVRow_NEON(const uint8* src_u,
|
|||
);
|
||||
}
|
||||
|
||||
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
|
||||
void SplitRGBRow_NEON(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
|
||||
"vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop
|
||||
"vst1.8 {q0}, [%1]! \n" // store R
|
||||
"vst1.8 {q1}, [%2]! \n" // store G
|
||||
"vst1.8 {q2}, [%3]! \n" // store B
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_rgb), // %0
|
||||
"+r"(dst_r), // %1
|
||||
"+r"(dst_g), // %2
|
||||
"+r"(dst_b), // %3
|
||||
"+r"(width) // %4
|
||||
: // Input registers
|
||||
: "cc", "memory", "d0", "d1", "d2" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
|
||||
void MergeRGBRow_NEON(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load R
|
||||
"vld1.8 {q1}, [%1]! \n" // load G
|
||||
"vld1.8 {q2}, [%2]! \n" // load B
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop
|
||||
"vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
|
||||
"vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
"+r"(src_g), // %1
|
||||
"+r"(src_b), // %2
|
||||
"+r"(dst_rgb), // %3
|
||||
"+r"(width) // %4
|
||||
: // Input registers
|
||||
: "cc", "memory", "q0", "q1", "q2" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
|
||||
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
|
||||
"subs %2, %2, #32 \n" // 32 processed per loop
|
||||
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
|
||||
|
@ -557,7 +603,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
|
|||
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
|
||||
asm volatile(
|
||||
"vdup.8 q0, %2 \n" // duplicate 16 bytes
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"subs %1, %1, #16 \n" // 16 bytes per loop
|
||||
"vst1.8 {q0}, [%0]! \n" // store
|
||||
"bgt 1b \n"
|
||||
|
@ -571,7 +617,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
|
|||
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
|
||||
asm volatile(
|
||||
"vdup.u32 q0, %2 \n" // duplicate 4 ints
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"subs %1, %1, #4 \n" // 4 pixels per loop
|
||||
"vst1.8 {q0}, [%0]! \n" // store
|
||||
"bgt 1b \n"
|
||||
|
@ -588,7 +634,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
|||
"add %0, %0, %2 \n"
|
||||
"sub %0, #16 \n"
|
||||
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
|
||||
"subs %2, #16 \n" // 16 pixels per loop.
|
||||
"vrev64.8 q0, q0 \n"
|
||||
|
@ -612,7 +658,7 @@ void MirrorUVRow_NEON(const uint8* src_uv,
|
|||
"add %0, %0, %3, lsl #1 \n"
|
||||
"sub %0, #16 \n"
|
||||
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
|
||||
"subs %3, #8 \n" // 8 pixels per loop.
|
||||
"vrev64.8 q0, q0 \n"
|
||||
|
@ -634,7 +680,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
|||
"add %0, %0, %2, lsl #2 \n"
|
||||
"sub %0, #16 \n"
|
||||
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
|
||||
"subs %2, #4 \n" // 4 pixels per loop.
|
||||
"vrev64.32 q0, q0 \n"
|
||||
|
@ -651,7 +697,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
|||
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"vmov.u8 d4, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
|
||||
|
@ -667,7 +713,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
|
|||
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"vmov.u8 d4, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vswp.u8 d1, d3 \n" // swap R, B
|
||||
|
@ -683,7 +729,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
|
|||
|
||||
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vswp.u8 d1, d3 \n" // swap R, B
|
||||
|
@ -713,7 +759,7 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
|
|||
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
|
@ -759,7 +805,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
ARGB1555TOARGB
|
||||
|
@ -788,7 +834,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
|
@ -804,7 +850,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
|
|||
|
||||
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
|
||||
|
@ -820,7 +866,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
|
|||
|
||||
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vswp.u8 d1, d3 \n" // swap R, B
|
||||
|
@ -836,7 +882,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
|
|||
|
||||
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
|
||||
|
@ -851,7 +897,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
|
|||
|
||||
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
|
||||
|
@ -869,7 +915,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
|
|||
uint8* dst_v,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"vst1.8 {d1}, [%1]! \n" // store 8 U.
|
||||
|
@ -889,7 +935,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
|
|||
uint8* dst_v,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"vst1.8 {d0}, [%1]! \n" // store 8 U.
|
||||
|
@ -911,7 +957,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // stride + src_yuy2
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
|
||||
|
@ -938,7 +984,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // stride + src_uyvy
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
|
||||
|
@ -965,7 +1011,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"vld1.8 {q2}, [%3] \n" // shuffler
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
|
||||
"subs %2, %2, #4 \n" // 4 processed per loop
|
||||
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
|
||||
|
@ -986,7 +1032,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
|
|||
uint8* dst_yuy2,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
|
||||
"vld1.8 {d1}, [%1]! \n" // load 8 Us
|
||||
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
|
||||
|
@ -1008,7 +1054,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
|
|||
uint8* dst_uyvy,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
|
||||
"vld1.8 {d0}, [%1]! \n" // load 8 Us
|
||||
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
|
||||
|
@ -1026,7 +1072,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
|
|||
|
||||
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
ARGBTORGB565
|
||||
|
@ -1045,13 +1091,14 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"vdup.32 d2, %2 \n" // dither4
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vqadd.u8 d20, d20, d2 \n"
|
||||
"vqadd.u8 d21, d21, d2 \n"
|
||||
"vqadd.u8 d22, d22, d2 \n" ARGBTORGB565
|
||||
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
|
||||
"vqadd.u8 d22, d22, d2 \n" // add for dither
|
||||
ARGBTORGB565
|
||||
"vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_rgb) // %0
|
||||
: "r"(src_argb), // %1
|
||||
|
@ -1064,12 +1111,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
|
|||
uint8* dst_argb1555,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
ARGBTOARGB1555
|
||||
"vst1.8 {q0}, [%1]! \n" // store 8 pixels
|
||||
// ARGB1555.
|
||||
"vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb1555), // %1
|
||||
|
@ -1084,12 +1130,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
|
|||
asm volatile(
|
||||
"vmov.u8 d4, #0x0f \n" // bits to clear with
|
||||
// vbic.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
ARGBTOARGB4444
|
||||
"vst1.8 {q0}, [%1]! \n" // store 8 pixels
|
||||
// ARGB4444.
|
||||
"vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb4444), // %1
|
||||
|
@ -1104,7 +1149,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
|
|||
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d27, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d24 \n" // B
|
||||
|
@ -1123,7 +1168,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
|
|||
|
||||
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
|
@ -1142,7 +1187,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
|
|||
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
|
||||
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
|
||||
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d24 \n" // B
|
||||
|
@ -1171,7 +1216,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
|
|||
"vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
|
||||
"vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d24 \n" // B
|
||||
|
@ -1199,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
|
|||
"q15");
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
|
||||
#define RGBTOUV(QB, QG, QR) \
|
||||
"vmul.s16 q8, " #QB \
|
||||
", q10 \n" /* B */ \
|
||||
"vmls.s16 q8, " #QG \
|
||||
", q11 \n" /* G */ \
|
||||
"vmls.s16 q8, " #QR \
|
||||
", q12 \n" /* R */ \
|
||||
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
|
||||
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
|
||||
"vmls.s16 q8, " #QR ", q12 \n" /* R */ \
|
||||
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
|
||||
"vmul.s16 q9, " #QR \
|
||||
", q10 \n" /* R */ \
|
||||
"vmls.s16 q9, " #QG \
|
||||
", q14 \n" /* G */ \
|
||||
"vmls.s16 q9, " #QB \
|
||||
", q13 \n" /* B */ \
|
||||
"vmul.s16 q9, " #QR ", q10 \n" /* R */ \
|
||||
"vmls.s16 q9, " #QG ", q14 \n" /* G */ \
|
||||
"vmls.s16 q9, " #QB ", q13 \n" /* B */ \
|
||||
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
|
||||
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
|
||||
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
|
||||
// clang-format on
|
||||
|
||||
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
||||
void ARGBToUVRow_NEON(const uint8* src_argb,
|
||||
|
@ -1232,7 +1273,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
|
@ -1278,7 +1319,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb,
|
|||
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
|
||||
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
|
@ -1323,7 +1364,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
|
||||
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
|
||||
|
@ -1368,7 +1409,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
|
||||
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
|
||||
|
@ -1413,7 +1454,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
|
||||
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
|
||||
|
@ -1458,7 +1499,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
|
||||
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
|
@ -1503,7 +1544,7 @@ void RAWToUVRow_NEON(const uint8* src_raw,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
|
||||
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
|
||||
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
|
||||
|
@ -1550,7 +1591,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
|
||||
RGB565TOARGB
|
||||
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
|
||||
|
@ -1616,7 +1657,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
|
||||
RGB555TOARGB
|
||||
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
|
||||
|
@ -1682,7 +1723,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
|
|||
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
|
||||
ARGB4444TOARGB
|
||||
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
|
||||
|
@ -1739,7 +1780,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
|
|||
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d27, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
|
@ -1763,7 +1804,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
|
|||
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d27, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
ARGB1555TOARGB
|
||||
|
@ -1787,7 +1828,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
|
|||
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d27, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
|
@ -1811,7 +1852,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
|
|||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d1, d4 \n" // R
|
||||
|
@ -1834,7 +1875,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
|
|||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d0, d4 \n" // R
|
||||
|
@ -1857,7 +1898,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
|
|||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d1, d4 \n" // B
|
||||
|
@ -1880,7 +1921,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
|
|||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d0, d4 \n" // B
|
||||
|
@ -1903,7 +1944,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
|
|||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d0, d4 \n" // B
|
||||
|
@ -1938,7 +1979,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
|||
"rsb %4, #256 \n"
|
||||
"vdup.8 d4, %4 \n"
|
||||
// General purpose row blend.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"vld1.8 {q1}, [%2]! \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
|
@ -1953,7 +1994,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
|||
"b 99f \n"
|
||||
|
||||
// Blend 50 / 50.
|
||||
"50: \n"
|
||||
"50: \n"
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"vld1.8 {q1}, [%2]! \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
|
@ -1963,13 +2004,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
|||
"b 99f \n"
|
||||
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
"100: \n"
|
||||
"100: \n"
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"vst1.8 {q0}, [%0]! \n"
|
||||
"bgt 100b \n"
|
||||
|
||||
"99: \n"
|
||||
"99: \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(src_stride), // %2
|
||||
|
@ -1988,7 +2029,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
|
|||
"subs %3, #8 \n"
|
||||
"blt 89f \n"
|
||||
// Blend 8 pixels.
|
||||
"8: \n"
|
||||
"8: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
|
||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
|
@ -2006,12 +2047,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
|
|||
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
|
||||
"bge 8b \n"
|
||||
|
||||
"89: \n"
|
||||
"89: \n"
|
||||
"adds %3, #8-1 \n"
|
||||
"blt 99f \n"
|
||||
|
||||
// Blend 1 pixels.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
|
||||
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
|
||||
"subs %3, %3, #1 \n" // 1 processed per loop.
|
||||
|
@ -2043,7 +2084,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
|
|||
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
// Attenuate 8 pixels.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q10, d0, d3 \n" // b * a
|
||||
|
@ -2075,7 +2116,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
|
|||
"vdup.u16 q10, %4 \n" // interval add
|
||||
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
|
||||
"subs %1, %1, #8 \n" // 8 processed per loop.
|
||||
"vmovl.u8 q0, d0 \n" // b (0 .. 255)
|
||||
|
@ -2116,7 +2157,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
|
|||
"vshr.u16 q0, q0, #1 \n" // scale / 2.
|
||||
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
|
||||
|
@ -2148,7 +2189,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
|||
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
|
||||
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
|
||||
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d24 \n" // B
|
||||
|
@ -2181,7 +2222,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
|
|||
"vmov.u8 d28, #24 \n" // BB coefficient
|
||||
"vmov.u8 d29, #98 \n" // BG coefficient
|
||||
"vmov.u8 d30, #50 \n" // BR coefficient
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
|
||||
"subs %1, %1, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d20 \n" // B to Sepia B
|
||||
|
@ -2217,7 +2258,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
|
|||
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
|
||||
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
|
||||
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
|
||||
|
@ -2273,10 +2314,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
|
|||
int width) {
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
|
||||
// pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q0, d0, d1 \n" // multiply B
|
||||
"vmull.u8 q1, d2, d3 \n" // multiply G
|
||||
|
@ -2288,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
|
|||
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
|
||||
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
|
||||
"bgt 1b \n"
|
||||
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
|
@ -2304,16 +2343,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
|
|||
int width) {
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
|
||||
// pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vqadd.u8 q0, q0, q2 \n" // add B, G
|
||||
"vqadd.u8 q1, q1, q3 \n" // add R, A
|
||||
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
|
||||
"bgt 1b \n"
|
||||
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
|
@ -2329,16 +2366,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
|
|||
int width) {
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
|
||||
// pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vqsub.u8 q0, q0, q2 \n" // subtract B, G
|
||||
"vqsub.u8 q1, q1, q3 \n" // subtract R, A
|
||||
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
|
||||
"bgt 1b \n"
|
||||
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
|
@ -2359,7 +2394,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
|
|||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // alpha
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
|
||||
"vld1.8 {d1}, [%1]! \n" // load 8 sobely.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
|
@ -2383,7 +2418,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
|
|||
int width) {
|
||||
asm volatile(
|
||||
// 16 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
|
||||
"vld1.8 {q1}, [%1]! \n" // load 16 sobely.
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
|
@ -2410,7 +2445,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
|
|||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // alpha
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
|
||||
"vld1.8 {d0}, [%1]! \n" // load 8 sobely.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
|
@ -2435,7 +2470,7 @@ void SobelXRow_NEON(const uint8* src_y0,
|
|||
uint8* dst_sobelx,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0],%5 \n" // top
|
||||
"vld1.8 {d1}, [%0],%6 \n"
|
||||
"vsubl.u8 q0, d0, d1 \n"
|
||||
|
@ -2473,7 +2508,7 @@ void SobelYRow_NEON(const uint8* src_y0,
|
|||
uint8* dst_sobely,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0],%4 \n" // left
|
||||
"vld1.8 {d1}, [%1],%4 \n"
|
||||
"vsubl.u8 q0, d0, d1 \n"
|
||||
|
@ -2505,7 +2540,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
|
|||
asm volatile(
|
||||
"vdup.32 q0, %3 \n"
|
||||
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||
"subs %2, %2, #8 \n" // 8 pixels per loop
|
||||
"vmovl.u16 q2, d2 \n" // 8 int's
|
||||
|
@ -2530,7 +2565,7 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
|
|||
asm volatile(
|
||||
"vdup.32 q0, %3 \n"
|
||||
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||
"subs %2, %2, #8 \n" // 8 pixels per loop
|
||||
"vmovl.u16 q2, d2 \n" // 8 int's
|
||||
|
|
|
@ -273,7 +273,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
|
|||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB(
|
||||
"1: \n" READYUV422 YUVTORGB(
|
||||
v22, v21,
|
||||
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
|
||||
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
|
||||
|
@ -310,7 +310,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
|
|||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v23.8b, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB(
|
||||
"1: \n" READYUV422 YUVTORGB(
|
||||
v22, v21,
|
||||
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
|
||||
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
|
||||
|
@ -395,7 +395,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
|
|||
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"movi v23.8b, #255 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v20.8b}, [%0], #8 \n"
|
||||
"orr v21.8b, v20.8b, v20.8b \n"
|
||||
"orr v22.8b, v20.8b, v20.8b \n"
|
||||
|
@ -470,7 +470,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
|
|||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READNV12 YUVTORGB(
|
||||
"1: \n" READNV12 YUVTORGB(
|
||||
v22, v21,
|
||||
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
|
||||
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
|
||||
|
@ -544,7 +544,7 @@ void SplitUVRow_NEON(const uint8* src_uv,
|
|||
uint8* dst_v,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store U
|
||||
|
@ -565,7 +565,7 @@ void MergeUVRow_NEON(const uint8* src_u,
|
|||
uint8* dst_uv,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load U
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load V
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
|
@ -580,19 +580,67 @@ void MergeUVRow_NEON(const uint8* src_u,
|
|||
);
|
||||
}
|
||||
|
||||
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
|
||||
// Reads 16 packed RGB pixels per iteration and writes them to the planar
// outputs dst_r, dst_g and dst_b (AArch64 NEON inline assembly).
//
// src_rgb: packed 3-byte pixels; advanced by 48 bytes per iteration.
// dst_r / dst_g / dst_b: receive byte 0 / 1 / 2 of every pixel; each pointer
//   is advanced by 16 bytes per iteration.
// width: pixel count, decremented by 16 per iteration. The loop body runs
//   before the count is tested, so at least 16 pixels are always processed
//   and a width that is not a multiple of 16 is effectively rounded up.
//   NOTE(review): presumably callers only pass positive multiples of 16 --
//   confirm against the call sites.
|
||||
void SplitRGBRow_NEON(const uint8* src_rgb,
|
||||
uint8* dst_r,
|
||||
uint8* dst_g,
|
||||
uint8* dst_b,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Loop head: numeric local label, target of "b.gt 1b" below.
"1: \n"
|
||||
// ld3 de-interleaves the 48 bytes: v0 gets bytes 0,3,6,..., v1 gets bytes
// 1,4,7,..., v2 gets bytes 2,5,8,...; then src_rgb is post-incremented.
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
|
||||
// Sets the flags consumed by b.gt; the st1 stores in between leave them
// untouched.
"subs %w4, %w4, #16 \n" // 16 processed per loop
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store R
|
||||
"st1 {v1.16b}, [%2], #16 \n" // store G
|
||||
"st1 {v2.16b}, [%3], #16 \n" // store B
|
||||
// Repeat while the remaining width is still greater than zero.
"b.gt 1b \n"
|
||||
// All five operands are read-write ("+r"): the asm advances the four
// pointers and counts width down.
: "+r"(src_rgb), // %0
|
||||
"+r"(dst_r), // %1
|
||||
"+r"(dst_g), // %2
|
||||
"+r"(dst_b), // %3
|
||||
"+r"(width) // %4
|
||||
: // Input registers
|
||||
// "cc" for the flags written by subs, "memory" for the stores through the
// dst pointers, v0-v2 as the vector scratch registers.
: "cc", "memory", "v0", "v1", "v2" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
// (AArch64 NEON inline assembly).
//
// src_r / src_g / src_b: planar inputs; each pointer is advanced by 16 bytes
//   per iteration.
// dst_rgb: packed 3-byte output; advanced by 48 bytes per iteration, with the
//   src_r byte first in every pixel, then src_g, then src_b.
// width: pixel count, decremented by 16 per iteration. The loop body runs
//   before the count is tested, so at least 16 pixels are always written and
//   a width that is not a multiple of 16 is effectively rounded up.
//   NOTE(review): presumably callers only pass positive multiples of 16 --
//   confirm against the call sites.
|
||||
void MergeRGBRow_NEON(const uint8* src_r,
|
||||
const uint8* src_g,
|
||||
const uint8* src_b,
|
||||
uint8* dst_rgb,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Loop head: numeric local label, target of "b.gt 1b" below.
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load R
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load G
|
||||
"ld1 {v2.16b}, [%2], #16 \n" // load B
|
||||
// Sets the flags consumed by b.gt; the st3 store in between leaves them
// untouched.
"subs %w4, %w4, #16 \n" // 16 processed per loop
|
||||
// st3 interleaves v0/v1/v2 byte-by-byte into 48 output bytes, then
// post-increments dst_rgb.
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
|
||||
// Repeat while the remaining width is still greater than zero.
"b.gt 1b \n"
|
||||
// All five operands are read-write ("+r"): the asm advances the four
// pointers and counts width down.
: "+r"(src_r), // %0
|
||||
"+r"(src_g), // %1
|
||||
"+r"(src_b), // %2
|
||||
"+r"(dst_rgb), // %3
|
||||
"+r"(width) // %4
|
||||
: // Input registers
|
||||
// "cc" for the flags written by subs, "memory" for the store through
// dst_rgb, v0-v2 as the vector scratch registers.
: "cc", "memory", "v0", "v1", "v2" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
// Copy multiple of 32.
// Copies bytes from src to dst in 32-byte steps: count is decremented by 32
// per iteration and the loop repeats while it is still greater than zero, so
// at least 32 bytes are always copied.
// NOTE(review): this span carries two variants of the load, the store and the
// operand/clobber lists (ld1/st1 on v0-v3 with 8-byte lanes, and ldp/stp on
// q0/q1). It looks like the before and after of a change shown together --
// confirm which variant is current; only one of each pair can be active.
|
||||
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
// Variant A: four 8-byte registers, src post-incremented by 32.
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
|
||||
"1: \n"
|
||||
// Variant B: one pair of 16-byte registers, src post-incremented by 32.
"ldp q0, q1, [%0], #32 \n"
|
||||
// Sets the flags consumed by b.gt below.
"subs %w2, %w2, #32 \n" // 32 processed per loop
|
||||
// Variant A store, matching the ld1 above.
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
|
||||
// Variant B store, matching the ldp above.
"stp q0, q1, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
// Operand list for variant A (clobbers v0-v3).
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(count) // %2 // Output registers
|
||||
: // Input registers
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||
// Operand list for variant B (clobbers only v0 and v1).
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(count) // %2 // Output registers
|
||||
: // Input registers
|
||||
: "cc", "memory", "v0", "v1" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -600,7 +648,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
|
|||
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
|
||||
asm volatile(
|
||||
"dup v0.16b, %w2 \n" // duplicate 16 bytes
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"subs %w1, %w1, #16 \n" // 16 bytes per loop
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store
|
||||
"b.gt 1b \n"
|
||||
|
@ -613,7 +661,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
|
|||
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
|
||||
asm volatile(
|
||||
"dup v0.4s, %w2 \n" // duplicate 4 ints
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"subs %w1, %w1, #4 \n" // 4 ints per loop
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store
|
||||
"b.gt 1b \n"
|
||||
|
@ -628,7 +676,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
|||
// Start at end of source row.
|
||||
"add %0, %0, %w2, sxtw \n"
|
||||
"sub %0, %0, #16 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
|
||||
"rev64 v0.16b, v0.16b \n"
|
||||
|
@ -650,7 +698,7 @@ void MirrorUVRow_NEON(const uint8* src_uv,
|
|||
// Start at end of source row.
|
||||
"add %0, %0, %w3, sxtw #1 \n"
|
||||
"sub %0, %0, #16 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
|
||||
"subs %w3, %w3, #8 \n" // 8 pixels per loop.
|
||||
"rev64 v0.8b, v0.8b \n"
|
||||
|
@ -671,7 +719,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
|||
// Start at end of source row.
|
||||
"add %0, %0, %w2, sxtw #2 \n"
|
||||
"sub %0, %0, #16 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
|
||||
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
|
||||
"rev64 v0.4s, v0.4s \n"
|
||||
|
@ -688,11 +736,10 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
|||
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"movi v4.8b, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb24), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
|
@ -705,7 +752,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
|
|||
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"movi v5.8b, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v3.8b, v1.8b, v1.8b \n" // move g
|
||||
|
@ -722,7 +769,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
|
|||
|
||||
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v3.8b, v1.8b, v1.8b \n" // move g
|
||||
|
@ -753,12 +800,11 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
|
|||
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"movi v3.8b, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb565), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
|
@ -810,7 +856,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"movi v3.8b, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB1555TOARGB
|
||||
|
@ -841,7 +887,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
|
|||
uint8* dst_argb,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
|
@ -858,9 +904,8 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
|
|||
|
||||
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
|
||||
// RGB24.
|
||||
|
@ -875,7 +920,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
|
|||
|
||||
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v4.8b, v2.8b, v2.8b \n" // mov g
|
||||
|
@ -892,7 +937,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
|
|||
|
||||
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||
|
@ -907,7 +952,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
|
|||
|
||||
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||
|
@ -925,9 +970,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
|
|||
uint8* dst_v,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
|
||||
// pixels
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
|
||||
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
|
||||
|
@ -946,9 +990,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
|
|||
uint8* dst_v,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
|
||||
// pixels
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
|
||||
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
|
||||
|
@ -969,7 +1012,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2,
|
|||
int width) {
|
||||
const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
|
||||
|
@ -996,7 +1039,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy,
|
|||
int width) {
|
||||
const uint8* src_uyvyb = src_uyvy + stride_uyvy;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
|
||||
|
@ -1023,7 +1066,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"ld1 {v2.16b}, [%3] \n" // shuffler
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
|
||||
|
@ -1043,7 +1086,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
|
|||
uint8* dst_yuy2,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
|
||||
"orr v2.8b, v1.8b, v1.8b \n"
|
||||
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
|
||||
|
@ -1066,7 +1109,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
|
|||
uint8* dst_uyvy,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
|
||||
"orr v3.8b, v2.8b, v2.8b \n"
|
||||
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
|
||||
|
@ -1085,7 +1128,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
|
|||
|
||||
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTORGB565
|
||||
|
@ -1104,7 +1147,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
|
|||
int width) {
|
||||
asm volatile(
|
||||
"dup v1.4s, %w2 \n" // dither4
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v20.8b, v20.8b, v1.8b \n"
|
||||
|
@ -1123,7 +1166,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
|
|||
uint8* dst_argb1555,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTOARGB1555
|
||||
|
@ -1143,7 +1186,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
|
|||
asm volatile(
|
||||
"movi v4.16b, #0x0f \n" // bits to clear with
|
||||
// vbic.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTOARGB4444
|
||||
|
@ -1163,9 +1206,8 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
|
|||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
||||
|
@ -1183,7 +1225,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
|
|||
|
||||
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
|
||||
// pixels
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
|
@ -1202,9 +1244,8 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
|
|||
"movi v4.8b, #15 \n" // B * 0.11400 coefficient
|
||||
"movi v5.8b, #75 \n" // G * 0.58700 coefficient
|
||||
"movi v6.8b, #38 \n" // R * 0.29900 coefficient
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
||||
|
@ -1232,7 +1273,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
|
|||
"movi v27.8b, #18 \n" // VB -0.1406 coefficient
|
||||
"movi v28.8b, #94 \n" // VG -0.7344 coefficient
|
||||
"movi v29.16b,#0x80 \n" // 128.5
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
|
@ -1270,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
|
|||
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
|
||||
|
||||
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
|
||||
// clang-format off
|
||||
// RGBTOUV(QB, QG, QR): expands to an AArch64 NEON instruction sequence (as
// adjacent string literals) that turns three vectors of 16-bit B, G and R
// values into 8 U bytes in v0 and 8 V bytes in v1:
//   v3 = QB * v20 - QG * v21 - QR * v22 + v25   -> U
//   v4 = QR * v20 - QG * v24 - QB * v23 + v25   -> V
//   v0 = saturating narrow of (v3 >> 8), v1 = saturating narrow of (v4 >> 8)
// QB/QG/QR are stringized, so they must be register names such as v0.8h.
// Coefficients are read from v20-v24 and the bias from v25; these must be
// loaded before the macro is used (setup is outside this macro).
// Overwrites v0, v1, v3 and v4.
// NOTE(review): the six mul/mls steps appear twice below, once split across
// two lines and once on a single line. This looks like the before and after
// of a reformatting shown together -- confirm that only one set is active.
#define RGBTOUV(QB, QG, QR) \
|
||||
"mul v3.8h, " #QB \
|
||||
",v20.8h \n" /* B */ \
|
||||
"mul v4.8h, " #QR \
|
||||
",v20.8h \n" /* R */ \
|
||||
"mls v3.8h, " #QG \
|
||||
",v21.8h \n" /* G */ \
|
||||
"mls v4.8h, " #QG \
|
||||
",v24.8h \n" /* G */ \
|
||||
"mls v3.8h, " #QR \
|
||||
",v22.8h \n" /* R */ \
|
||||
"mls v4.8h, " #QB \
|
||||
",v23.8h \n" /* B */ \
|
||||
"mul v3.8h, " #QB ",v20.8h \n" /* B */ \
|
||||
"mul v4.8h, " #QR ",v20.8h \n" /* R */ \
|
||||
"mls v3.8h, " #QG ",v21.8h \n" /* G */ \
|
||||
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
|
||||
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
|
||||
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
|
||||
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
|
||||
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
|
||||
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
|
||||
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
|
||||
// clang-format on
|
||||
|
||||
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
||||
// TODO(fbarchard): consider ptrdiff_t for all strides.
|
||||
|
@ -1578,9 +1615,8 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
|
|||
"movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
|
||||
"movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
|
||||
"movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
|
||||
"movi v27.16b, #0x80 \n" // 128.5 (0x8080 in
|
||||
// 16-bit)
|
||||
"1: \n"
|
||||
"movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
||||
RGB565TOARGB
|
||||
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
||||
|
@ -1645,7 +1681,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
|
|||
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
|
||||
asm volatile(
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
||||
RGB555TOARGB
|
||||
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
||||
|
@ -1710,7 +1746,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
|
|||
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
|
||||
asm volatile(
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
||||
ARGB4444TOARGB
|
||||
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
||||
|
@ -1774,7 +1810,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
|
|||
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v26.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v27.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
|
@ -1799,7 +1835,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
|
|||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB1555TOARGB
|
||||
|
@ -1823,7 +1859,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
|
|||
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v26.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v27.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
|
@ -1847,7 +1883,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
|
|||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v1.8b, v4.8b \n" // R
|
||||
|
@ -1870,7 +1906,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
|
|||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // R
|
||||
|
@ -1893,7 +1929,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
|
|||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v1.8b, v4.8b \n" // B
|
||||
|
@ -1916,7 +1952,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
|
|||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||
|
@ -1939,7 +1975,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
|
|||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||
|
@ -1974,7 +2010,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
|||
"dup v5.16b, %w4 \n"
|
||||
"dup v4.16b, %w5 \n"
|
||||
// General purpose row blend.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
|
@ -1989,7 +2025,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
|||
"b 99f \n"
|
||||
|
||||
// Blend 50 / 50.
|
||||
"50: \n"
|
||||
"50: \n"
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
|
@ -1999,13 +2035,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
|||
"b 99f \n"
|
||||
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
"100: \n"
|
||||
"100: \n"
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"b.gt 100b \n"
|
||||
|
||||
"99: \n"
|
||||
"99: \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(src_ptr1), // %2
|
||||
|
@ -2025,7 +2061,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
|
|||
"subs %w3, %w3, #8 \n"
|
||||
"b.lt 89f \n"
|
||||
// Blend 8 pixels.
|
||||
"8: \n"
|
||||
"8: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
|
||||
// pixels
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
|
||||
|
@ -2048,12 +2084,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
|
|||
// pixels
|
||||
"b.ge 8b \n"
|
||||
|
||||
"89: \n"
|
||||
"89: \n"
|
||||
"adds %w3, %w3, #8-1 \n"
|
||||
"b.lt 99f \n"
|
||||
|
||||
// Blend 1 pixels.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
|
||||
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
|
||||
"subs %w3, %w3, #1 \n" // 1 processed per loop.
|
||||
|
@ -2073,7 +2109,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
|
|||
"st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
|
||||
"b.ge 1b \n"
|
||||
|
||||
"99: \n"
|
||||
"99: \n"
|
||||
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
|
@ -2088,9 +2124,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
|
|||
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
asm volatile(
|
||||
// Attenuate 8 pixels.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v3.8b \n" // b * a
|
||||
"umull v5.8h, v1.8b, v3.8b \n" // g * a
|
||||
|
@ -2122,9 +2157,8 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
|
|||
"dup v6.8h, %w4 \n" // interval add
|
||||
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of
|
||||
// ARGB.
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
|
||||
"subs %w1, %w1, #8 \n" // 8 processed per loop.
|
||||
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
|
||||
"uxtl v1.8h, v1.8b \n"
|
||||
|
@ -2142,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
|
|||
"uqxtn v1.8b, v1.8h \n"
|
||||
"uqxtn v2.8b, v2.8h \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(width) // %1
|
||||
|
@ -2165,9 +2198,8 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
|
|||
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
|
||||
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
|
||||
"uxtl v5.8h, v5.8b \n"
|
||||
|
@ -2182,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
|
|||
"uqxtn v6.8b, v6.8h \n"
|
||||
"uqxtn v7.8b, v7.8h \n"
|
||||
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
|
@ -2199,9 +2230,8 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
|||
"movi v24.8b, #15 \n" // B * 0.11400 coefficient
|
||||
"movi v25.8b, #75 \n" // G * 0.58700 coefficient
|
||||
"movi v26.8b, #38 \n" // R * 0.29900 coefficient
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v24.8b \n" // B
|
||||
"umlal v4.8h, v1.8b, v25.8b \n" // G
|
||||
|
@ -2234,7 +2264,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
|
|||
"movi v28.8b, #24 \n" // BB coefficient
|
||||
"movi v29.8b, #98 \n" // BG coefficient
|
||||
"movi v30.8b, #50 \n" // BR coefficient
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
|
||||
"subs %w1, %w1, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
|
||||
|
@ -2270,9 +2300,8 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
|
|||
"sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
|
||||
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
|
||||
|
||||
"1: \n"
|
||||
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
|
||||
// pixels.
|
||||
"1: \n"
|
||||
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
|
||||
"uxtl v17.8h, v17.8b \n" // g
|
||||
|
@ -2310,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
|
|||
"sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
|
||||
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
|
||||
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8
|
||||
// pixels.
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
|
@ -2329,11 +2357,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
|
|||
int width) {
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels.
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
|
||||
// pixels.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
|
||||
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
|
||||
|
@ -2344,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
|
|||
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
|
||||
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
|
@ -2362,20 +2386,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
|
|||
int width) {
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels.
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
|
||||
// pixels.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v0.8b, v0.8b, v4.8b \n"
|
||||
"uqadd v1.8b, v1.8b, v5.8b \n"
|
||||
"uqadd v2.8b, v2.8b, v6.8b \n"
|
||||
"uqadd v3.8b, v3.8b, v7.8b \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
|
@ -2391,20 +2411,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
|
|||
int width) {
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
// pixels.
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
|
||||
// pixels.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqsub v0.8b, v0.8b, v4.8b \n"
|
||||
"uqsub v1.8b, v1.8b, v5.8b \n"
|
||||
"uqsub v2.8b, v2.8b, v6.8b \n"
|
||||
"uqsub v3.8b, v3.8b, v7.8b \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
"+r"(dst_argb), // %2
|
||||
|
@ -2425,7 +2441,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
|
|||
asm volatile(
|
||||
"movi v3.8b, #255 \n" // alpha
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
|
||||
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
|
@ -2433,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx,
|
|||
"orr v1.8b, v0.8b, v0.8b \n"
|
||||
"orr v2.8b, v0.8b, v0.8b \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
"+r"(src_sobely), // %1
|
||||
|
@ -2450,7 +2465,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
|
|||
int width) {
|
||||
asm volatile(
|
||||
// 16 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
|
@ -2477,13 +2492,12 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
|
|||
asm volatile(
|
||||
"movi v3.8b, #255 \n" // alpha
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
|
||||
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v1.8b, v0.8b, v2.8b \n" // add
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
"+r"(src_sobely), // %1
|
||||
|
@ -2503,7 +2517,7 @@ void SobelXRow_NEON(const uint8* src_y0,
|
|||
uint8* dst_sobelx,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.8b}, [%0],%5 \n" // top
|
||||
"ld1 {v1.8b}, [%0],%6 \n"
|
||||
"usubl v0.8h, v0.8b, v1.8b \n"
|
||||
|
@ -2541,7 +2555,7 @@ void SobelYRow_NEON(const uint8* src_y0,
|
|||
uint8* dst_sobely,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.8b}, [%0],%4 \n" // left
|
||||
"ld1 {v1.8b}, [%1],%4 \n"
|
||||
"usubl v0.8h, v0.8b, v1.8b \n"
|
||||
|
@ -2572,7 +2586,7 @@ void SobelYRow_NEON(const uint8* src_y0,
|
|||
// Caveat - rounds float to half float whereas scaling version truncates.
|
||||
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
|
||||
"subs %w2, %w2, #8 \n" // 8 pixels per loop
|
||||
"uxtl v2.4s, v1.4h \n" // 8 int's
|
||||
|
@ -2592,7 +2606,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
|
|||
|
||||
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
|
||||
"subs %w2, %w2, #8 \n" // 8 pixels per loop
|
||||
"uxtl v2.4s, v1.4h \n" // 8 int's
|
||||
|
@ -2612,6 +2626,158 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
|
|||
: "cc", "memory", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
// Multiplies samples from src by scale, stores the products to dst, and
// returns the maximum of the unscaled source samples. Both max accumulators
// are seeded with 0. Each iteration consumes 8 floats and the loop body runs
// at least once, because the counter is only tested after the first pass.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 8 - confirm against the call sites.
float ScaleMaxSamples_NEON(const float* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width) {
|
||||
float fmax;
|
||||
asm volatile(
|
||||
"movi v5.4s, #0 \n" // max accumulator for samples 0..3
|
||||
"movi v6.4s, #0 \n" // max accumulator for samples 4..7
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"fmul v3.4s, v1.4s, %4.s[0] \n" // scale samples 0..3
|
||||
"fmul v4.4s, v2.4s, %4.s[0] \n" // scale samples 4..7
|
||||
"fmax v5.4s, v5.4s, v1.4s \n" // running max of unscaled input
|
||||
"fmax v6.4s, v6.4s, v2.4s \n"
|
||||
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 scaled samples
|
||||
"b.gt 1b \n" // loop on flags set by subs
|
||||
"fmax v5.4s, v5.4s, v6.4s \n" // merge the two accumulators
|
||||
"fmaxv %s3, v5.4s \n" // reduce the 4 lanes to one max
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width), // %2
|
||||
"=w"(fmax) // %3
|
||||
: "w"(scale) // %4
|
||||
: "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
|
||||
return fmax;
|
||||
}
|
||||
|
||||
// Multiplies samples from src by scale, stores the products to dst, and
// returns the sum of the squares of the unscaled source samples. Each
// iteration consumes 8 floats and the loop body runs at least once, because
// the counter is only tested after the first pass.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 8 - confirm against the call sites.
float ScaleSumSamples_NEON(const float* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width) {
|
||||
float fsum;
|
||||
asm volatile(
|
||||
"movi v5.4s, #0 \n" // sum of squares accumulator, samples 0..3
|
||||
"movi v6.4s, #0 \n" // sum of squares accumulator, samples 4..7
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"fmul v3.4s, v1.4s, %4.s[0] \n" // scale
|
||||
"fmul v4.4s, v2.4s, %4.s[0] \n"
|
||||
"fmla v5.4s, v1.4s, v1.4s \n" // accumulate squares of unscaled input
|
||||
"fmla v6.4s, v2.4s, v2.4s \n"
|
||||
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 scaled samples
|
||||
"b.gt 1b \n" // loop on flags set by subs
|
||||
"faddp v5.4s, v5.4s, v6.4s \n" // 8 partial sums -> 4
|
||||
"faddp v5.4s, v5.4s, v5.4s \n" // 4 partial sums -> 2
|
||||
"faddp %3.4s, v5.4s, v5.4s \n" // total in lane 0, read back as fsum
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width), // %2
|
||||
"=w"(fsum) // %3
|
||||
: "w"(scale) // %4
|
||||
: "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
|
||||
return fsum;
|
||||
}
|
||||
|
||||
// Multiplies samples from src by scale and stores the products to dst.
// Each iteration consumes 8 floats and the loop body runs at least once,
// because the counter is only tested after the first pass.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 8 - confirm against the call sites.
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale samples 0..3 in place
|
||||
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale samples 4..7 in place
|
||||
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 scaled samples
|
||||
"b.gt 1b \n" // loop on flags set by subs
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "w"(scale) // %3
|
||||
: "cc", "memory", "v1", "v2");
|
||||
}
|
||||
|
||||
// Vertical pass: filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce
// 1 row. src0 and src4 are weighted 1, src1 and src3 are weighted 4, and src2
// is weighted 6. The 16-bit inputs are widened and the unnormalized 32-bit
// sums are written to dst; no shift or divide is applied here. Each iteration
// produces 8 outputs and the loop body runs at least once.
void GaussCol_NEON(const uint16* src0,
|
||||
const uint16* src1,
|
||||
const uint16* src2,
|
||||
const uint16* src3,
|
||||
const uint16* src4,
|
||||
uint32* dst,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"movi v6.8h, #4 \n" // constant 4
|
||||
"movi v7.8h, #6 \n" // constant 6
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples of row 0
|
||||
"ld1 {v2.8h}, [%4], #16 \n" // load 8 samples of row 4
|
||||
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1, low 4 widened to 32 bit
|
||||
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1, high 4 widened to 32 bit
|
||||
"ld1 {v2.8h}, [%1], #16 \n" // load 8 samples of row 1
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"ld1 {v2.8h}, [%2], #16 \n" // load 8 samples of row 2
|
||||
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
|
||||
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
|
||||
"ld1 {v2.8h}, [%3], #16 \n" // load 8 samples of row 3
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 32-bit sums
|
||||
"b.gt 1b \n" // loop on flags set by subs
|
||||
: "+r"(src0), // %0
|
||||
"+r"(src1), // %1
|
||||
"+r"(src2), // %2
|
||||
"+r"(src3), // %3
|
||||
"+r"(src4), // %4
|
||||
"+r"(dst), // %5
|
||||
"+r"(width) // %6
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v6", "v7");
|
||||
}
|
||||
|
||||
// Horizontal pass: filter 5 adjacent samples of one row with 1, 4, 6, 4, 1
// coefficients. Output i is (src[i] + 4 * src[i + 1] + 6 * src[i + 2] +
// 4 * src[i + 3] + src[i + 4]), rounded, shifted right by 8 and saturated to
// 16 bits. Each iteration produces 8 outputs but loads 12 samples from src,
// so the final iteration reads 4 samples beyond the last 8 it consumes.
// NOTE(review): the >> 8 presumably normalizes the combined weight of this
// pass and GaussCol (16 * 16) - confirm against the C reference.
void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
|
||||
const uint32* src1 = src + 1;
|
||||
const uint32* src2 = src + 2;
|
||||
const uint32* src3 = src + 3;
|
||||
asm volatile(
|
||||
"movi v6.4s, #4 \n" // constant 4
|
||||
"movi v7.4s, #6 \n" // constant 6
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 samples, advance by 8 (%6 = 32 bytes)
|
||||
"add v0.4s, v0.4s, v1.4s \n" // * 1: src[i] + src[i + 4], outputs 0..3
|
||||
"add v1.4s, v1.4s, v2.4s \n" // * 1: src[i] + src[i + 4], outputs 4..7
|
||||
"ld1 {v2.4s,v3.4s}, [%2], #32 \n" // load 8 samples at offset +2
|
||||
"mla v0.4s, v2.4s, v7.4s \n" // * 6
|
||||
"mla v1.4s, v3.4s, v7.4s \n" // * 6
|
||||
"ld1 {v2.4s,v3.4s}, [%1], #32 \n" // load 8 samples at offset +1
|
||||
"ld1 {v4.4s,v5.4s}, [%3], #32 \n" // load 8 samples at offset +3
|
||||
"add v2.4s, v2.4s, v4.4s \n" // sum the +1 and +3 taps, both weighted 4
|
||||
"add v3.4s, v3.4s, v5.4s \n"
|
||||
"mla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"mla v1.4s, v3.4s, v6.4s \n" // * 4
|
||||
"subs %w5, %w5, #8 \n" // 8 processed per loop
|
||||
"uqrshrn v0.4h, v0.4s, #8 \n" // round, >> 8, saturate to 16 bit
|
||||
"uqrshrn2 v0.8h, v1.4s, #8 \n" // same for outputs 4..7, upper half
|
||||
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
|
||||
"b.gt 1b \n" // loop on flags set by subs
|
||||
: "+r"(src), // %0
|
||||
"+r"(src1), // %1
|
||||
"+r"(src2), // %2
|
||||
"+r"(src3), // %3
|
||||
"+r"(dst), // %4
|
||||
"+r"(width) // %5
|
||||
: "r"(32LL) // %6
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -1410,9 +1410,9 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
|
|||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
|
@ -1426,7 +1426,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
|
|||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
// step 3 - store 8 U and 8 V values
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
|
@ -1482,9 +1482,9 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
|
|||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
|
@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
|
|||
psraw xmm1, 8
|
||||
packsswb xmm0, xmm1
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
// step 3 - store 8 U and 8 V values
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
|
@ -1549,9 +1549,9 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
|
|||
vshufps ymm2, ymm2, ymm3, 0xdd
|
||||
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 32 different pixels, its 16 pixels of U and 16 of V
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 32 different pixels, its 16 pixels of U and 16 of V
|
||||
vpmaddubsw ymm1, ymm0, ymm7 // U
|
||||
vpmaddubsw ymm3, ymm2, ymm7
|
||||
vpmaddubsw ymm0, ymm0, ymm6 // V
|
||||
|
@ -1565,7 +1565,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
|
|||
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
|
||||
vpaddb ymm0, ymm0, ymm5 // -> unsigned
|
||||
|
||||
// step 3 - store 16 U and 16 V values
|
||||
// step 3 - store 16 U and 16 V values
|
||||
vextractf128 [edx], ymm0, 0 // U
|
||||
vextractf128 [edx + edi], ymm0, 1 // V
|
||||
lea edx, [edx + 16]
|
||||
|
@ -1617,9 +1617,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
|
|||
vshufps ymm2, ymm2, ymm3, 0xdd
|
||||
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 32 different pixels, its 16 pixels of U and 16 of V
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 32 different pixels, its 16 pixels of U and 16 of V
|
||||
vpmaddubsw ymm1, ymm0, ymm7 // U
|
||||
vpmaddubsw ymm3, ymm2, ymm7
|
||||
vpmaddubsw ymm0, ymm0, ymm6 // V
|
||||
|
@ -1634,7 +1634,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
|
|||
vpermq ymm0, ymm0, 0xd8 // For vpacksswb
|
||||
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
|
||||
|
||||
// step 3 - store 16 U and 16 V values
|
||||
// step 3 - store 16 U and 16 V values
|
||||
vextractf128 [edx], ymm0, 0 // U
|
||||
vextractf128 [edx + edi], ymm0, 1 // V
|
||||
lea edx, [edx + 16]
|
||||
|
@ -1750,9 +1750,9 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
|
|||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
|
@ -1766,7 +1766,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
|
|||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
// step 3 - store 8 U and 8 V values
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
|
@ -1822,9 +1822,9 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
|
|||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
|
@ -1838,7 +1838,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
|
|||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
// step 3 - store 8 U and 8 V values
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
|
@ -1894,9 +1894,9 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
|
|||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
|
@ -1910,7 +1910,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
|
|||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
// step 3 - store 8 U and 8 V values
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
|
@ -2927,7 +2927,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
|
|||
psrlw xmm0, 6
|
||||
packuswb xmm0, xmm0 // G
|
||||
|
||||
// Step 2: Weave into ARGB
|
||||
// Step 2: Weave into ARGB
|
||||
punpcklbw xmm0, xmm0 // GG
|
||||
movdqa xmm1, xmm0
|
||||
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
|
||||
|
@ -2975,8 +2975,8 @@ __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
|
|||
vpsrlw ymm0, ymm0, 6
|
||||
vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
|
||||
|
||||
// TODO(fbarchard): Weave alpha with unpack.
|
||||
// Step 2: Weave into ARGB
|
||||
// TODO(fbarchard): Weave alpha with unpack.
|
||||
// Step 2: Weave into ARGB
|
||||
vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
|
||||
vpermq ymm1, ymm1, 0xd8
|
||||
vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
|
||||
|
@ -4067,7 +4067,7 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
|
|||
sub edx, esi
|
||||
sub edi, esi
|
||||
|
||||
// 8 pixel loop.
|
||||
// 8 pixel loop.
|
||||
convertloop8:
|
||||
movq xmm0, qword ptr [esi] // alpha
|
||||
punpcklbw xmm0, xmm0
|
||||
|
@ -4123,7 +4123,7 @@ __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
|
|||
sub edx, esi
|
||||
sub edi, esi
|
||||
|
||||
// 32 pixel loop.
|
||||
// 32 pixel loop.
|
||||
convertloop32:
|
||||
vmovdqu ymm0, [esi] // alpha
|
||||
vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
|
||||
|
@ -4183,7 +4183,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
|
|||
sub ecx, 4
|
||||
jl convertloop4b // less than 4 pixels?
|
||||
|
||||
// 4 pixel loop.
|
||||
// 4 pixel loop.
|
||||
convertloop4:
|
||||
movdqu xmm3, [eax] // src argb
|
||||
lea eax, [eax + 16]
|
||||
|
@ -4212,7 +4212,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
|
|||
add ecx, 4 - 1
|
||||
jl convertloop1b
|
||||
|
||||
// 1 pixel loop.
|
||||
// 1 pixel loop.
|
||||
convertloop1:
|
||||
movd xmm3, [eax] // src argb
|
||||
lea eax, [eax + 4]
|
||||
|
@ -5256,7 +5256,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
|
|||
cvtps2dq xmm5, xmm5 // 0.16 fixed point
|
||||
packssdw xmm5, xmm5 // 16 bit shorts
|
||||
|
||||
// 4 pixel loop small blocks.
|
||||
// 4 pixel loop small blocks.
|
||||
s4:
|
||||
// top left
|
||||
movdqu xmm0, [eax]
|
||||
|
@ -5298,7 +5298,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
|
|||
|
||||
jmp l4b
|
||||
|
||||
// 4 pixel loop
|
||||
// 4 pixel loop
|
||||
l4:
|
||||
// top left
|
||||
movdqu xmm0, [eax]
|
||||
|
@ -5350,7 +5350,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
|
|||
add ecx, 4 - 1
|
||||
jl l1b
|
||||
|
||||
// 1 pixel loop
|
||||
// 1 pixel loop
|
||||
l1:
|
||||
movdqu xmm0, [eax]
|
||||
psubd xmm0, [eax + edx * 4]
|
||||
|
@ -5392,7 +5392,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
|
|||
test edx, 15
|
||||
jne l4b
|
||||
|
||||
// 4 pixel loop
|
||||
// 4 pixel loop
|
||||
l4:
|
||||
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
|
||||
lea eax, [eax + 16]
|
||||
|
@ -5438,7 +5438,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
|
|||
add ecx, 4 - 1
|
||||
jl l1b
|
||||
|
||||
// 1 pixel loop
|
||||
// 1 pixel loop
|
||||
l1:
|
||||
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
|
||||
lea eax, [eax + 4]
|
||||
|
@ -5481,7 +5481,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
|
|||
sub ecx, 4
|
||||
jl l4b
|
||||
|
||||
// setup for 4 pixel loop
|
||||
// setup for 4 pixel loop
|
||||
pshufd xmm7, xmm7, 0x44 // dup dudv
|
||||
pshufd xmm5, xmm5, 0 // dup 4, stride
|
||||
movdqa xmm0, xmm2 // x0, y0, x1, y1
|
||||
|
@ -5493,7 +5493,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
|
|||
addps xmm3, xmm4
|
||||
addps xmm4, xmm4 // dudv *= 4
|
||||
|
||||
// 4 pixel loop
|
||||
// 4 pixel loop
|
||||
l4:
|
||||
cvttps2dq xmm0, xmm2 // x, y float to int first 2
|
||||
cvttps2dq xmm1, xmm3 // x, y float to int next 2
|
||||
|
@ -5524,7 +5524,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
|
|||
add ecx, 4 - 1
|
||||
jl l1b
|
||||
|
||||
// 1 pixel loop
|
||||
// 1 pixel loop
|
||||
l1:
|
||||
cvttps2dq xmm0, xmm2 // x, y float to int
|
||||
packssdw xmm0, xmm0 // x, y as shorts
|
||||
|
@ -5598,7 +5598,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
|
|||
jg xloop
|
||||
jmp xloop99
|
||||
|
||||
// Blend 50 / 50.
|
||||
// Blend 50 / 50.
|
||||
xloop50:
|
||||
vmovdqu ymm0, [esi]
|
||||
vpavgb ymm0, ymm0, [esi + edx]
|
||||
|
@ -5608,7 +5608,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
|
|||
jg xloop50
|
||||
jmp xloop99
|
||||
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
xloop100:
|
||||
rep movsb
|
||||
|
||||
|
@ -5638,7 +5638,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
|
|||
mov ecx, [esp + 8 + 16] // dst_width
|
||||
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
||||
sub edi, esi
|
||||
// Dispatch to specialized filters if applicable.
|
||||
// Dispatch to specialized filters if applicable.
|
||||
cmp eax, 0
|
||||
je xloop100 // 0 /256. Blend 100 / 0.
|
||||
cmp eax, 128
|
||||
|
@ -5678,7 +5678,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
|
|||
jg xloop
|
||||
jmp xloop99
|
||||
|
||||
// Blend 50 / 50.
|
||||
// Blend 50 / 50.
|
||||
xloop50:
|
||||
movdqu xmm0, [esi]
|
||||
movdqu xmm1, [esi + edx]
|
||||
|
@ -5689,7 +5689,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
|
|||
jg xloop50
|
||||
jmp xloop99
|
||||
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
xloop100:
|
||||
movdqu xmm0, [esi]
|
||||
movdqu [esi + edi], xmm0
|
||||
|
@ -5784,7 +5784,7 @@ __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
|
|||
cmp ebx, 0x02010003
|
||||
je shuf_2103
|
||||
|
||||
// TODO(fbarchard): Use one source pointer and 3 offsets.
|
||||
// TODO(fbarchard): Use one source pointer and 3 offsets.
|
||||
shuf_any1:
|
||||
movzx ebx, byte ptr [esi]
|
||||
movzx ebx, byte ptr [eax + ebx]
|
||||
|
@ -5971,7 +5971,7 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
|||
mov ecx, [esp + 4 + 16] /* width */
|
||||
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
|
||||
|
||||
// 2 pixel loop.
|
||||
// 2 pixel loop.
|
||||
convertloop:
|
||||
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
|
||||
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
|
||||
|
@ -6072,7 +6072,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
|
|||
pxor xmm5, xmm5
|
||||
sub edx, eax
|
||||
|
||||
// 8 pixel loop.
|
||||
// 8 pixel loop.
|
||||
convertloop:
|
||||
movdqu xmm2, xmmword ptr [eax] // 8 shorts
|
||||
add eax, 16
|
||||
|
@ -6110,7 +6110,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
|
|||
vpxor ymm5, ymm5, ymm5
|
||||
sub edx, eax
|
||||
|
||||
// 16 pixel loop.
|
||||
// 16 pixel loop.
|
||||
convertloop:
|
||||
vmovdqu ymm2, [eax] // 16 shorts
|
||||
add eax, 32
|
||||
|
@ -6144,7 +6144,7 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
|
|||
mov ecx, [esp + 16] /* width */
|
||||
sub edx, eax
|
||||
|
||||
// 16 pixel loop.
|
||||
// 16 pixel loop.
|
||||
convertloop:
|
||||
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
|
||||
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
|
||||
|
@ -6252,7 +6252,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
|
|||
psllw xmm4, 8
|
||||
pxor xmm5, xmm5
|
||||
|
||||
// 4 pixel loop.
|
||||
// 4 pixel loop.
|
||||
convertloop:
|
||||
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
|
||||
pmaddubsw xmm0, xmm3
|
||||
|
|
|
@ -371,6 +371,26 @@ static void ScalePlaneDown34(int src_width,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN34_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_Any_MSA;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_Any_MSA;
|
||||
} else {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA;
|
||||
}
|
||||
if (dst_width % 48 == 0) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_MSA;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_MSA;
|
||||
} else {
|
||||
ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA;
|
||||
ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN34_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
if (!filtering) {
|
||||
|
@ -802,11 +822,12 @@ static void ScaleAddCols2_16_C(int dst_width,
|
|||
static void ScaleAddCols0_C(int dst_width,
|
||||
int boxheight,
|
||||
int x,
|
||||
int,
|
||||
int dx,
|
||||
const uint16* src_ptr,
|
||||
uint8* dst_ptr) {
|
||||
int scaleval = 65536 / boxheight;
|
||||
int i;
|
||||
(void)dx;
|
||||
src_ptr += (x >> 16);
|
||||
for (i = 0; i < dst_width; ++i) {
|
||||
*dst_ptr++ = src_ptr[i] * scaleval >> 16;
|
||||
|
@ -1078,6 +1099,14 @@ void ScalePlaneBilinearDown(int src_width,
|
|||
ScaleFilterCols = ScaleFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEFILTERCOLS_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
|
||||
ScaleFilterCols = ScaleFilterCols_Any_MSA;
|
||||
if (IS_ALIGNED(dst_width, 16)) {
|
||||
ScaleFilterCols = ScaleFilterCols_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
|
@ -1276,6 +1305,14 @@ void ScalePlaneBilinearUp(int src_width,
|
|||
ScaleFilterCols = ScaleFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEFILTERCOLS_MSA)
|
||||
if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
|
||||
ScaleFilterCols = ScaleFilterCols_Any_MSA;
|
||||
if (IS_ALIGNED(dst_width, 16)) {
|
||||
ScaleFilterCols = ScaleFilterCols_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleFilterCols = ScaleColsUp2_C;
|
||||
|
@ -1663,7 +1700,7 @@ void ScalePlane_16(const uint16* src,
|
|||
CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
|
||||
return;
|
||||
}
|
||||
if (dst_width == src_width) {
|
||||
if (dst_width == src_width && filtering != kFilterBox) {
|
||||
int dy = FixedDiv(src_height, dst_height);
|
||||
// Arbitrary scale vertically, but unscaled horizontally.
|
||||
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
|
||||
|
@ -1692,7 +1729,7 @@ void ScalePlane_16(const uint16* src,
|
|||
return;
|
||||
}
|
||||
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
|
||||
filtering != kFilterBilinear) {
|
||||
(filtering == kFilterBox || filtering == kFilterNone)) {
|
||||
// optimized, 1/4
|
||||
ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
|
||||
src_stride, dst_stride, src, dst, filtering);
|
||||
|
|
|
@ -33,9 +33,15 @@ extern "C" {
|
|||
#ifdef HAS_SCALEFILTERCOLS_NEON
|
||||
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEFILTERCOLS_MSA
|
||||
CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBCOLS_NEON
|
||||
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBCOLS_MSA
|
||||
CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
|
||||
CANY(ScaleARGBFilterCols_Any_NEON,
|
||||
ScaleARGBFilterCols_NEON,
|
||||
|
@ -43,6 +49,13 @@ CANY(ScaleARGBFilterCols_Any_NEON,
|
|||
4,
|
||||
3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBFILTERCOLS_MSA
|
||||
CANY(ScaleARGBFilterCols_Any_MSA,
|
||||
ScaleARGBFilterCols_MSA,
|
||||
ScaleARGBFilterCols_C,
|
||||
4,
|
||||
7)
|
||||
#endif
|
||||
#undef CANY
|
||||
|
||||
// Fixed scale down.
|
||||
|
@ -228,6 +241,26 @@ SDANY(ScaleRowDown34_1_Box_Any_NEON,
|
|||
1,
|
||||
23)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_MSA
|
||||
SDANY(ScaleRowDown34_Any_MSA,
|
||||
ScaleRowDown34_MSA,
|
||||
ScaleRowDown34_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_MSA,
|
||||
ScaleRowDown34_0_Box_MSA,
|
||||
ScaleRowDown34_0_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_MSA,
|
||||
ScaleRowDown34_1_Box_MSA,
|
||||
ScaleRowDown34_1_Box_C,
|
||||
4 / 3,
|
||||
1,
|
||||
47)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_SSSE3
|
||||
SDANY(ScaleRowDown38_Any_SSSE3,
|
||||
ScaleRowDown38_SSSE3,
|
||||
|
|
|
@ -335,6 +335,14 @@ static void ScaleARGBBilinearDown(int src_width,
|
|||
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
|
||||
// Allocate a row of ARGB.
|
||||
|
@ -442,6 +450,14 @@ static void ScaleARGBBilinearUp(int src_width,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
|
||||
if (filtering && TestCpuFlag(kCpuHasMSA)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_SSE2)
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
|
@ -454,6 +470,14 @@ static void ScaleARGBBilinearUp(int src_width,
|
|||
ScaleARGBFilterCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_MSA)
|
||||
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
|
||||
|
@ -643,6 +667,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
|
||||
if (filtering && TestCpuFlag(kCpuHasMSA)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_SSE2)
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
|
@ -655,6 +687,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
|
|||
ScaleARGBFilterCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_MSA)
|
||||
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
|
||||
|
@ -778,6 +818,14 @@ static void ScaleARGBSimple(int src_width,
|
|||
ScaleARGBCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
ScaleARGBCols = ScaleARGBCols_Any_MSA;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBCols = ScaleARGBCols_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBCols = ScaleARGBColsUp2_C;
|
||||
|
|
|
@ -1306,6 +1306,35 @@ void ScaleSlope(int src_width,
|
|||
}
|
||||
#undef CENTERSTART
|
||||
|
||||
// Read 8x2 upsample with filtering and write 16x1.
|
||||
// actually reads an extra pixel, so 9x2.
|
||||
void ScaleRowUp2_16_C(const uint16* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16* dst,
|
||||
int dst_width) {
|
||||
const uint16* src2 = src_ptr + src_stride;
|
||||
|
||||
int x;
|
||||
for (x = 0; x < dst_width - 1; x += 2) {
|
||||
uint16 p0 = src_ptr[0];
|
||||
uint16 p1 = src_ptr[1];
|
||||
uint16 p2 = src2[0];
|
||||
uint16 p3 = src2[1];
|
||||
dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
|
||||
dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
|
||||
++src_ptr;
|
||||
++src2;
|
||||
dst += 2;
|
||||
}
|
||||
if (dst_width & 1) {
|
||||
uint16 p0 = src_ptr[0];
|
||||
uint16 p1 = src_ptr[1];
|
||||
uint16 p2 = src2[0];
|
||||
uint16 p3 = src2[1];
|
||||
dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
|
|
@ -21,6 +21,14 @@ namespace libyuv {
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Gather helper: copies four elements of srcp, selected by the first four
// entries of indx0, into the first four entries of out0.
// All three arguments are parenthesized so that expressions such as
// `base + 1` index correctly, and the body is wrapped in do { } while (0) so
// that `LOAD_INDEXED_DATA(...);` behaves as a single statement (safe inside
// an unbraced if/else). Each argument is evaluated several times, so do not
// pass expressions with side effects.
#define LOAD_INDEXED_DATA(srcp, indx0, out0) \
  do {                                       \
    (out0)[0] = (srcp)[(indx0)[0]];          \
    (out0)[1] = (srcp)[(indx0)[1]];          \
    (out0)[2] = (srcp)[(indx0)[2]];          \
    (out0)[3] = (srcp)[(indx0)[3]];          \
  } while (0)
|
||||
|
||||
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_argb,
|
||||
|
@ -545,6 +553,394 @@ void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
|
|||
}
|
||||
}
|
||||
|
||||
// Horizontally scales one row of 8-bit samples with linear filtering, using
// MSA vectors. x is the starting source position and dx the step per output
// sample: (position >> 16) is used as the source index and the low 16 bits
// as the blend fraction, which is reduced to 7 bits (>> 9).
// Each pass of the outer loop produces 16 output bytes, as four groups of
// four lanes, and stores all 16 unconditionally.
void ScaleFilterCols_MSA(uint8* dst_ptr,
                         const uint8* src_ptr,
                         int dst_width,
                         int x,
                         int dx) {
  const v4i32 lane_ids = {0, 1, 2, 3};
  const v4i32 frac_mask = __msa_fill_w(0xFFFF);
  const v4i32 round_bias = __msa_fill_w(0x40);
  const v4i32 step1 = __msa_fill_w(dx);
  const v4i32 step4 = step1 * 4;  // advance of one 4-lane group
  v4i32 pos = __msa_fill_w(x);
  v4i32 index[4];
  v4i32 frac[4];
  v4i32 left[4];
  v4i32 right[4];
  v8u16 half0, half1;
  v16u8 packed;
  int done;
  int g;

  // Lane k starts at x + k * dx.
  pos += step1 * lane_ids;

  for (done = 0; done < dst_width - 1; done += 16) {
    // Split the positions of the four groups into index and 7-bit fraction.
    for (g = 0; g < 4; ++g) {
      index[g] = pos >> 16;
      frac[g] = (pos & frac_mask) >> 9;
      pos += step4;
    }

    // Gather the left sample and its right-hand neighbour for every lane.
    for (g = 0; g < 4; ++g) {
      LOAD_INDEXED_DATA(src_ptr, index[g], left[g]);
      index[g] += 1;
      LOAD_INDEXED_DATA(src_ptr, index[g], right[g]);
    }

    // left + (((right - left) * frac + 0x40) >> 7)
    for (g = 0; g < 4; ++g) {
      right[g] -= left[g];
      right[g] *= frac[g];
      right[g] += round_bias;
      right[g] >>= 7;
      left[g] += right[g];
    }

    // Narrow 4 x 4 words -> 16 bytes (even halfwords, then even bytes).
    half0 = (v8u16)__msa_pckev_h((v8i16)left[1], (v8i16)left[0]);
    half1 = (v8u16)__msa_pckev_h((v8i16)left[3], (v8i16)left[2]);
    packed = (v16u8)__msa_pckev_b((v16i8)half1, (v16i8)half0);
    __msa_st_b(packed, dst_ptr, 0);
    dst_ptr += 16;
  }
}
|
||||
|
||||
// Point-samples one row of 32-bit pixels using MSA vectors. The source and
// destination byte pointers are reinterpreted as uint32 pixels. x is the
// starting source position and dx the step per output pixel; the source
// pixel index is (position >> 16). Four pixels are gathered and stored per
// loop pass.
void ScaleARGBCols_MSA(uint8* dst_argb,
                       const uint8* src_argb,
                       int dst_width,
                       int x,
                       int dx) {
  const uint32* src_pixels = (const uint32*)(src_argb);
  uint32* dst_pixels = (uint32*)(dst_argb);
  const v4i32 lane_ids = {0, 1, 2, 3};
  const v4i32 step1 = __msa_fill_w(dx);
  const v4i32 step4 = step1 * 4;  // advance of one 4-pixel group
  v4i32 pos = __msa_fill_w(x);
  v4i32 index;
  v4i32 pixels;
  int done = 0;

  // Lane k starts at x + k * dx.
  pos += step1 * lane_ids;

  while (done < dst_width) {
    index = pos >> 16;
    pos += step4;
    LOAD_INDEXED_DATA(src_pixels, index, pixels);
    __msa_st_w(pixels, dst_pixels, 0);
    dst_pixels += 4;
    done += 4;
  }
}
|
||||
|
||||
// Horizontally scales one row of 32-bit pixels with linear filtering, using
// MSA vectors. x is the starting source position and dx the step per output
// pixel; (position >> 16) is the source pixel index and bits 9..15 of the
// position form a 7-bit blend fraction f. Each channel byte of a pixel and
// of its right-hand neighbour is combined with the weight pair (f ^ 0x7f, f)
// through an interleave + unsigned dot product, then shifted right by 7.
// Eight pixels (32 bytes) are produced and stored per outer loop pass.
void ScaleARGBFilterCols_MSA(uint8* dst_argb,
                             const uint8* src_argb,
                             int dst_width,
                             int x,
                             int dx) {
  const uint32* src_pixels = (const uint32*)(src_argb);
  const v4u32 lane_ids = {0, 1, 2, 3};
  const v16u8 frac_mask = (v16u8)__msa_fill_b(0x7f);
  const v4u32 step1 = (v4u32)__msa_fill_w(dx);
  const v4u32 step4 = step1 * 4;  // advance of one 4-pixel group
  v4u32 pos = (v4u32)__msa_fill_w(x);
  v4u32 index;
  v4u32 left, right;
  v16u8 frac, inv_frac;
  v16u8 pix_lo, pix_hi;
  v16u8 wgt_lo, wgt_hi;
  v8u16 sum_lo, sum_hi;
  v16u8 packed[2];
  int done;
  int g;

  // Lane k starts at x + k * dx.
  pos += step1 * lane_ids;

  for (done = 0; done < dst_width - 1; done += 8) {
    // Two groups of four pixels; results are held until both are computed
    // so that every load of this pass happens before its stores.
    for (g = 0; g < 2; ++g) {
      index = pos >> 16;
      frac = (v16u8)(pos >> 9);
      pos += step4;

      // Keep 7 fraction bits and replicate byte 0 of each word to all four
      // bytes of that word, one copy per channel.
      frac = frac & frac_mask;
      frac = (v16u8)__msa_shf_b((v16i8)frac, 0);
      inv_frac = frac ^ frac_mask;
      wgt_lo = (v16u8)__msa_ilvr_b((v16i8)frac, (v16i8)inv_frac);
      wgt_hi = (v16u8)__msa_ilvl_b((v16i8)frac, (v16i8)inv_frac);

      // Gather each pixel and its right-hand neighbour.
      LOAD_INDEXED_DATA(src_pixels, index, left);
      index += 1;
      LOAD_INDEXED_DATA(src_pixels, index, right);
      pix_lo = (v16u8)__msa_ilvr_b((v16i8)right, (v16i8)left);
      pix_hi = (v16u8)__msa_ilvl_b((v16i8)right, (v16i8)left);

      sum_lo = __msa_dotp_u_h(pix_lo, wgt_lo);
      sum_hi = __msa_dotp_u_h(pix_hi, wgt_hi);
      sum_lo >>= 7;
      sum_hi >>= 7;
      packed[g] = (v16u8)__msa_pckev_b((v16i8)sum_hi, (v16i8)sum_lo);
    }
    __msa_st_b(packed[0], dst_argb, 0);
    __msa_st_b(packed[1], dst_argb, 16);
    dst_argb += 32;
  }
}
|
||||
|
||||
void ScaleRowDown34_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst,
|
||||
int dst_width) {
|
||||
int x;
|
||||
(void)src_stride;
|
||||
v16u8 src0, src1, src2, src3;
|
||||
v16u8 vec0, vec1, vec2;
|
||||
v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20};
|
||||
v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25};
|
||||
v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20,
|
||||
21, 23, 24, 25, 27, 28, 29, 31};
|
||||
|
||||
assert((dst_width % 3 == 0) && (dst_width > 0));
|
||||
|
||||
for (x = 0; x < dst_width; x += 48) {
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
|
||||
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
|
||||
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
|
||||
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1);
|
||||
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2);
|
||||
__msa_st_b((v16i8)vec0, dst, 0);
|
||||
__msa_st_b((v16i8)vec1, dst, 16);
|
||||
__msa_st_b((v16i8)vec2, dst, 32);
|
||||
src_ptr += 64;
|
||||
dst += 48;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown34_0_Box_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* d,
|
||||
int dst_width) {
|
||||
const uint8* s = src_ptr;
|
||||
const uint8* t = src_ptr + src_stride;
|
||||
int x;
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
|
||||
v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
|
||||
v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
|
||||
v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
|
||||
v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
|
||||
v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
|
||||
v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
|
||||
v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
|
||||
v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
|
||||
v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
|
||||
16, 17, 17, 18, 18, 19, 20, 21};
|
||||
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
|
||||
v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
|
||||
v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
|
||||
v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
|
||||
|
||||
assert((dst_width % 3 == 0) && (dst_width > 0));
|
||||
|
||||
for (x = 0; x < dst_width; x += 48) {
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
|
||||
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
|
||||
src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
|
||||
src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
|
||||
src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
|
||||
src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
|
||||
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
|
||||
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
|
||||
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
|
||||
vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
|
||||
vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
|
||||
vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
|
||||
vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
|
||||
vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
|
||||
vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
|
||||
vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
|
||||
vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
|
||||
vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
|
||||
reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
|
||||
reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
|
||||
reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
|
||||
reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
|
||||
reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
|
||||
reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
|
||||
reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
|
||||
reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
|
||||
reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
|
||||
reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
|
||||
reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
|
||||
reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
|
||||
reg0 = __msa_srar_h(reg0, shft0);
|
||||
reg1 = __msa_srar_h(reg1, shft1);
|
||||
reg2 = __msa_srar_h(reg2, shft2);
|
||||
reg3 = __msa_srar_h(reg3, shft0);
|
||||
reg4 = __msa_srar_h(reg4, shft1);
|
||||
reg5 = __msa_srar_h(reg5, shft2);
|
||||
reg6 = __msa_srar_h(reg6, shft0);
|
||||
reg7 = __msa_srar_h(reg7, shft1);
|
||||
reg8 = __msa_srar_h(reg8, shft2);
|
||||
reg9 = __msa_srar_h(reg9, shft0);
|
||||
reg10 = __msa_srar_h(reg10, shft1);
|
||||
reg11 = __msa_srar_h(reg11, shft2);
|
||||
reg0 = reg0 * 3 + reg6;
|
||||
reg1 = reg1 * 3 + reg7;
|
||||
reg2 = reg2 * 3 + reg8;
|
||||
reg3 = reg3 * 3 + reg9;
|
||||
reg4 = reg4 * 3 + reg10;
|
||||
reg5 = reg5 * 3 + reg11;
|
||||
reg0 = __msa_srari_h(reg0, 2);
|
||||
reg1 = __msa_srari_h(reg1, 2);
|
||||
reg2 = __msa_srari_h(reg2, 2);
|
||||
reg3 = __msa_srari_h(reg3, 2);
|
||||
reg4 = __msa_srari_h(reg4, 2);
|
||||
reg5 = __msa_srari_h(reg5, 2);
|
||||
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
|
||||
dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
|
||||
dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
|
||||
__msa_st_b((v16i8)dst0, d, 0);
|
||||
__msa_st_b((v16i8)dst1, d, 16);
|
||||
__msa_st_b((v16i8)dst2, d, 32);
|
||||
s += 64;
|
||||
t += 64;
|
||||
d += 48;
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleRowDown34_1_Box_MSA(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* d,
|
||||
int dst_width) {
|
||||
const uint8* s = src_ptr;
|
||||
const uint8* t = src_ptr + src_stride;
|
||||
int x;
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
|
||||
v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
|
||||
v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
|
||||
v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
|
||||
v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
|
||||
v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
|
||||
v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
|
||||
v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
|
||||
v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
|
||||
v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
|
||||
16, 17, 17, 18, 18, 19, 20, 21};
|
||||
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
|
||||
v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
|
||||
v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
|
||||
v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
|
||||
|
||||
assert((dst_width % 3 == 0) && (dst_width > 0));
|
||||
|
||||
for (x = 0; x < dst_width; x += 48) {
|
||||
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
|
||||
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
|
||||
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
|
||||
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
|
||||
src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
|
||||
src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
|
||||
src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
|
||||
src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
|
||||
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
|
||||
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
|
||||
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
|
||||
vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
|
||||
vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
|
||||
vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
|
||||
vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
|
||||
vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
|
||||
vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
|
||||
vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
|
||||
vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
|
||||
vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
|
||||
reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
|
||||
reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
|
||||
reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
|
||||
reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
|
||||
reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
|
||||
reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
|
||||
reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
|
||||
reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
|
||||
reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
|
||||
reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
|
||||
reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
|
||||
reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
|
||||
reg0 = __msa_srar_h(reg0, shft0);
|
||||
reg1 = __msa_srar_h(reg1, shft1);
|
||||
reg2 = __msa_srar_h(reg2, shft2);
|
||||
reg3 = __msa_srar_h(reg3, shft0);
|
||||
reg4 = __msa_srar_h(reg4, shft1);
|
||||
reg5 = __msa_srar_h(reg5, shft2);
|
||||
reg6 = __msa_srar_h(reg6, shft0);
|
||||
reg7 = __msa_srar_h(reg7, shft1);
|
||||
reg8 = __msa_srar_h(reg8, shft2);
|
||||
reg9 = __msa_srar_h(reg9, shft0);
|
||||
reg10 = __msa_srar_h(reg10, shft1);
|
||||
reg11 = __msa_srar_h(reg11, shft2);
|
||||
reg0 += reg6;
|
||||
reg1 += reg7;
|
||||
reg2 += reg8;
|
||||
reg3 += reg9;
|
||||
reg4 += reg10;
|
||||
reg5 += reg11;
|
||||
reg0 = __msa_srari_h(reg0, 1);
|
||||
reg1 = __msa_srari_h(reg1, 1);
|
||||
reg2 = __msa_srari_h(reg2, 1);
|
||||
reg3 = __msa_srari_h(reg3, 1);
|
||||
reg4 = __msa_srari_h(reg4, 1);
|
||||
reg5 = __msa_srari_h(reg5, 1);
|
||||
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
|
||||
dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
|
||||
dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
|
||||
__msa_st_b((v16i8)dst0, d, 0);
|
||||
__msa_st_b((v16i8)dst1, d, 16);
|
||||
__msa_st_b((v16i8)dst2, d, 32);
|
||||
s += 64;
|
||||
t += 64;
|
||||
d += 48;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
|
|
@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
|
|||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
// load even pixels into q0, odd into q1
|
||||
"vld2.8 {q0, q1}, [%0]! \n"
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
|
@ -50,15 +50,10 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
|
|||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post
|
||||
// inc
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
"vpaddl.u8 q0, q0 \n" // add adjacent
|
||||
"vpaddl.u8 q1, q1 \n"
|
||||
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and
|
||||
// pack
|
||||
"vrshrn.u16 d1, q1, #1 \n"
|
||||
"vrhadd.u8 q0, q0, q1 \n" // rounding half add
|
||||
"vst1.8 {q0}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
|
@ -77,7 +72,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
|
|||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %0 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
|
||||
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
|
@ -106,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
|
|||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"vst1.8 {d2}, [%1]! \n"
|
||||
|
@ -126,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
|
|||
const uint8* src_ptr2 = src_ptr + src_stride * 2;
|
||||
const uint8* src_ptr3 = src_ptr + src_stride * 3;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load up 16x4
|
||||
"vld1.8 {q1}, [%3]! \n"
|
||||
"vld1.8 {q2}, [%4]! \n"
|
||||
|
@ -160,12 +155,12 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
|
|||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
|
||||
"subs %2, %2, #24 \n"
|
||||
"vmov d2, d3 \n" // order d0, d1, d2
|
||||
"vst3.8 {d0, d1, d2}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
|
||||
"subs %2, %2, #24 \n"
|
||||
"vmov d2, d3 \n" // order d0, d1, d2
|
||||
"vst3.8 {d0, d1, d2}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
|
@ -180,7 +175,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
|
|||
asm volatile(
|
||||
"vmov.u8 d24, #3 \n"
|
||||
"add %3, %0 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
|
||||
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
|
||||
"subs %2, %2, #24 \n"
|
||||
|
@ -237,7 +232,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
|
|||
asm volatile(
|
||||
"vmov.u8 d24, #3 \n"
|
||||
"add %3, %0 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
|
||||
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
|
||||
"subs %2, %2, #24 \n"
|
||||
|
@ -285,7 +280,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
|
|||
(void)src_stride;
|
||||
asm volatile(
|
||||
"vld1.8 {q3}, [%3] \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
|
||||
"subs %2, %2, #12 \n"
|
||||
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
|
||||
|
@ -312,7 +307,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
|
|||
"vld1.8 {q14}, [%6] \n"
|
||||
"vld1.8 {q15}, [%7] \n"
|
||||
"add %3, %0 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
|
||||
// d0 = 00 40 01 41 02 42 03 43
|
||||
// d1 = 10 50 11 51 12 52 13 53
|
||||
|
@ -421,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
|
|||
"vld1.16 {q13}, [%4] \n"
|
||||
"vld1.8 {q14}, [%5] \n"
|
||||
"add %3, %0 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
|
||||
// d0 = 00 40 01 41 02 42 03 43
|
||||
// d1 = 10 50 11 51 12 52 13 53
|
||||
|
@ -513,12 +508,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
|
|||
int src_height) {
|
||||
const uint8* src_tmp;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
"mov r12, %5 \n"
|
||||
"veor q2, q2, q2 \n"
|
||||
"veor q3, q3, q3 \n"
|
||||
"2: \n"
|
||||
"2: \n"
|
||||
// load 16 pixels into q0
|
||||
"vld1.8 {q0}, [%0], %3 \n"
|
||||
"vaddw.u8 q3, q3, d1 \n"
|
||||
|
@ -540,15 +535,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
|
|||
);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD2_DATA8_LANE(n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
|
||||
// clang-format on
|
||||
// Asm-template fragment: expands to four instructions that load one pair of
// adjacent bytes into lane n of d6 and d7, then step the position register.
//   lsr %5, %3, #16  ->  %5 = %3 >> 16
//   add %6, %1, %5   ->  %6 = %1 + %5 (address to load from)
//   add %3, %3, %4   ->  %3 += %4
//   vld2.8           ->  byte at [%6] goes to d6[n], byte at [%6 + 1] to d7[n]
// The %N placeholders resolve against the operand list of the asm statement
// that expands this macro; %5 and %6 are overwritten as scratch.
// NOTE(review): presumably %3 / %4 are the 16.16 fixed-point x and dx and %1
// is the source row pointer - confirm against the expanding asm statement.
#define LOAD2_DATA8_LANE(n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
"vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
|
||||
|
||||
// The NEON version mimics this formula (from row_common.cc):
|
||||
// #define BLENDER(a, b, f) (uint8)((int)(a) +
|
||||
|
@ -639,7 +632,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
|||
"rsb %4, #256 \n"
|
||||
"vdup.8 d4, %4 \n"
|
||||
// General purpose row blend.
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"vld1.8 {q1}, [%2]! \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
|
@ -654,7 +647,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
|||
"b 99f \n"
|
||||
|
||||
// Blend 25 / 75.
|
||||
"25: \n"
|
||||
"25: \n"
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"vld1.8 {q1}, [%2]! \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
|
@ -665,7 +658,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
|||
"b 99f \n"
|
||||
|
||||
// Blend 50 / 50.
|
||||
"50: \n"
|
||||
"50: \n"
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"vld1.8 {q1}, [%2]! \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
|
@ -675,7 +668,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
|||
"b 99f \n"
|
||||
|
||||
// Blend 75 / 25.
|
||||
"75: \n"
|
||||
"75: \n"
|
||||
"vld1.8 {q1}, [%1]! \n"
|
||||
"vld1.8 {q0}, [%2]! \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
|
@ -686,13 +679,13 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
|
|||
"b 99f \n"
|
||||
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
"100: \n"
|
||||
"100: \n"
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"vst1.8 {q0}, [%0]! \n"
|
||||
"bgt 100b \n"
|
||||
|
||||
"99: \n"
|
||||
"99: \n"
|
||||
"vst1.8 {d1[7]}, [%0] \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
|
@ -709,13 +702,12 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
|
|||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
// load even pixels into q0, odd into q1
|
||||
"vld2.32 {q0, q1}, [%0]! \n"
|
||||
"vld2.32 {q2, q3}, [%0]! \n"
|
||||
"1: \n"
|
||||
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"vst1.8 {q1}, [%1]! \n" // store odd pixels
|
||||
"vst1.8 {q3}, [%1]! \n"
|
||||
"vmov q2, q1 \n" // load next 8 ARGB
|
||||
"vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
|
@ -725,27 +717,26 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
|
|||
);
|
||||
}
|
||||
|
||||
// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
|
||||
// 4a: 3e04 subs r6, #4
|
||||
// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
|
||||
// 50: ef64 21f4 vorr q9, q10, q10
|
||||
// 54: f942 038d vst2.32 {d16-d19}, [r2]!
|
||||
// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
|
||||
|
||||
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_argb,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
|
||||
// pixels.
|
||||
"1: \n"
|
||||
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
|
||||
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and
|
||||
// pack
|
||||
"vrshrn.u16 d1, q1, #1 \n"
|
||||
"vrshrn.u16 d2, q2, #1 \n"
|
||||
"vrshrn.u16 d3, q3, #1 \n"
|
||||
"vst4.8 {d0, d1, d2, d3}, [%1]! \n"
|
||||
"vrhadd.u8 q0, q0, q1 \n" // rounding half add
|
||||
"vrhadd.u8 q1, q2, q3 \n" // rounding half add
|
||||
"vst2.32 {q0, q1}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
|
@ -762,25 +753,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
|
|||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
|
||||
// pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
|
||||
// pixels.
|
||||
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
|
||||
// pixels.
|
||||
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
|
||||
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and
|
||||
// pack
|
||||
"vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
|
||||
"vrshrn.u16 d1, q1, #2 \n"
|
||||
"vrshrn.u16 d2, q2, #2 \n"
|
||||
"vrshrn.u16 d3, q3, #2 \n"
|
||||
|
@ -804,7 +791,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
|
|||
(void)src_stride;
|
||||
asm volatile(
|
||||
"mov r12, %3, lsl #2 \n"
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.32 {d0[0]}, [%0], r12 \n"
|
||||
"vld1.32 {d0[1]}, [%0], r12 \n"
|
||||
"vld1.32 {d1[0]}, [%0], r12 \n"
|
||||
|
@ -829,9 +816,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
|
|||
asm volatile(
|
||||
"mov r12, %4, lsl #2 \n"
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks ->
|
||||
// 2x1
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
|
||||
"vld1.8 {d1}, [%1], r12 \n"
|
||||
"vld1.8 {d2}, [%0], r12 \n"
|
||||
"vld1.8 {d3}, [%1], r12 \n"
|
||||
|
@ -860,15 +846,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
|
|||
: "memory", "cc", "r12", "q0", "q1", "q2", "q3");
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
// TODO(Yang Zhang): Investigate fewer load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD1_DATA32_LANE(dn, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl #2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
// Asm-template fragment: expands to four instructions that load one 32-bit
// element into lane n of D register dn, then step the position register.
//   lsr %5, %3, #16          ->  %5 = %3 >> 16
//   add %6, %1, %5, lsl #2   ->  %6 = %1 + (%5 << 2), i.e. %5 scaled by 4 bytes
//   add %3, %3, %4           ->  %3 += %4
//   vld1.32                  ->  dn[n] = 32-bit word at [%6]
// The %N placeholders resolve against the operand list of the asm statement
// that expands this macro; %5 and %6 are overwritten as scratch.
// NOTE(review): presumably %3 / %4 are the 16.16 fixed-point x and dx -
// confirm against the operand list of the expanding asm statement.
#define LOAD1_DATA32_LANE(dn, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl #2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
"vld1.32 {" #dn "[" #n "]}, [%6] \n"
|
||||
// clang-format on
|
||||
|
||||
void ScaleARGBCols_NEON(uint8* dst_argb,
|
||||
const uint8* src_argb,
|
||||
|
@ -878,15 +862,20 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
|
|||
int tmp;
|
||||
const uint8* src_tmp = src_argb;
|
||||
asm volatile(
|
||||
"1: \n" LOAD1_DATA32_LANE(
|
||||
d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0)
|
||||
LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE(
|
||||
d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1)
|
||||
|
||||
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per
|
||||
// loop
|
||||
"bgt 1b \n"
|
||||
"1: \n"
|
||||
// clang-format off
|
||||
LOAD1_DATA32_LANE(d0, 0)
|
||||
LOAD1_DATA32_LANE(d0, 1)
|
||||
LOAD1_DATA32_LANE(d1, 0)
|
||||
LOAD1_DATA32_LANE(d1, 1)
|
||||
LOAD1_DATA32_LANE(d2, 0)
|
||||
LOAD1_DATA32_LANE(d2, 1)
|
||||
LOAD1_DATA32_LANE(d3, 0)
|
||||
LOAD1_DATA32_LANE(d3, 1)
|
||||
// clang-format on
|
||||
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width), // %2
|
||||
|
@ -900,15 +889,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
|
|||
|
||||
#undef LOAD1_DATA32_LANE
|
||||
|
||||
// clang-format off
|
||||
// TODO(Yang Zhang): Investigate fewer load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl #2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
|
||||
// clang-format on
|
||||
// Asm-template fragment: expands to four instructions that load two adjacent
// 32-bit elements into lane n of D registers dn1 and dn2, then step the
// position register.
//   lsr %5, %3, #16          ->  %5 = %3 >> 16
//   add %6, %1, %5, lsl #2   ->  %6 = %1 + (%5 << 2), i.e. %5 scaled by 4 bytes
//   add %3, %3, %4           ->  %3 += %4
//   vld2.32                  ->  dn1[n] = word at [%6], dn2[n] = word at [%6 + 4]
// The %N placeholders resolve against the operand list of the asm statement
// that expands this macro; %5 and %6 are overwritten as scratch.
// NOTE(review): presumably %3 / %4 are the 16.16 fixed-point x and dx and %1
// is the source ARGB row pointer - confirm against the expanding asm statement.
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl #2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
|
||||
|
||||
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
|
||||
const uint8* src_argb,
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -17,7 +17,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
// This module is for 32 bit Visual C x86 and clangcl
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||
|
||||
// Offsets for source bytes 0 to 9
|
||||
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
|
||||
|
@ -816,7 +816,7 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
|
|||
mov ecx, [esp + 12] // src_width
|
||||
pxor xmm5, xmm5
|
||||
|
||||
// sum rows
|
||||
// sum rows
|
||||
xloop:
|
||||
movdqu xmm3, [eax] // read 16 bytes
|
||||
lea eax, [eax + 16]
|
||||
|
@ -847,7 +847,7 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
|
|||
mov ecx, [esp + 12] // src_width
|
||||
vpxor ymm5, ymm5, ymm5
|
||||
|
||||
// sum rows
|
||||
// sum rows
|
||||
xloop:
|
||||
vmovdqu ymm3, [eax] // read 32 bytes
|
||||
lea eax, [eax + 32]
|
||||
|
@ -939,7 +939,7 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
|
|||
add ecx, 2 - 1
|
||||
jl xloop99
|
||||
|
||||
// 1 pixel remainder
|
||||
// 1 pixel remainder
|
||||
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
|
||||
movd xmm0, ebx
|
||||
psrlw xmm2, 9 // 7 bit fractions.
|
||||
|
@ -1194,7 +1194,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
|
|||
sub ecx, 4
|
||||
jl xloop49
|
||||
|
||||
// 4 Pixel loop.
|
||||
// 4 Pixel loop.
|
||||
xloop4:
|
||||
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
|
||||
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
|
||||
|
@ -1218,7 +1218,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
|
|||
test ecx, 2
|
||||
je xloop29
|
||||
|
||||
// 2 Pixels.
|
||||
// 2 Pixels.
|
||||
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
|
||||
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
|
||||
pextrw eax, xmm2, 5 // get x2 integer.
|
||||
|
@ -1231,7 +1231,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
|
|||
test ecx, 1
|
||||
je xloop99
|
||||
|
||||
// 1 Pixels.
|
||||
// 1 Pixels.
|
||||
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
|
||||
movd dword ptr [edi], xmm0
|
||||
xloop99:
|
||||
|
@ -1309,7 +1309,7 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
|
|||
add ecx, 2 - 1
|
||||
jl xloop99
|
||||
|
||||
// 1 pixel remainder
|
||||
// 1 pixel remainder
|
||||
psrlw xmm2, 9 // 7 bit fractions.
|
||||
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
|
||||
pshufb xmm2, xmm5 // 00000000
|
||||
|
|
Loading…
Reference in New Issue