Close #2065: Update libyuv to fix a linker error when building libyuv as a DLL with Visual Studio 2015.

git-svn-id: https://svn.pjsip.org/repos/pjproject/trunk@5699 74dad513-b988-da41-8d7b-12977e46ad98
Riza Sulistyo 2017-11-21 09:25:11 +00:00
parent c8847d0d12
commit 77545dfdac
39 changed files with 4140 additions and 1705 deletions

View File

@ -1,30 +1,4 @@
Notes:
* Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 27 July 2017.
* Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 17 November 2017.
* All code is compilable, except for compare_win.cc
- Use older version (https://chromium.googlesource.com/libyuv/libyuv/+/baf6a3c1bd385e7ffe6b7634560e71fb49e4f589%5E%21/)
Since there's a compiler error on:
--------------------------------------------------------------------------------------
pmulld xmm0,xmm6
--------------------------------------------------------------------------------------
- On VS2015, error C2024: 'alignas' attribute applies to variables, data members and tag types only
--------------------------------------------------------------------------------------
__declspec(naked) __declspec(align(16))
Change to:
__declspec(naked)
--------------------------------------------------------------------------------------
* Added these lines to include/libyuv/basic_types.h:
--
#if _MSC_VER==1400
# include <stdint.h> // for uint8_t
#endif
...
#if defined(_MSC_VER)
# pragma warning(disable:4996) // This function or variable may be unsafe.
#endif
--
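For context, a minimal sketch of the C2024 workaround described above, applied to a hypothetical row function (ExampleRow_SSE2 and its body are illustrative only; the sketch assumes 32-bit MSVC inline assembly):
--
#include "libyuv/basic_types.h"

// Older sources declared SIMD rows as:
//   __declspec(naked) __declspec(align(16))
// VS2015 rejects the align attribute on functions (error C2024), so the
// updated sources keep only __declspec(naked).
__declspec(naked) uint32 ExampleRow_SSE2(const uint8* src, int count) {
  __asm {
    mov eax, [esp + 8]  // count; a real row kernel would do SIMD work here
    ret                 // a naked function supplies its own return
  }
}
--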

View File

@ -14,18 +14,11 @@
#include <stddef.h> // for NULL, size_t
#if defined(_MSC_VER) && (_MSC_VER < 1600)
#if _MSC_VER==1400
# include <stdint.h> // for uint8_t
#endif
#include <sys/types.h> // for uintptr_t on x86
#else
#include <stdint.h> // for uintptr_t
#endif
#if defined(_MSC_VER)
# pragma warning(disable:4996) // This function or variable may be unsafe.
#endif
#ifndef GG_LONGLONG
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED

View File

@ -19,7 +19,7 @@ extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
@ -42,6 +42,7 @@ extern "C" {
#endif // clang >= 3.4
#endif // __clang__
// The following are available for Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
@ -52,7 +53,7 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
#define HAS_HASHDJB2_SSE41
#define HAS_SUMSQUAREERROR_SSE2
#define HAS_HAMMINGDISTANCE_X86
#define HAS_HAMMINGDISTANCE_SSE42
#endif
// The following are available for Visual C and clangcl 32 bit:
@ -62,6 +63,18 @@ extern "C" {
#define HAS_SUMSQUAREERROR_AVX2
#endif
// The following are available for GCC and clangcl 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_HAMMINGDISTANCE_SSSE3
#endif
// The following are available for GCC and clangcl 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_HAMMINGDISTANCE_AVX2
#endif
// The following are available for Neon:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
@ -69,14 +82,23 @@ extern "C" {
#define HAS_HAMMINGDISTANCE_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_HAMMINGDISTANCE_MSA
#define HAS_SUMSQUAREERROR_MSA
#endif
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_MSA(const uint8* src_a, const uint8* src_b, int count);
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);

View File

@ -188,6 +188,30 @@ int I420ToRAW(const uint8* src_y,
int width,
int height);
LIBYUV_API
int H420ToRGB24(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int H420ToRAW(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
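The new H420ToRGB24/H420ToRAW entry points declared above are thin wrappers that, per the implementation later in this commit, feed the BT.709 matrix kYuvH709Constants into I420ToRGB24Matrix. A possible usage sketch for a tightly packed H420 frame (the helper, buffer names and strides are hypothetical; the signature comes from this header, assumed to be libyuv/convert_from.h):
--
#include <vector>
#include "libyuv/convert_from.h"

// Convert one BT.709 (H420) frame to packed RGB24; strides assume tightly
// packed planes (Y stride = width, U/V strides = width / 2).
void H420FrameToRGB24(const uint8* y, const uint8* u, const uint8* v,
                      int width, int height, std::vector<uint8>* rgb24) {
  rgb24->resize(static_cast<size_t>(width) * height * 3);
  libyuv::H420ToRGB24(y, width,
                      u, width / 2,
                      v, width / 2,
                      rgb24->data(), width * 3,
                      width, height);
}
--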
LIBYUV_API
int I420ToRGB565(const uint8* src_y,
int src_stride_y,

View File

@ -36,15 +36,19 @@ static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasAVX3 = 0x2000;
static const int kCpuHasF16C = 0x4000;
// 0x8000 reserved for future X86 flags.
static const int kCpuHasF16C = 0x2000;
static const int kCpuHasGFNI = 0x4000;
static const int kCpuHasAVX512BW = 0x8000;
static const int kCpuHasAVX512VL = 0x10000;
static const int kCpuHasAVX512VBMI = 0x20000;
static const int kCpuHasAVX512VBMI2 = 0x40000;
static const int kCpuHasAVX512VBITALG = 0x80000;
static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasDSPR2 = 0x20000;
static const int kCpuHasMSA = 0x40000;
static const int kCpuHasMIPS = 0x200000;
static const int kCpuHasDSPR2 = 0x400000;
static const int kCpuHasMSA = 0x800000;
// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
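Note that the x86 flag values above were renumbered (kCpuHasF16C moved down and the MIPS flags moved up to make room for the new AVX-512 bits), so callers should be rebuilt against this header rather than relying on the old numeric values. A small, hypothetical reporting program using the public API (TestCpuFlag and the kCpuHas* constants come from this header; everything else is illustrative and assumes linking against libyuv):
--
#include <cstdio>
#include "libyuv/cpu_id.h"

int main() {
  using namespace libyuv;
  // TestCpuFlag() returns the masked bit, so !! normalizes it to 0/1.
  std::printf("SSE2=%d AVX2=%d F16C=%d AVX512BW=%d AVX512VPOPCNTDQ=%d MSA=%d\n",
              !!TestCpuFlag(kCpuHasSSE2), !!TestCpuFlag(kCpuHasAVX2),
              !!TestCpuFlag(kCpuHasF16C), !!TestCpuFlag(kCpuHasAVX512BW),
              !!TestCpuFlag(kCpuHasAVX512VPOPCNTDQ), !!TestCpuFlag(kCpuHasMSA));
  return 0;
}
--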

View File

@ -69,6 +69,32 @@ void MergeUVPlane(const uint8* src_u,
int width,
int height);
// Split interleaved RGB plane into separate R, G and B planes.
LIBYUV_API
void SplitRGBPlane(const uint8* src_rgb,
int src_stride_rgb,
uint8* dst_r,
int dst_stride_r,
uint8* dst_g,
int dst_stride_g,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
// Merge separate R, G and B planes into one interleaved RGB plane.
LIBYUV_API
void MergeRGBPlane(const uint8* src_r,
int src_stride_r,
const uint8* src_g,
int src_stride_g,
const uint8* src_b,
int src_stride_b,
uint8* dst_rgb,
int dst_stride_rgb,
int width,
int height);
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8* src_y,
@ -720,7 +746,7 @@ int I420Interpolate(const uint8* src0_y,
int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
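A possible usage sketch for the new SplitRGBPlane/MergeRGBPlane entry points declared above, round-tripping a tightly packed RGB24 buffer (the helper and buffer names are hypothetical; the signatures come from this header, assumed to be libyuv/planar_functions.h):
--
#include <vector>
#include "libyuv/planar_functions.h"

// Split packed RGB24 into R, G and B planes, then merge them back.
void RoundTripRGB24(const uint8* rgb, int width, int height) {
  const int n = width * height;
  std::vector<uint8> r(n), g(n), b(n), out(n * 3);
  libyuv::SplitRGBPlane(rgb, width * 3,         // packed source, 3 bytes/pixel
                        r.data(), width,
                        g.data(), width,
                        b.data(), width,
                        width, height);
  libyuv::MergeRGBPlane(r.data(), width,
                        g.data(), width,
                        b.data(), width,
                        out.data(), width * 3,  // packed destination
                        width, height);
}
--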

View File

@ -19,7 +19,7 @@ extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
@ -29,7 +29,7 @@ extern "C" {
#endif
#endif
// The following are available for Visual C and clangcl 32 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif

View File

@ -31,7 +31,7 @@ extern "C" {
var = 0
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
@ -264,6 +264,23 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@ -323,6 +340,7 @@ extern "C" {
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
@ -354,6 +372,11 @@ extern "C" {
#define HAS_SOBELYROW_NEON
#endif
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
@ -385,72 +408,82 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_ARGBMIRRORROW_MSA
#define HAS_I422TOUYVYROW_MSA
#define HAS_I422TOYUY2ROW_MSA
#define HAS_MIRRORROW_MSA
#define HAS_UYVYTOUVROW_MSA
#define HAS_UYVYTOYROW_MSA
#define HAS_YUY2TOUV422ROW_MSA
#define HAS_YUY2TOUVROW_MSA
#define HAS_YUY2TOYROW_MSA
#define HAS_ABGRTOUVROW_MSA
#define HAS_ABGRTOYROW_MSA
#define HAS_ARGB1555TOARGBROW_MSA
#define HAS_ARGB1555TOUVROW_MSA
#define HAS_ARGB1555TOYROW_MSA
#define HAS_ARGB4444TOARGBROW_MSA
#define HAS_ARGBTOYROW_MSA
#define HAS_ARGBTOUVROW_MSA
#define HAS_I422TOARGBROW_MSA
#define HAS_I422TORGBAROW_MSA
#define HAS_I422ALPHATOARGBROW_MSA
#define HAS_I422TORGB24ROW_MSA
#define HAS_ARGBTORGB24ROW_MSA
#define HAS_ARGBTORAWROW_MSA
#define HAS_ARGBTORGB565ROW_MSA
#define HAS_ARGBADDROW_MSA
#define HAS_ARGBATTENUATEROW_MSA
#define HAS_ARGBBLENDROW_MSA
#define HAS_ARGBCOLORMATRIXROW_MSA
#define HAS_ARGBEXTRACTALPHAROW_MSA
#define HAS_ARGBGRAYROW_MSA
#define HAS_ARGBMIRRORROW_MSA
#define HAS_ARGBMULTIPLYROW_MSA
#define HAS_ARGBQUANTIZEROW_MSA
#define HAS_ARGBSEPIAROW_MSA
#define HAS_ARGBSETROW_MSA
#define HAS_ARGBSHADEROW_MSA
#define HAS_ARGBSHUFFLEROW_MSA
#define HAS_ARGBSUBTRACTROW_MSA
#define HAS_ARGBTOARGB1555ROW_MSA
#define HAS_ARGBTOARGB4444ROW_MSA
#define HAS_ARGBTOUV444ROW_MSA
#define HAS_ARGBMULTIPLYROW_MSA
#define HAS_ARGBADDROW_MSA
#define HAS_ARGBSUBTRACTROW_MSA
#define HAS_ARGBATTENUATEROW_MSA
#define HAS_ARGBTORAWROW_MSA
#define HAS_ARGBTORGB24ROW_MSA
#define HAS_ARGBTORGB565DITHERROW_MSA
#define HAS_ARGBSHUFFLEROW_MSA
#define HAS_ARGBSHADEROW_MSA
#define HAS_ARGBGRAYROW_MSA
#define HAS_ARGBSEPIAROW_MSA
#define HAS_ARGB1555TOARGBROW_MSA
#define HAS_RGB565TOARGBROW_MSA
#define HAS_RGB24TOARGBROW_MSA
#define HAS_RAWTOARGBROW_MSA
#define HAS_ARGB1555TOYROW_MSA
#define HAS_RGB565TOYROW_MSA
#define HAS_RGB24TOYROW_MSA
#define HAS_RAWTOYROW_MSA
#define HAS_ARGB1555TOUVROW_MSA
#define HAS_RGB565TOUVROW_MSA
#define HAS_RGB24TOUVROW_MSA
#define HAS_RAWTOUVROW_MSA
#define HAS_ARGBTORGB565ROW_MSA
#define HAS_ARGBTOUV444ROW_MSA
#define HAS_ARGBTOUVJROW_MSA
#define HAS_ARGBTOUVROW_MSA
#define HAS_ARGBTOYJROW_MSA
#define HAS_ARGBTOYROW_MSA
#define HAS_BGRATOUVROW_MSA
#define HAS_BGRATOYROW_MSA
#define HAS_HALFFLOATROW_MSA
#define HAS_I400TOARGBROW_MSA
#define HAS_I422ALPHATOARGBROW_MSA
#define HAS_I422TOARGBROW_MSA
#define HAS_I422TORGB24ROW_MSA
#define HAS_I422TORGBAROW_MSA
#define HAS_I422TOUYVYROW_MSA
#define HAS_I422TOYUY2ROW_MSA
#define HAS_I444TOARGBROW_MSA
#define HAS_INTERPOLATEROW_MSA
#define HAS_J400TOARGBROW_MSA
#define HAS_MERGEUVROW_MSA
#define HAS_MIRRORROW_MSA
#define HAS_MIRRORUVROW_MSA
#define HAS_NV12TOARGBROW_MSA
#define HAS_NV12TORGB565ROW_MSA
#define HAS_NV21TOARGBROW_MSA
#define HAS_RAWTOARGBROW_MSA
#define HAS_RAWTORGB24ROW_MSA
#define HAS_RAWTOUVROW_MSA
#define HAS_RAWTOYROW_MSA
#define HAS_RGB24TOARGBROW_MSA
#define HAS_RGB24TOUVROW_MSA
#define HAS_RGB24TOYROW_MSA
#define HAS_RGB565TOARGBROW_MSA
#define HAS_RGB565TOUVROW_MSA
#define HAS_RGB565TOYROW_MSA
#define HAS_RGBATOUVROW_MSA
#define HAS_RGBATOYROW_MSA
#define HAS_SETROW_MSA
#define HAS_SOBELROW_MSA
#define HAS_SOBELTOPLANEROW_MSA
#define HAS_SOBELXROW_MSA
#define HAS_SOBELXYROW_MSA
#define HAS_ARGBTOYJROW_MSA
#define HAS_BGRATOYROW_MSA
#define HAS_ABGRTOYROW_MSA
#define HAS_RGBATOYROW_MSA
#define HAS_ARGBTOUVJROW_MSA
#define HAS_BGRATOUVROW_MSA
#define HAS_ABGRTOUVROW_MSA
#define HAS_RGBATOUVROW_MSA
#define HAS_I444TOARGBROW_MSA
#define HAS_I400TOARGBROW_MSA
#define HAS_J400TOARGBROW_MSA
#define HAS_YUY2TOARGBROW_MSA
#define HAS_SOBELYROW_MSA
#define HAS_SPLITUVROW_MSA
#define HAS_UYVYTOARGBROW_MSA
#define HAS_INTERPOLATEROW_MSA
#define HAS_ARGBSETROW_MSA
#define HAS_RAWTORGB24ROW_MSA
#define HAS_MERGEUVROW_MSA
#define HAS_UYVYTOUVROW_MSA
#define HAS_UYVYTOYROW_MSA
#define HAS_YUY2TOARGBROW_MSA
#define HAS_YUY2TOUV422ROW_MSA
#define HAS_YUY2TOUVROW_MSA
#define HAS_YUY2TOYROW_MSA
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@ -1345,6 +1378,10 @@ void MirrorUVRow_DSPR2(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void MirrorUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
@ -1374,6 +1411,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void SplitUVRow_MSA(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
void SplitUVRow_Any_SSE2(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
@ -1390,6 +1428,10 @@ void SplitUVRow_Any_DSPR2(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void SplitUVRow_Any_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width);
void MergeUVRow_C(const uint8* src_u,
const uint8* src_v,
@ -1428,6 +1470,75 @@ void MergeUVRow_Any_MSA(const uint8* src_u,
uint8* dst_uv,
int width);
void SplitRGBRow_C(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_Any_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_Any_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void MergeRGBRow_C(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_Any_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_Any_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeUVRow_16_C(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int scale, /* 64 for 10 bit */
int width);
void MergeUVRow_16_AVX2(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int scale,
int width);
void MultiplyRow_16_AVX2(const uint16* src_y,
uint16* dst_y,
int scale,
int width);
void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
@ -1454,6 +1565,7 @@ void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width);
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width);
void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb,
uint8* dst_a,
int width);
@ -1463,6 +1575,9 @@ void ARGBExtractAlphaRow_Any_AVX2(const uint8* src_argb,
void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb,
uint8* dst_a,
int width);
void ARGBExtractAlphaRow_Any_MSA(const uint8* src_argb,
uint8* dst_a,
int width);
void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
@ -1475,6 +1590,7 @@ void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y,
int width);
void SetRow_C(uint8* dst, uint8 v8, int count);
void SetRow_MSA(uint8* dst, uint8 v8, int count);
void SetRow_X86(uint8* dst, uint8 v8, int count);
void SetRow_ERMS(uint8* dst, uint8 v8, int count);
void SetRow_NEON(uint8* dst, uint8 v8, int count);
@ -2122,6 +2238,10 @@ void ARGBBlendRow_NEON(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
void ARGBBlendRow_MSA(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
int width);
void ARGBBlendRow_C(const uint8* src_argb,
const uint8* src_argb1,
uint8* dst_argb,
@ -2835,6 +2955,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
uint8* dst_argb,
const int8* matrix_argb,
int width);
void ARGBColorMatrixRow_MSA(const uint8* src_argb,
uint8* dst_argb,
const int8* matrix_argb,
int width);
void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
@ -2857,6 +2981,11 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
int interval_size,
int interval_offset,
int width);
void ARGBQuantizeRow_MSA(uint8* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width);
void ARGBShadeRow_C(const uint8* src_argb,
uint8* dst_argb,
@ -2990,6 +3119,11 @@ void SobelXRow_NEON(const uint8* src_y0,
const uint8* src_y2,
uint8* dst_sobelx,
int width);
void SobelXRow_MSA(const uint8* src_y0,
const uint8* src_y1,
const uint8* src_y2,
uint8* dst_sobelx,
int width);
void SobelYRow_C(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
@ -3002,6 +3136,10 @@ void SobelYRow_NEON(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int width);
void SobelYRow_MSA(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int width);
void SobelRow_C(const uint8* src_sobelx,
const uint8* src_sobely,
uint8* dst_argb,
@ -3132,6 +3270,11 @@ void HalfFloat1Row_Any_NEON(const uint16* src,
uint16* dst,
float scale,
int width);
void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_MSA(const uint16* src,
uint16* dst,
float scale,
int width);
void ARGBLumaColorTableRow_C(const uint8* src_argb,
uint8* dst_argb,
@ -3144,6 +3287,19 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
const uint8* luma,
uint32 lumacoeff);
float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width);
float ScaleMaxSamples_NEON(const float* src,
float* dst,
float scale,
int width);
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width);
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -20,7 +20,7 @@ extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
(defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
@ -105,12 +105,16 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_SCALEADDROW_MSA
#define HAS_SCALEARGBCOLS_MSA
#define HAS_SCALEARGBFILTERCOLS_MSA
#define HAS_SCALEARGBROWDOWN2_MSA
#define HAS_SCALEARGBROWDOWNEVEN_MSA
#define HAS_SCALEFILTERCOLS_MSA
#define HAS_SCALEROWDOWN2_MSA
#define HAS_SCALEROWDOWN4_MSA
#define HAS_SCALEROWDOWN34_MSA
#define HAS_SCALEROWDOWN38_MSA
#define HAS_SCALEADDROW_MSA
#define HAS_SCALEROWDOWN4_MSA
#endif
// Scale ARGB vertically with bilinear interpolation.
@ -546,6 +550,26 @@ void ScaleARGBCols_Any_NEON(uint8* dst_argb,
int dst_width,
int x,
int dx);
void ScaleARGBFilterCols_MSA(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBCols_MSA(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBFilterCols_Any_MSA(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
void ScaleARGBCols_Any_MSA(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
@ -885,6 +909,24 @@ void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
void ScaleFilterCols_MSA(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
void ScaleRowDown34_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_0_Box_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_1_Box_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@ -920,6 +962,23 @@ void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width);
void ScaleFilterCols_Any_MSA(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx);
void ScaleRowDown34_Any_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_0_Box_Any_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
void ScaleRowDown34_1_Box_Any_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width);
#ifdef __cplusplus
} // extern "C"

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1662
#define LIBYUV_VERSION 1678
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -110,12 +110,17 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
return fourcc;
}
// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes.
// So actual maximum is 1 less loop, which is 65536 - 32 bytes.
LIBYUV_API
uint64 ComputeHammingDistance(const uint8* src_a,
const uint8* src_b,
int count) {
const int kBlockSize = 65536;
int remainder = count & (kBlockSize - 1) & ~31;
const int kBlockSize = 1 << 15; // 32768;
const int kSimdSize = 64;
// SIMD for multiple of 64, and C for remainder
int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
uint64 diff = 0;
int i;
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
@ -125,9 +130,14 @@ uint64 ComputeHammingDistance(const uint8* src_a,
HammingDistance = HammingDistance_NEON;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_X86)
if (TestCpuFlag(kCpuHasX86)) {
HammingDistance = HammingDistance_X86;
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
HammingDistance = HammingDistance_SSSE3;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_SSE42)
if (TestCpuFlag(kCpuHasSSE42)) {
HammingDistance = HammingDistance_SSE42;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_AVX2)
@ -135,6 +145,11 @@ uint64 ComputeHammingDistance(const uint8* src_a,
HammingDistance = HammingDistance_AVX2;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
HammingDistance = HammingDistance_MSA;
}
#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
#endif
@ -148,7 +163,7 @@ uint64 ComputeHammingDistance(const uint8* src_a,
src_a += remainder;
src_b += remainder;
}
remainder = count & 31;
remainder = count & (kSimdSize - 1);
if (remainder) {
diff += HammingDistance_C(src_a, src_b, remainder);
}
@ -186,6 +201,11 @@ uint64 ComputeSumSquareError(const uint8* src_a,
SumSquareError = SumSquareError_AVX2;
}
#endif
#if defined(HAS_SUMSQUAREERROR_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SumSquareError = SumSquareError_MSA;
}
#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
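The change above processes the data in 32 KB blocks (1 << 15), so per the comment the NEON 16-bit accumulators stay safely below their 65536-byte overflow point, and the tail is split into a 64-byte-aligned SIMD part plus a scalar part. A scalar sketch of the same blocking scheme (names are illustrative; __builtin_popcount assumes GCC/Clang, as in the patch's own 32-bit fallback):
--
#include <cstdint>

// Scalar stand-in for the SIMD kernels selected at runtime in the patch.
static uint32_t PopcountBytes(const uint8_t* a, const uint8_t* b, int count) {
  uint32_t diff = 0;
  for (int i = 0; i < count; ++i) {
    diff += __builtin_popcount(a[i] ^ b[i]);
  }
  return diff;
}

// Same blocking scheme as ComputeHammingDistance above: full 32 KB blocks,
// then a 64-byte-multiple tail, then the last few bytes in scalar code.
uint64_t BlockedHammingDistance(const uint8_t* a, const uint8_t* b, int count) {
  const int kBlockSize = 1 << 15;  // 32768
  const int kSimdSize = 64;
  uint64_t diff = 0;
  const int full = count & ~(kBlockSize - 1);
  for (int i = 0; i < full; i += kBlockSize) {
    diff += PopcountBytes(a + i, b + i, kBlockSize);
  }
  a += full;
  b += full;
  int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
  if (remainder) {
    diff += PopcountBytes(a, b, remainder);
    a += remainder;
    b += remainder;
  }
  remainder = count & (kSimdSize - 1);
  if (remainder) {
    diff += PopcountBytes(a, b, remainder);
  }
  return diff;
}
--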

View File

@ -18,7 +18,7 @@ extern "C" {
#endif
#if ORIGINAL_OPT
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
uint32 HammingDistance_C1(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff = 0u;
int i;
@ -58,6 +58,16 @@ uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
src_a += 4;
src_b += 4;
}
for (; i < count; ++i) {
uint32 x = *src_a ^ *src_b;
uint32 u = x - ((x >> 1) & 0x55);
u = ((u >> 2) & 0x33) + (u & 0x33);
diff += (u + (u >> 4)) & 0x0f;
src_a += 1;
src_b += 1;
}
return diff;
}
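The new tail loop above counts bits one byte at a time with a small SWAR reduction: pairs of bits are folded into 2-bit counts, then nibbles into 4-bit counts, then the two nibble counts are added. For example, x = 0xff reduces to 0xaa, then 0x44, then 8. A standalone check of that byte-wise step (PopcountByte is a hypothetical name):
--
#include <cassert>
#include <cstdint>

// Per-byte population count, same arithmetic as the tail loop above.
static uint32_t PopcountByte(uint8_t x) {
  uint32_t u = x - ((x >> 1) & 0x55);  // 2-bit counts of each bit pair
  u = ((u >> 2) & 0x33) + (u & 0x33);  // 4-bit counts of each nibble
  return (u + (u >> 4)) & 0x0f;        // total bits set in the byte (0..8)
}

int main() {
  assert(PopcountByte(0x00) == 0);
  assert(PopcountByte(0xff) == 8);  // 0xff -> 0xaa -> 0x44 -> 8
  assert(PopcountByte(0xa5) == 4);  // 1010 0101 has four bits set
  return 0;
}
--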

View File

@ -22,18 +22,210 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
#if defined(__x86_64__)
uint32 HammingDistance_SSE42(const uint8* src_a,
const uint8* src_b,
int count) {
uint64 diff = 0u;
asm volatile(
"xor %3,%3 \n"
"xor %%r8,%%r8 \n"
"xor %%r9,%%r9 \n"
"xor %%r10,%%r10 \n"
// Process 32 bytes per loop.
LABELALIGN
"1: \n"
"mov (%0),%%rcx \n"
"mov 0x8(%0),%%rdx \n"
"xor (%1),%%rcx \n"
"xor 0x8(%1),%%rdx \n"
"popcnt %%rcx,%%rcx \n"
"popcnt %%rdx,%%rdx \n"
"mov 0x10(%0),%%rsi \n"
"mov 0x18(%0),%%rdi \n"
"xor 0x10(%1),%%rsi \n"
"xor 0x18(%1),%%rdi \n"
"popcnt %%rsi,%%rsi \n"
"popcnt %%rdi,%%rdi \n"
"add $0x20,%0 \n"
"add $0x20,%1 \n"
"add %%rcx,%3 \n"
"add %%rdx,%%r8 \n"
"add %%rsi,%%r9 \n"
"add %%rdi,%%r10 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"add %%r8, %3 \n"
"add %%r9, %3 \n"
"add %%r10, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=r"(diff) // %3
:
: "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
return static_cast<uint32>(diff);
}
#else
uint32 HammingDistance_SSE42(const uint8* src_a,
const uint8* src_b,
int count) {
uint32 diff = 0u;
int i;
for (i = 0; i < count - 7; i += 8) {
uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b);
src_a += 8;
src_b += 8;
diff += __builtin_popcountll(x);
}
asm volatile(
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
"mov (%0),%%ecx \n"
"mov 0x4(%0),%%edx \n"
"xor (%1),%%ecx \n"
"xor 0x4(%1),%%edx \n"
"popcnt %%ecx,%%ecx \n"
"add %%ecx,%3 \n"
"popcnt %%edx,%%edx \n"
"add %%edx,%3 \n"
"mov 0x8(%0),%%ecx \n"
"mov 0xc(%0),%%edx \n"
"xor 0x8(%1),%%ecx \n"
"xor 0xc(%1),%%edx \n"
"popcnt %%ecx,%%ecx \n"
"add %%ecx,%3 \n"
"popcnt %%edx,%%edx \n"
"add %%edx,%3 \n"
"add $0x10,%0 \n"
"add $0x10,%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"+r"(diff) // %3
:
: "memory", "cc", "ecx", "edx");
return diff;
}
#endif
static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15};
static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
uint32 HammingDistance_SSSE3(const uint8* src_a,
const uint8* src_b,
int count) {
uint32 diff = 0u;
asm volatile(
"movdqa %4,%%xmm2 \n"
"movdqa %5,%%xmm3 \n"
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n"
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"movdqa (%0),%%xmm4 \n"
"movdqa 0x10(%0), %%xmm5 \n"
"pxor (%0,%1), %%xmm4 \n"
"movdqa %%xmm4,%%xmm6 \n"
"pand %%xmm2,%%xmm6 \n"
"psrlw $0x4,%%xmm4 \n"
"movdqa %%xmm3,%%xmm7 \n"
"pshufb %%xmm6,%%xmm7 \n"
"pand %%xmm2,%%xmm4 \n"
"movdqa %%xmm3,%%xmm6 \n"
"pshufb %%xmm4,%%xmm6 \n"
"paddb %%xmm7,%%xmm6 \n"
"pxor 0x10(%0,%1),%%xmm5 \n"
"add $0x20,%0 \n"
"movdqa %%xmm5,%%xmm4 \n"
"pand %%xmm2,%%xmm5 \n"
"psrlw $0x4,%%xmm4 \n"
"movdqa %%xmm3,%%xmm7 \n"
"pshufb %%xmm5,%%xmm7 \n"
"pand %%xmm2,%%xmm4 \n"
"movdqa %%xmm3,%%xmm5 \n"
"pshufb %%xmm4,%%xmm5 \n"
"paddb %%xmm7,%%xmm5 \n"
"paddb %%xmm5,%%xmm6 \n"
"psadbw %%xmm1,%%xmm6 \n"
"paddd %%xmm6,%%xmm0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"pshufd $0xaa,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"movd %%xmm0, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=r"(diff) // %3
: "m"(kNibbleMask), // %4
"m"(kBitCount) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
return diff;
}
#ifdef HAS_HAMMINGDISTANCE_AVX2
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff = 0u;
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"vmovdqa (%0),%%ymm4 \n"
"vmovdqa 0x20(%0), %%ymm5 \n"
"vpxor (%0,%1), %%ymm4, %%ymm4 \n"
"vpand %%ymm2,%%ymm4,%%ymm6 \n"
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
"vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
"vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
"vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
"add $0x40,%0 \n"
"vpand %%ymm2,%%ymm4,%%ymm5 \n"
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
"vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
"vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
"vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
"vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
"vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
"sub $0x40,%2 \n"
"jg 1b \n"
"vpermq $0xb1,%%ymm0,%%ymm1 \n"
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xaa,%%ymm0,%%ymm1 \n"
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vmovd %%xmm0, %3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=r"(diff) // %3
: "m"(kNibbleMask), // %4
"m"(kBitCount) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
return diff;
}
#endif // HAS_HAMMINGDISTANCE_AVX2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
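The new SSSE3 and AVX2 kernels above use pshufb/vpshufb as a 16-entry table lookup: each XORed byte is split into its low and high nibble, kBitCount supplies the bit count of each nibble, and psadbw/vpsadbw folds the byte counts into wider accumulators. A scalar model of that lookup step (the table values match kBitCount above; the function itself is illustrative):
--
#include <cstdint>

// Bit count of every 4-bit value; same contents as kBitCount above.
static const uint8_t kNibbleBitCount[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                            1, 2, 2, 3, 2, 3, 3, 4};

// Hamming distance via per-nibble table lookups, mirroring the vector code.
uint32_t HammingDistanceNibbleLUT(const uint8_t* a, const uint8_t* b,
                                  int count) {
  uint32_t diff = 0;
  for (int i = 0; i < count; ++i) {
    const uint8_t x = a[i] ^ b[i];
    diff += kNibbleBitCount[x & 0x0f] + kNibbleBitCount[x >> 4];
  }
  return diff;
}
--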

View File

@ -26,67 +26,61 @@ extern "C" {
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"vmov.u16 q4, #0 \n" // accumulator
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
asm volatile(
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}

View File

@ -24,63 +24,57 @@ extern "C" {
// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"movi v4.8h, #0 \n"
asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}

View File

@ -9,34 +9,51 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#if defined(_MSC_VER)
#include <intrin.h> // For __popcnt
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#if (_MSC_VER >= 1900)
__declspec(naked)
#else
__declspec(naked) __declspec(align(16))
#endif
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 HammingDistance_SSE42(const uint8* src_a,
const uint8* src_b,
int count) {
uint32 diff = 0u;
int i;
for (i = 0; i < count - 3; i += 4) {
uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b);
src_a += 4;
src_b += 4;
diff += __popcnt(x);
}
return diff;
}
__declspec(naked) uint32
SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
pxor xmm0, xmm0
pxor xmm5, xmm5
align 4
wloop:
movdqa xmm1, [eax]
movdqu xmm1, [eax]
lea eax, [eax + 16]
movdqa xmm2, [edx]
movdqu xmm2, [edx]
lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
@ -48,6 +65,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
@ -62,27 +80,21 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)
#if (_MSC_VER >= 1900)
__declspec(naked)
#else
__declspec(naked) __declspec(align(16))
#endif
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
#pragma warning(disable : 4752)
__declspec(naked) uint32
SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
align 4
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
@ -92,6 +104,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
@ -107,81 +120,66 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
}
#endif // _MSC_VER >= 1700
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
static uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
static uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
static uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
#if (_MSC_VER >= 1900)
__declspec(naked)
#else
__declspec(naked) __declspec(align(16))
#endif
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__declspec(naked) uint32
HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, xmmword ptr kHash16x33
align 4
wloop:
movdqu xmm1, [eax] // src[0-15]
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
movdqa xmm5, kHashMul0
pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, xmmword ptr kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld(0xdd) // pmulld xmm3, xmm5
movdqa xmm5, kHashMul1
punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, xmmword ptr kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld(0xe5) // pmulld xmm4, xmm5
movdqa xmm5, kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, xmmword ptr kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld(0xd5) // pmulld xmm2, xmm5
movdqa xmm5, kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, xmmword ptr kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
@ -189,59 +187,55 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
movd eax, xmm0 // return hash
ret
}
}
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
#if (_MSC_VER >= 1900)
__declspec(naked)
#else
__declspec(naked) __declspec(align(16))
#endif
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__declspec(naked) uint32
HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
align 4
wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
pmulld xmm3, kHashMul0
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
pmulld xmm4, kHashMul1
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
pmulld xmm2, kHashMul2
vpmovzxbd xmm3, [eax] // src[0-3]
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, [eax + 4] // src[4-7]
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, [eax + 8] // src[8-11]
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, [eax + 12] // src[12-15]
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
lea eax, [eax + 16]
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
vpaddd xmm1, xmm1,xmm2
vpshufd xmm2, xmm1, 0x01
vpaddd xmm1, xmm1, xmm2
vpaddd xmm0, xmm0, xmm1
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
jg wloop
movd eax, xmm0 // return hash
vmovd eax, xmm0 // return hash
vzeroupper
ret
}
}
#endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif

View File

@ -657,6 +657,42 @@ int I420ToRAW(const uint8* src_y,
width, height);
}
// Convert H420 to RGB24.
LIBYUV_API
int H420ToRGB24(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgb24,
int dst_stride_rgb24,
int width,
int height) {
return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_rgb24, dst_stride_rgb24,
&kYuvH709Constants, width, height);
}
// Convert H420 to RAW.
LIBYUV_API
int H420ToRAW(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_raw,
int dst_stride_raw,
int width,
int height) {
return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
src_stride_v, // Swap U and V
src_u, src_stride_u, dst_raw, dst_stride_raw,
&kYvuH709Constants, // Use Yvu matrix
width, height);
}
// Convert I420 to ARGB1555.
LIBYUV_API
int I420ToARGB1555(const uint8* src_y,
@ -1075,8 +1111,8 @@ int I420ToRGB565Dither(const uint8* src_y,
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)),
width); // NOLINT
*(uint32*)(dither4x4 + ((y & 3) << 2)), // NOLINT
width); // NOLINT
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {

View File

@ -124,7 +124,7 @@ void CpuId(int eax, int ecx, int* cpu_info) {
int GetXCR0() {
int xcr0 = 0;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = _xgetbv(0); // VS2010 SP1 required.
xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
@ -242,10 +242,17 @@ static SAFEBUFFERS int GetCpuFlags(void) {
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
}
}
// TODO(fbarchard): Consider moving these to gtest
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info &= ~kCpuHasX86;
@ -274,12 +281,12 @@ static SAFEBUFFERS int GetCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info &= ~kCpuHasFMA3;
}
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
cpu_info &= ~kCpuHasAVX3;
}
if (TestEnv("LIBYUV_DISABLE_F16C")) {
cpu_info &= ~kCpuHasF16C;
}
if (TestEnv("LIBYUV_DISABLE_AVX512BW")) {
cpu_info &= ~kCpuHasAVX512BW;
}
#endif
#if defined(__mips__) && defined(__linux__)

View File

@ -13,10 +13,6 @@
#ifdef HAVE_JPEG
#include <assert.h>
#ifdef __cplusplus
#include <new>
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// Must be included before jpeglib.

View File

@ -24,7 +24,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
const uint8* it = sample;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
it = (const uint8*)(memchr(it, 0xff, end - it));
if (it == NULL) {
break;
}

View File

@ -321,6 +321,14 @@ void SplitUVPlane(const uint8* src_uv,
}
}
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
// Copy a row of UV.
@ -399,6 +407,122 @@ void MergeUVPlane(const uint8* src_u,
}
}
// Support function for NV12 etc RGB channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
void SplitRGBPlane(const uint8* src_rgb,
int src_stride_rgb,
uint8* dst_r,
int dst_stride_r,
uint8* dst_g,
int dst_stride_g,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int y;
void (*SplitRGBRow)(const uint8* src_rgb, uint8* dst_r, uint8* dst_g,
uint8* dst_b, int width) = SplitRGBRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_r = dst_r + (height - 1) * dst_stride_r;
dst_g = dst_g + (height - 1) * dst_stride_g;
dst_b = dst_b + (height - 1) * dst_stride_b;
dst_stride_r = -dst_stride_r;
dst_stride_g = -dst_stride_g;
dst_stride_b = -dst_stride_b;
}
// Coalesce rows.
if (src_stride_rgb == width * 3 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width) {
width *= height;
height = 1;
src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
}
#if defined(HAS_SPLITRGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SplitRGBRow = SplitRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
SplitRGBRow = SplitRGBRow_SSSE3;
}
}
#endif
#if defined(HAS_SPLITRGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitRGBRow = SplitRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitRGBRow = SplitRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Copy a row of RGB.
SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
dst_r += dst_stride_r;
dst_g += dst_stride_g;
dst_b += dst_stride_b;
src_rgb += src_stride_rgb;
}
}
LIBYUV_API
void MergeRGBPlane(const uint8* src_r,
int src_stride_r,
const uint8* src_g,
int src_stride_g,
const uint8* src_b,
int src_stride_b,
uint8* dst_rgb,
int dst_stride_rgb,
int width,
int height) {
int y;
void (*MergeRGBRow)(const uint8* src_r, const uint8* src_g,
const uint8* src_b, uint8* dst_rgb, int width) =
MergeRGBRow_C;
// Coalesce rows.
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
dst_stride_rgb = -dst_stride_rgb;
}
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
dst_stride_rgb == width * 3) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
}
#if defined(HAS_MERGERGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MergeRGBRow = MergeRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MergeRGBRow = MergeRGBRow_SSSE3;
}
}
#endif
#if defined(HAS_MERGERGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeRGBRow = MergeRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MergeRGBRow = MergeRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of RGB.
MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_rgb += dst_stride_rgb;
}
}
// Mirror a plane of data.
void MirrorPlane(const uint8* src_y,
int src_stride_y,
@ -845,6 +969,11 @@ ARGBBlendRow GetARGBBlend() {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBBlendRow = ARGBBlendRow_NEON;
}
#endif
#if defined(HAS_ARGBBLENDROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBBlendRow = ARGBBlendRow_MSA;
}
#endif
return ARGBBlendRow;
}
@ -1574,6 +1703,11 @@ void SetPlane(uint8* dst_y,
SetRow = SetRow_ERMS;
}
#endif
#if defined(HAS_SETROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
SetRow = SetRow_MSA;
}
#endif
// Set plane
for (y = 0; y < height; ++y) {
@ -1973,6 +2107,11 @@ int ARGBColorMatrix(const uint8* src_argb,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
#endif
#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
}
#endif
for (y = 0; y < height; ++y) {
ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
@ -2133,6 +2272,11 @@ int ARGBQuantize(uint8* dst_argb,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBQuantizeRow = ARGBQuantizeRow_NEON;
}
#endif
#if defined(HAS_ARGBQUANTIZEROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBQuantizeRow = ARGBQuantizeRow_MSA;
}
#endif
for (y = 0; y < height; ++y) {
ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
@ -2619,6 +2763,11 @@ static int ARGBSobelize(const uint8* src_argb,
SobelYRow = SobelYRow_NEON;
}
#endif
#if defined(HAS_SOBELYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelYRow = SobelYRow_MSA;
}
#endif
#if defined(HAS_SOBELXROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXRow = SobelXRow_SSE2;
@ -2628,6 +2777,11 @@ static int ARGBSobelize(const uint8* src_argb,
if (TestCpuFlag(kCpuHasNEON)) {
SobelXRow = SobelXRow_NEON;
}
#endif
#if defined(HAS_SOBELXROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SobelXRow = SobelXRow_MSA;
}
#endif
{
// 3 rows with edges before/after.
@ -2903,6 +3057,14 @@ int HalfFloatPlane(const uint16* src_y,
}
}
#endif
#if defined(HAS_HALFFLOATROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
HalfFloatRow = HalfFloatRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
HalfFloatRow = HalfFloatRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
@ -3048,6 +3210,12 @@ int ARGBExtractAlpha(const uint8* src_argb,
: ARGBExtractAlphaRow_Any_NEON;
}
#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
: ARGBExtractAlphaRow_Any_MSA;
}
#endif
for (int y = 0; y < height; ++y) {
ARGBExtractAlphaRow(src_argb, dst_a, width);
@ -3160,6 +3328,14 @@ int YUY2ToNV12(const uint8* src_yuy2,
}
}
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@ -3268,6 +3444,14 @@ int UYVYToNV12(const uint8* src_uyvy,
}
}
#endif
#if defined(HAS_SPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;

View File

@ -361,6 +361,11 @@ void RotateUV180(const uint8* src,
MirrorUVRow = MirrorUVRow_DSPR2;
}
#endif
#if defined(HAS_MIRRORUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
MirrorUVRow = MirrorUVRow_MSA;
}
#endif
dst_a += dst_stride_a * (height - 1);
dst_b += dst_stride_b * (height - 1);

View File

@ -30,14 +30,14 @@ void TransposeWx8_NEON(const uint8* src,
int dst_stride,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w3, %w3, #8 \n"
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
@ -92,109 +92,108 @@ void TransposeWx8_NEON(const uint8* src,
"subs %w3, %w3, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w3, %w3, #8 \n"
"b.eq 4f \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w3, %w3, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w3, #2 \n"
"b.lt 3f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w3, #2 \n"
"b.lt 3f \n"
"cmp %w3, #4 \n"
"b.lt 2f \n"
"cmp %w3, #4 \n"
"b.lt 2f \n"
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.s}[0], [%0], %5 \n"
"ld1 {v0.s}[1], [%0], %5 \n"
"ld1 {v0.s}[2], [%0], %5 \n"
"ld1 {v0.s}[3], [%0], %5 \n"
"ld1 {v1.s}[0], [%0], %5 \n"
"ld1 {v1.s}[1], [%0], %5 \n"
"ld1 {v1.s}[2], [%0], %5 \n"
"ld1 {v1.s}[3], [%0] \n"
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.s}[0], [%0], %5 \n"
"ld1 {v0.s}[1], [%0], %5 \n"
"ld1 {v0.s}[2], [%0], %5 \n"
"ld1 {v0.s}[3], [%0], %5 \n"
"ld1 {v1.s}[0], [%0], %5 \n"
"ld1 {v1.s}[1], [%0], %5 \n"
"ld1 {v1.s}[2], [%0], %5 \n"
"ld1 {v1.s}[3], [%0] \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"ld1 {v2.16b}, [%4] \n"
"ld1 {v2.16b}, [%4] \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
"tbl v0.16b, {v1.16b}, v2.16b \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
"tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
"st1 {v3.s}[0], [%0], %6 \n"
"st1 {v3.s}[1], [%0], %6 \n"
"st1 {v3.s}[2], [%0], %6 \n"
"st1 {v3.s}[3], [%0] \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
"st1 {v3.s}[0], [%0], %6 \n"
"st1 {v3.s}[1], [%0], %6 \n"
"st1 {v3.s}[2], [%0], %6 \n"
"st1 {v3.s}[3], [%0] \n"
"add %0, %2, #4 \n"
"st1 {v0.s}[0], [%0], %6 \n"
"st1 {v0.s}[1], [%0], %6 \n"
"st1 {v0.s}[2], [%0], %6 \n"
"st1 {v0.s}[3], [%0] \n"
"add %0, %2, #4 \n"
"st1 {v0.s}[0], [%0], %6 \n"
"st1 {v0.s}[1], [%0], %6 \n"
"st1 {v0.s}[2], [%0], %6 \n"
"st1 {v0.s}[3], [%0] \n"
"add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %w3, %w3, #4 \n" // w -= 4
"b.eq 4f \n"
"add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %w3, %w3, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %w3, #2 \n"
"b.lt 3f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %w3, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"ld1 {v0.h}[0], [%0], %5 \n"
"ld1 {v1.h}[0], [%0], %5 \n"
"ld1 {v0.h}[1], [%0], %5 \n"
"ld1 {v1.h}[1], [%0], %5 \n"
"ld1 {v0.h}[2], [%0], %5 \n"
"ld1 {v1.h}[2], [%0], %5 \n"
"ld1 {v0.h}[3], [%0], %5 \n"
"ld1 {v1.h}[3], [%0] \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"ld1 {v0.h}[0], [%0], %5 \n"
"ld1 {v1.h}[0], [%0], %5 \n"
"ld1 {v0.h}[1], [%0], %5 \n"
"ld1 {v1.h}[1], [%0], %5 \n"
"ld1 {v0.h}[2], [%0], %5 \n"
"ld1 {v1.h}[2], [%0], %5 \n"
"ld1 {v0.h}[3], [%0], %5 \n"
"ld1 {v1.h}[3], [%0] \n"
"trn2 v2.8b, v0.8b, v1.8b \n"
"trn1 v3.8b, v0.8b, v1.8b \n"
"trn2 v2.8b, v0.8b, v1.8b \n"
"trn1 v3.8b, v0.8b, v1.8b \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"st1 {v3.8b}, [%0], %6 \n"
"st1 {v2.8b}, [%0] \n"
"st1 {v3.8b}, [%0], %6 \n"
"st1 {v2.8b}, [%0] \n"
"add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %w3, %w3, #2 \n" // w -= 2
"b.eq 4f \n"
"add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %w3, %w3, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
"ld1 {v0.b}[0], [%1], %5 \n"
"ld1 {v0.b}[1], [%1], %5 \n"
"ld1 {v0.b}[2], [%1], %5 \n"
"ld1 {v0.b}[3], [%1], %5 \n"
"ld1 {v0.b}[4], [%1], %5 \n"
"ld1 {v0.b}[5], [%1], %5 \n"
"ld1 {v0.b}[6], [%1], %5 \n"
"ld1 {v0.b}[7], [%1] \n"
// 1x8 block
"3: \n"
"ld1 {v0.b}[0], [%1], %5 \n"
"ld1 {v0.b}[1], [%1], %5 \n"
"ld1 {v0.b}[2], [%1], %5 \n"
"ld1 {v0.b}[3], [%1], %5 \n"
"ld1 {v0.b}[4], [%1], %5 \n"
"ld1 {v0.b}[5], [%1], %5 \n"
"ld1 {v0.b}[6], [%1], %5 \n"
"ld1 {v0.b}[7], [%1] \n"
"st1 {v0.8b}, [%2] \n"
"st1 {v0.8b}, [%2] \n"
"4: \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
static uint8 kVTbl4x4TransposeDi[32] = {
@ -209,212 +208,215 @@ void TransposeUVWx8_NEON(const uint8* src,
int dst_stride_b,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w4, %w4, #8 \n"
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
"ld1 {v0.16b}, [%0], %5 \n"
"ld1 {v1.16b}, [%0], %5 \n"
"ld1 {v2.16b}, [%0], %5 \n"
"ld1 {v3.16b}, [%0], %5 \n"
"ld1 {v4.16b}, [%0], %5 \n"
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
"ld1 {v0.16b}, [%0], %5 \n"
"ld1 {v1.16b}, [%0], %5 \n"
"ld1 {v2.16b}, [%0], %5 \n"
"ld1 {v3.16b}, [%0], %5 \n"
"ld1 {v4.16b}, [%0], %5 \n"
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"st1 {v16.d}[0], [%0], %6 \n"
"st1 {v18.d}[0], [%0], %6 \n"
"st1 {v17.d}[0], [%0], %6 \n"
"st1 {v19.d}[0], [%0], %6 \n"
"st1 {v16.d}[1], [%0], %6 \n"
"st1 {v18.d}[1], [%0], %6 \n"
"st1 {v17.d}[1], [%0], %6 \n"
"st1 {v19.d}[1], [%0] \n"
"st1 {v16.d}[0], [%0], %6 \n"
"st1 {v18.d}[0], [%0], %6 \n"
"st1 {v17.d}[0], [%0], %6 \n"
"st1 {v19.d}[0], [%0], %6 \n"
"st1 {v16.d}[1], [%0], %6 \n"
"st1 {v18.d}[1], [%0], %6 \n"
"st1 {v17.d}[1], [%0], %6 \n"
"st1 {v19.d}[1], [%0] \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
"st1 {v20.d}[0], [%0], %7 \n"
"st1 {v22.d}[0], [%0], %7 \n"
"st1 {v21.d}[0], [%0], %7 \n"
"st1 {v23.d}[0], [%0], %7 \n"
"st1 {v20.d}[1], [%0], %7 \n"
"st1 {v22.d}[1], [%0], %7 \n"
"st1 {v21.d}[1], [%0], %7 \n"
"st1 {v23.d}[1], [%0] \n"
"st1 {v20.d}[0], [%0], %7 \n"
"st1 {v22.d}[0], [%0], %7 \n"
"st1 {v21.d}[0], [%0], %7 \n"
"st1 {v23.d}[0], [%0], %7 \n"
"st1 {v20.d}[1], [%0], %7 \n"
"st1 {v22.d}[1], [%0], %7 \n"
"st1 {v21.d}[1], [%0], %7 \n"
"st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %w4, %w4, #8 \n" // w -= 8
"b.ge 1b \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
"subs %w4, %w4, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w4, %w4, #8 \n"
"b.eq 4f \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w4, %w4, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w4, #2 \n"
"b.lt 3f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w4, #2 \n"
"b.lt 3f \n"
"cmp %w4, #4 \n"
"b.lt 2f \n"
"cmp %w4, #4 \n"
"b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
"ld1 {v1.8b}, [%0], %5 \n"
"ld1 {v2.8b}, [%0], %5 \n"
"ld1 {v3.8b}, [%0], %5 \n"
"ld1 {v4.8b}, [%0], %5 \n"
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
"ld1 {v1.8b}, [%0], %5 \n"
"ld1 {v2.8b}, [%0], %5 \n"
"ld1 {v3.8b}, [%0], %5 \n"
"ld1 {v4.8b}, [%0], %5 \n"
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
"ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n"
"ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n"
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"st1 {v16.s}[0], [%0], %6 \n"
"st1 {v16.s}[1], [%0], %6 \n"
"st1 {v16.s}[2], [%0], %6 \n"
"st1 {v16.s}[3], [%0], %6 \n"
"st1 {v16.s}[0], [%0], %6 \n"
"st1 {v16.s}[1], [%0], %6 \n"
"st1 {v16.s}[2], [%0], %6 \n"
"st1 {v16.s}[3], [%0], %6 \n"
"add %0, %2, #4 \n"
"st1 {v18.s}[0], [%0], %6 \n"
"st1 {v18.s}[1], [%0], %6 \n"
"st1 {v18.s}[2], [%0], %6 \n"
"st1 {v18.s}[3], [%0] \n"
"add %0, %2, #4 \n"
"st1 {v18.s}[0], [%0], %6 \n"
"st1 {v18.s}[1], [%0], %6 \n"
"st1 {v18.s}[2], [%0], %6 \n"
"st1 {v18.s}[3], [%0] \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
"st1 {v17.s}[0], [%0], %7 \n"
"st1 {v17.s}[1], [%0], %7 \n"
"st1 {v17.s}[2], [%0], %7 \n"
"st1 {v17.s}[3], [%0], %7 \n"
"st1 {v17.s}[0], [%0], %7 \n"
"st1 {v17.s}[1], [%0], %7 \n"
"st1 {v17.s}[2], [%0], %7 \n"
"st1 {v17.s}[3], [%0], %7 \n"
"add %0, %3, #4 \n"
"st1 {v19.s}[0], [%0], %7 \n"
"st1 {v19.s}[1], [%0], %7 \n"
"st1 {v19.s}[2], [%0], %7 \n"
"st1 {v19.s}[3], [%0] \n"
"add %0, %3, #4 \n"
"st1 {v19.s}[0], [%0], %7 \n"
"st1 {v19.s}[1], [%0], %7 \n"
"st1 {v19.s}[2], [%0], %7 \n"
"st1 {v19.s}[3], [%0] \n"
"add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %w4, %w4, #4 \n" // w -= 4
"b.eq 4f \n"
"add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 *
// dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 *
// dst_stride_b
"subs %w4, %w4, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %w4, #2 \n"
"b.lt 3f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %w4, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
"ld2 {v2.h, v3.h}[3], [%0] \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
"ld2 {v2.h, v3.h}[3], [%0] \n"
"trn1 v4.8b, v0.8b, v2.8b \n"
"trn2 v5.8b, v0.8b, v2.8b \n"
"trn1 v6.8b, v1.8b, v3.8b \n"
"trn2 v7.8b, v1.8b, v3.8b \n"
"trn1 v4.8b, v0.8b, v2.8b \n"
"trn2 v5.8b, v0.8b, v2.8b \n"
"trn1 v6.8b, v1.8b, v3.8b \n"
"trn2 v7.8b, v1.8b, v3.8b \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"st1 {v4.d}[0], [%0], %6 \n"
"st1 {v6.d}[0], [%0] \n"
"st1 {v4.d}[0], [%0], %6 \n"
"st1 {v6.d}[0], [%0] \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
"st1 {v5.d}[0], [%0], %7 \n"
"st1 {v7.d}[0], [%0] \n"
"st1 {v5.d}[0], [%0], %7 \n"
"st1 {v7.d}[0], [%0] \n"
"add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %w4, %w4, #2 \n" // w -= 2
"b.eq 4f \n"
"add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 *
// dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 *
// dst_stride_b
"subs %w4, %w4, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
"ld2 {v0.b, v1.b}[7], [%1] \n"
// 1x8 block
"3: \n"
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
"ld2 {v0.b, v1.b}[7], [%1] \n"
"st1 {v0.d}[0], [%2] \n"
"st1 {v1.d}[0], [%3] \n"
"st1 {v0.d}[0], [%2] \n"
"st1 {v1.d}[0], [%3] \n"
"4: \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v30", "v31"
);
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

View File

@ -17,7 +17,7 @@ extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
int src_stride,
@ -172,7 +172,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
@ -192,8 +192,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4

View File

@ -84,6 +84,14 @@ ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
// Merge functions.
#ifdef HAS_MERGERGBROW_SSSE3
ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
#endif
#ifdef HAS_MERGERGBROW_NEON
ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@ -621,6 +629,9 @@ ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32)
#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
#endif
#undef ANY11
// Any 1 to 1 blended. Destination is read, modify, write.
@ -746,6 +757,9 @@ ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
#endif
#ifdef HAS_HALFFLOATROW_MSA
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31)
#endif
#undef ANY11P16
// Any 1 to 1 with yuvconstants
@ -911,6 +925,9 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#ifdef HAS_SPLITUVROW_DSPR2
ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
#endif
#ifdef HAS_SPLITUVROW_MSA
ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
#endif
@ -934,6 +951,31 @@ ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
#undef ANY12
// Any 1 to 3. Outputs RGB planes.
#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_r, uint8* dst_g, uint8* dst_b, \
int width) { \
SIMD_ALIGNED(uint8 temp[16 * 6]); \
memset(temp, 0, 16 * 3); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
} \
memcpy(temp, src_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
memcpy(dst_r + n, temp + 16 * 3, r); \
memcpy(dst_g + n, temp + 16 * 4, r); \
memcpy(dst_b + n, temp + 16 * 5, r); \
}
#ifdef HAS_SPLITRGBROW_SSSE3
ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
#endif
#ifdef HAS_SPLITRGBROW_NEON
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#endif
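// Not part of the upstream patch: a minimal sketch of how these Any wrappers
// are selected at run time, mirroring the SplitUVRow/ARGBExtractAlphaRow
// selection shown earlier in this patch. The helper name below is an
// assumption for illustration only.
static void SplitRGBPlaneSketch(const uint8* src_rgb, uint8* dst_r,
                                uint8* dst_g, uint8* dst_b, int width) {
  void (*SplitRGBRow)(const uint8* src_rgb, uint8* dst_r, uint8* dst_g,
                      uint8* dst_b, int width) = SplitRGBRow_C;
#if defined(HAS_SPLITRGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    SplitRGBRow = SplitRGBRow_Any_NEON;  // any width; residual via temp buffer
    if (IS_ALIGNED(width, 16)) {
      SplitRGBRow = SplitRGBRow_NEON;  // fast path when width is a multiple of 16
    }
  }
#endif
  SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
}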
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \

View File

@ -1770,6 +1770,63 @@ void MergeUVRow_C(const uint8* src_u,
}
}
void SplitRGBRow_C(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_r[x] = src_rgb[0];
dst_g[x] = src_rgb[1];
dst_b[x] = src_rgb[2];
src_rgb += 3;
}
}
void MergeRGBRow_C(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_rgb[0] = src_r[x];
dst_rgb[1] = src_g[x];
dst_rgb[2] = src_b[x];
dst_rgb += 3;
}
}
void MergeUVRow_16_C(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int scale,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
dst_uv[0] = src_u[x] * scale;
dst_uv[1] = src_v[x] * scale;
dst_uv[2] = src_u[x + 1] * scale;
dst_uv[3] = src_v[x + 1] * scale;
dst_uv += 4;
}
if (width & 1) {
dst_uv[0] = src_u[width - 1] * scale;
dst_uv[1] = src_v[width - 1] * scale;
}
}
void MultiplyRow_16_C(const uint16* src_y,
uint16* dst_y,
int scale,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_y[x] = src_y[x] * scale;
}
}
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
@ -2639,6 +2696,62 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
}
#endif
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
#if defined(__clang__)
#pragma clang loop vectorize_width(4)
#endif
for (i = 0; i < width; ++i) {
float v = *src++;
fsum += v * v;
*dst++ = v * scale;
}
return fsum;
}
float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
float fmax = 0.f;
int i;
for (i = 0; i < width; ++i) {
float v = *src++;
float vs = v * scale;
fmax = (v > fmax) ? v : fmax;
*dst++ = vs;
}
return fmax;
}
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ = *src++ * scale;
}
}
void GaussRow_C(const uint32* src, uint16* dst, int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ =
(src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
++src;
}
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_C(const uint16* src0,
const uint16* src1,
const uint16* src2,
const uint16* src3,
const uint16* src4,
uint32* dst,
int width) {
int i;
for (i = 0; i < width; ++i) {
*dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
}
}
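// Not part of the upstream patch: the two helpers above form a separable 5x5
// Gaussian with total weight 16 * 16 = 256. GaussCol_C does the vertical
// 1-4-6-4-1 pass into 32-bit accumulators; GaussRow_C does the horizontal
// pass with +128 rounding and a >>8 normalize. A minimal sketch of chaining
// them for one output row (helper name and buffer handling are assumptions;
// each source row must supply width + 4 samples for the horizontal taps):
static void GaussOneRowSketch(const uint16* row0, const uint16* row1,
                              const uint16* row2, const uint16* row3,
                              const uint16* row4, uint32* tmp, uint16* dst,
                              int width) {
  GaussCol_C(row0, row1, row2, row3, row4, tmp, width + 4);  // vertical pass
  GaussRow_C(tmp, dst, width);  // horizontal pass with rounding
}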
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -38,9 +38,8 @@ static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
127, -84, -43, 0, 127, -84, -43, 0};
static vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
static vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
-18, -94, 112, 0, -18, -94, 112, 0};
static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
-20, -107, 127, 0, -20, -107, 127, 0};
@ -2754,6 +2753,280 @@ void MergeUVRow_SSE2(const uint8* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
// Use scale to convert lsb formats to msb, depending on how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
int scale,
int width) {
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu (%0,%1,1),%%ymm1 \n"
"add $0x20,%0 \n"
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
"add $0x40,%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"(scale) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
// clang-format on
}
#endif // HAS_MERGEUVROW_16_AVX2
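// Not part of the upstream patch: a hedged sketch of deriving the scale
// factor for a given source bit depth (helper name and the 10-bit example
// are illustrative assumptions).
static int ScaleFromBitsSketch(int bits) {
  return 1 << (16 - bits);  // 9 -> 128, 10 -> 64, 12 -> 16, 16 -> 1
}
// Example: merge one row of 10-bit U/V samples into msb-aligned 16-bit UV:
//   MergeUVRow_16_C(src_u, src_v, dst_uv, ScaleFromBitsSketch(10), width);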
#ifdef HAS_MULTIPLYROW_16_AVX2
void MultiplyRow_16_AVX2(const uint16* src_y,
uint16* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"vmovd %3,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
// 32 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%0,%1) \n"
"vmovdqu %%ymm1,0x20(%0,%1) \n"
"add $0x40,%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm3");
// clang-format on
}
#endif // HAS_MULTIPLYROW_16_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
static uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u};
static uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
3u, 6u, 9u, 12u, 15u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 2u,
5u, 8u, 11u, 14u};
static uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 0u, 3u,
6u, 9u, 12u, 15u};
void SplitRGBRow_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile (
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(3) " \n"
"lea " MEMLEA(0x10,3) ",%3 \n"
"lea " MEMLEA(0x30,0) ",%0 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRGBToR0), // %5
"m"(kShuffleMaskRGBToR1), // %6
"m"(kShuffleMaskRGBToR2), // %7
"m"(kShuffleMaskRGBToG0), // %8
"m"(kShuffleMaskRGBToG1), // %9
"m"(kShuffleMaskRGBToG2), // %10
"m"(kShuffleMaskRGBToB0), // %11
"m"(kShuffleMaskRGBToB1), // %12
"m"(kShuffleMaskRGBToB2) // %13
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_SPLITRGBROW_SSSE3
#ifdef HAS_MERGERGBROW_SSSE3
// Shuffle table for converting Planar to RGB.
static uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
2u, 128u, 128u, 3u, 128u, 128u,
4u, 128u, 128u, 5u};
static uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
128u, 2u, 128u, 128u, 3u, 128u,
128u, 4u, 128u, 128u};
static uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
128u, 128u, 2u, 128u, 128u, 3u,
128u, 128u, 4u, 128u};
static uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
7u, 128u, 128u, 8u, 128u, 128u,
9u, 128u, 128u, 10u};
static uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
128u, 7u, 128u, 128u, 8u, 128u,
128u, 9u, 128u, 128u};
static uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
128u, 128u, 8u, 128u, 128u, 9u,
128u, 128u, 10u, 128u};
static uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
12u, 128u, 128u, 13u, 128u, 128u,
14u, 128u, 128u, 15u};
static uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
128u, 13u, 128u, 128u, 14u, 128u,
128u, 15u, 128u, 128u};
static uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
128u, 128u, 13u, 128u, 128u, 14u,
128u, 128u, 15u, 128u};
void MergeRGBRow_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile (
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(3) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS2(16, 3) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS2(32, 3) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"lea " MEMLEA(0x30,3) ",%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRToRGB0), // %5
"m"(kShuffleMaskGToRGB0), // %6
"m"(kShuffleMaskBToRGB0), // %7
"m"(kShuffleMaskRToRGB1), // %8
"m"(kShuffleMaskGToRGB1), // %9
"m"(kShuffleMaskBToRGB1), // %10
"m"(kShuffleMaskRToRGB2), // %11
"m"(kShuffleMaskGToRGB2), // %12
"m"(kShuffleMaskBToRGB2) // %13
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_MERGERGBROW_SSSE3
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile (
@ -5453,6 +5726,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
#ifdef HAS_HALFFLOATROW_SSE2
static float kScaleBias = 1.9259299444e-34f;
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
scale *= kScaleBias;
asm volatile (
"pshufd $0x0,%3,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -5479,7 +5753,11 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "x"(scale * kScaleBias) // %3
#if defined(__x86_64__)
: "x"(scale) // %3
#else
: "m"(scale) // %3
#endif
: "memory", "cc",
"xmm2", "xmm3", "xmm4", "xmm5"
);
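// Not part of the upstream patch: kScaleBias is 2^-112. Multiplying the
// integer sample (converted to float) by scale * 2^-112 rebiases the exponent
// from the float bias (127) to the half-float bias (15), so shifting the
// float's bit pattern right by 13 (as the MSA version later in this patch
// does explicitly) yields the IEEE half. A scalar sketch of the same trick
// (helper name is an assumption; memcpy needs <string.h>):
static uint16 HalfFromUint16Sketch(uint16 src, float scale) {
  float f = (float)src * (scale * 1.9259299444e-34f);  // 2^-112 rebias
  uint32 bits;
  memcpy(&bits, &f, sizeof(bits));  // reinterpret the float bits without UB
  return (uint16)(bits >> 13);  // keep 5-bit exponent + top 10 mantissa bits
}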
@ -5488,6 +5766,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
scale *= kScaleBias;
asm volatile (
"vbroadcastss %3, %%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
@ -5515,7 +5794,11 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "x"(scale * kScaleBias) // %3
#if defined(__x86_64__)
: "x"(scale) // %3
#else
: "m"(scale) // %3
#endif
: "memory", "cc",
"xmm2", "xmm3", "xmm4", "xmm5"
);
@ -5548,7 +5831,11 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
#if defined(__x86_64__)
: "x"(scale) // %3
#else
: "m"(scale) // %3
#endif
: "memory", "cc",
"xmm2", "xmm3", "xmm4"
);

View File

@ -2917,7 +2917,7 @@ void InterpolateRow_MSA(uint8* dst_ptr,
void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
int x;
v16u8 dst0 = (v16u8)__msa_fill_w(v32);
v4i32 dst0 = __builtin_msa_fill_w(v32);
for (x = 0; x < width; x += 4) {
ST_UB(dst0, dst_argb);
@ -2969,6 +2969,524 @@ void MergeUVRow_MSA(const uint8* src_u,
}
}
void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width) {
int i;
v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
for (i = 0; i < width; i += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_a);
src_argb += 64;
dst_a += 16;
}
}
void ARGBBlendRow_MSA(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
v8u16 const_256 = (v8u16)__msa_ldi_h(256);
v16u8 const_255 = (v16u8)__msa_ldi_b(255);
v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3);
vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3);
vec8 = (v8u16)__msa_fill_h(vec0[3]);
vec9 = (v8u16)__msa_fill_h(vec0[7]);
vec10 = (v8u16)__msa_fill_h(vec1[3]);
vec11 = (v8u16)__msa_fill_h(vec1[7]);
vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
vec10 = (v8u16)__msa_fill_h(vec2[3]);
vec11 = (v8u16)__msa_fill_h(vec2[7]);
vec12 = (v8u16)__msa_fill_h(vec3[3]);
vec13 = (v8u16)__msa_fill_h(vec3[7]);
vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12);
vec8 = const_256 - vec8;
vec9 = const_256 - vec9;
vec10 = const_256 - vec10;
vec11 = const_256 - vec11;
vec8 *= vec4;
vec9 *= vec5;
vec10 *= vec6;
vec11 *= vec7;
vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8);
vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
vec0 += vec8;
vec1 += vec9;
vec2 += vec10;
vec3 += vec11;
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
dst0 = __msa_bmnz_v(dst0, const_255, mask);
dst1 = __msa_bmnz_v(dst1, const_255, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
src_argb0 += 32;
src_argb1 += 32;
dst_argb += 32;
}
}
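// Not part of the upstream patch: scalar sketch of the blend computed above.
// Each channel is dst = src0 + ((256 - src0_alpha) * src1 >> 8), and the
// destination alpha is forced to 255 by the bmnz with the mask. Helper name
// is an assumption.
static uint8 BlendChannelSketch(uint8 fg, uint8 bg, uint8 fg_alpha) {
  return (uint8)(fg + (((256 - fg_alpha) * bg) >> 8));
}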
void ARGBQuantizeRow_MSA(uint8* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
v4i32 vec_scale = __msa_fill_w(scale);
v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size);
v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset);
v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48);
vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0);
vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0);
vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3);
vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3);
tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2);
tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2);
tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3);
tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3);
tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4);
tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4);
tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5);
tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5);
tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6);
tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6);
tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7);
tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7);
tmp0 *= vec_scale;
tmp1 *= vec_scale;
tmp2 *= vec_scale;
tmp3 *= vec_scale;
tmp4 *= vec_scale;
tmp5 *= vec_scale;
tmp6 *= vec_scale;
tmp7 *= vec_scale;
tmp8 *= vec_scale;
tmp9 *= vec_scale;
tmp10 *= vec_scale;
tmp11 *= vec_scale;
tmp12 *= vec_scale;
tmp13 *= vec_scale;
tmp14 *= vec_scale;
tmp15 *= vec_scale;
tmp0 >>= 16;
tmp1 >>= 16;
tmp2 >>= 16;
tmp3 >>= 16;
tmp4 >>= 16;
tmp5 >>= 16;
tmp6 >>= 16;
tmp7 >>= 16;
tmp8 >>= 16;
tmp9 >>= 16;
tmp10 >>= 16;
tmp11 >>= 16;
tmp12 >>= 16;
tmp13 >>= 16;
tmp14 >>= 16;
tmp15 >>= 16;
vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
dst0 *= vec_int_sz;
dst1 *= vec_int_sz;
dst2 *= vec_int_sz;
dst3 *= vec_int_sz;
dst0 += vec_int_ofst;
dst1 += vec_int_ofst;
dst2 += vec_int_ofst;
dst3 += vec_int_ofst;
dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0);
dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1);
dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2);
dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
dst_argb += 64;
}
}
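// Not part of the upstream patch: scalar sketch of the per-channel quantize
// step performed above; alpha (byte 3 of each ARGB pixel) is taken unchanged
// from the source via the final vshf. Helper name is an assumption.
static void ARGBQuantizeRowSketch(uint8* dst_argb, int scale,
                                  int interval_size, int interval_offset,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = (dst_argb[0] * scale >> 16) * interval_size + interval_offset;
    dst_argb[1] = (dst_argb[1] * scale >> 16) * interval_size + interval_offset;
    dst_argb[2] = (dst_argb[2] * scale >> 16) * interval_size + interval_offset;
    dst_argb += 4;  // leave alpha untouched
  }
}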
void ARGBColorMatrixRow_MSA(const uint8* src_argb,
uint8* dst_argb,
const int8* matrix_argb,
int width) {
int32 x;
v16i8 src0;
v16u8 src1, src2, dst0, dst1;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
v16i8 zero = {0};
v8i16 max = __msa_ldi_h(255);
src0 = __msa_ld_b((v16i8*)matrix_argb, 0);
vec0 = (v8i16)__msa_ilvr_b(zero, src0);
vec1 = (v8i16)__msa_ilvl_b(zero, src0);
for (x = 0; x < width; x += 8) {
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2);
vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3);
vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4);
vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5);
vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2);
vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3);
vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4);
vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5);
vec10 = vec2 * vec0;
vec11 = vec2 * vec1;
vec12 = vec6 * vec0;
vec13 = vec6 * vec1;
tmp0 = __msa_hadd_s_w(vec10, vec10);
tmp1 = __msa_hadd_s_w(vec11, vec11);
tmp2 = __msa_hadd_s_w(vec12, vec12);
tmp3 = __msa_hadd_s_w(vec13, vec13);
vec14 = vec3 * vec0;
vec15 = vec3 * vec1;
vec16 = vec7 * vec0;
vec17 = vec7 * vec1;
tmp4 = __msa_hadd_s_w(vec14, vec14);
tmp5 = __msa_hadd_s_w(vec15, vec15);
tmp6 = __msa_hadd_s_w(vec16, vec16);
tmp7 = __msa_hadd_s_w(vec17, vec17);
vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
tmp0 = __msa_hadd_s_w(vec10, vec10);
tmp1 = __msa_hadd_s_w(vec11, vec11);
tmp2 = __msa_hadd_s_w(vec12, vec12);
tmp3 = __msa_hadd_s_w(vec13, vec13);
tmp0 = __msa_srai_w(tmp0, 6);
tmp1 = __msa_srai_w(tmp1, 6);
tmp2 = __msa_srai_w(tmp2, 6);
tmp3 = __msa_srai_w(tmp3, 6);
vec2 = vec4 * vec0;
vec6 = vec4 * vec1;
vec3 = vec8 * vec0;
vec7 = vec8 * vec1;
tmp8 = __msa_hadd_s_w(vec2, vec2);
tmp9 = __msa_hadd_s_w(vec6, vec6);
tmp10 = __msa_hadd_s_w(vec3, vec3);
tmp11 = __msa_hadd_s_w(vec7, vec7);
vec4 = vec5 * vec0;
vec8 = vec5 * vec1;
vec5 = vec9 * vec0;
vec9 = vec9 * vec1;
tmp12 = __msa_hadd_s_w(vec4, vec4);
tmp13 = __msa_hadd_s_w(vec8, vec8);
tmp14 = __msa_hadd_s_w(vec5, vec5);
tmp15 = __msa_hadd_s_w(vec9, vec9);
vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
tmp4 = __msa_hadd_s_w(vec14, vec14);
tmp5 = __msa_hadd_s_w(vec15, vec15);
tmp6 = __msa_hadd_s_w(vec16, vec16);
tmp7 = __msa_hadd_s_w(vec17, vec17);
tmp4 = __msa_srai_w(tmp4, 6);
tmp5 = __msa_srai_w(tmp5, 6);
tmp6 = __msa_srai_w(tmp6, 6);
tmp7 = __msa_srai_w(tmp7, 6);
vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
vec10 = __msa_maxi_s_h(vec10, 0);
vec11 = __msa_maxi_s_h(vec11, 0);
vec12 = __msa_maxi_s_h(vec12, 0);
vec13 = __msa_maxi_s_h(vec13, 0);
vec10 = __msa_min_s_h(vec10, max);
vec11 = __msa_min_s_h(vec11, max);
vec12 = __msa_min_s_h(vec12, max);
vec13 = __msa_min_s_h(vec13, max);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12);
ST_UB2(dst0, dst1, dst_argb, 16);
src_argb += 32;
dst_argb += 32;
}
}
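// Not part of the upstream patch: scalar sketch of the transform above.
// matrix_argb holds 4 signed 6-bit fixed-point coefficients per output
// channel (16 int8 total); each output channel is the clamped dot product of
// the BGRA pixel with its row of coefficients, shifted right by 6. Helper
// name is an assumption.
static uint8 ColorMatrixChannelSketch(const uint8* bgra, const int8* coeff) {
  int v = (bgra[0] * coeff[0] + bgra[1] * coeff[1] + bgra[2] * coeff[2] +
           bgra[3] * coeff[3]) >> 6;
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}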
void SplitUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
for (x = 0; x < width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst_u, 16);
ST_UB2(dst2, dst3, dst_v, 16);
src_uv += 64;
dst_u += 32;
dst_v += 32;
}
}
void SetRow_MSA(uint8* dst, uint8 v8, int width) {
int x;
v16u8 dst0 = (v16u8)__msa_fill_b(v8);
for (x = 0; x < width; x += 16) {
ST_UB(dst0, dst);
dst += 16;
}
}
void MirrorUVRow_MSA(const uint8* src_uv,
uint8* dst_u,
uint8* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
src_uv += (2 * width);
for (x = 0; x < width; x += 32) {
src_uv -= 64;
src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst_v, 16);
ST_UB2(dst2, dst3, dst_u, 16);
dst_u += 32;
dst_v += 32;
}
}
void SobelXRow_MSA(const uint8* src_y0,
const uint8* src_y1,
const uint8* src_y2,
uint8* dst_sobelx,
int32 width) {
int x;
v16u8 src0, src1, src2, src3, src4, src5, dst0;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
v16i8 tmp = __msa_ldi_b(8);
v16i8 mask1 = mask0 + tmp;
v8i16 zero = {0};
v8i16 max = __msa_ldi_h(255);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_y0, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)src_y1, 16);
src4 = (v16u8)__msa_ld_b((v16i8*)src_y2, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)src_y2, 16);
vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
vec0 += vec2;
vec1 += vec3;
vec4 += vec2;
vec5 += vec3;
vec0 += vec4;
vec1 += vec5;
vec0 = __msa_add_a_h(zero, vec0);
vec1 = __msa_add_a_h(zero, vec1);
vec0 = __msa_maxi_s_h(vec0, 0);
vec1 = __msa_maxi_s_h(vec1, 0);
vec0 = __msa_min_s_h(max, vec0);
vec1 = __msa_min_s_h(max, vec1);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_sobelx);
src_y0 += 16;
src_y1 += 16;
src_y2 += 16;
dst_sobelx += 16;
}
}
void SobelYRow_MSA(const uint8* src_y0,
const uint8* src_y1,
uint8* dst_sobely,
int32 width) {
int x;
v16u8 src0, src1, dst0;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
v8i16 zero = {0};
v8i16 max = __msa_ldi_h(255);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
vec0 -= vec2;
vec1 -= vec3;
vec6[0] = src_y0[16] - src_y1[16];
vec6[1] = src_y0[17] - src_y1[17];
vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
vec0 += vec2;
vec1 += vec3;
vec4 += vec2;
vec5 += vec3;
vec0 += vec4;
vec1 += vec5;
vec0 = __msa_add_a_h(zero, vec0);
vec1 = __msa_add_a_h(zero, vec1);
vec0 = __msa_maxi_s_h(vec0, 0);
vec1 = __msa_maxi_s_h(vec1, 0);
vec0 = __msa_min_s_h(max, vec0);
vec1 = __msa_min_s_h(max, vec1);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_sobely);
src_y0 += 16;
src_y1 += 16;
dst_sobely += 16;
}
}
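// Not part of the upstream patch: scalar sketch of the gradients the two
// kernels above compute per output pixel. Helper names are assumptions and
// abs() needs <stdlib.h>.
static uint8 SobelXPixelSketch(const uint8* y0, const uint8* y1,
                               const uint8* y2, int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  s = abs(s);
  return (uint8)(s > 255 ? 255 : s);
}
static uint8 SobelYPixelSketch(const uint8* y0, const uint8* y1, int i) {
  int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) + (y0[i + 2] - y1[i + 2]);
  s = abs(s);
  return (uint8)(s > 255 ? 255 : s);
}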
void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width) {
int i;
v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
v4f32 mult_vec;
v8i16 zero = {0};
mult_vec[0] = 1.9259299444e-34f * scale;
mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);
for (i = 0; i < width; i += 32) {
src0 = (v8u16)__msa_ld_h((v8i16*)src, 0);
src1 = (v8u16)__msa_ld_h((v8i16*)src, 16);
src2 = (v8u16)__msa_ld_h((v8i16*)src, 32);
src3 = (v8u16)__msa_ld_h((v8i16*)src, 48);
vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
fvec0 = __msa_ffint_u_w(vec0);
fvec1 = __msa_ffint_u_w(vec1);
fvec2 = __msa_ffint_u_w(vec2);
fvec3 = __msa_ffint_u_w(vec3);
fvec4 = __msa_ffint_u_w(vec4);
fvec5 = __msa_ffint_u_w(vec5);
fvec6 = __msa_ffint_u_w(vec6);
fvec7 = __msa_ffint_u_w(vec7);
fvec0 *= mult_vec;
fvec1 *= mult_vec;
fvec2 *= mult_vec;
fvec3 *= mult_vec;
fvec4 *= mult_vec;
fvec5 *= mult_vec;
fvec6 *= mult_vec;
fvec7 *= mult_vec;
vec0 = ((v4u32)fvec0) >> 13;
vec1 = ((v4u32)fvec1) >> 13;
vec2 = ((v4u32)fvec2) >> 13;
vec3 = ((v4u32)fvec3) >> 13;
vec4 = ((v4u32)fvec4) >> 13;
vec5 = ((v4u32)fvec5) >> 13;
vec6 = ((v4u32)fvec6) >> 13;
vec7 = ((v4u32)fvec7) >> 13;
dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
ST_UH2(dst0, dst1, dst, 8);
ST_UH2(dst2, dst3, dst + 16, 8);
src += 32;
dst += 32;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -115,7 +115,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUV444 YUVTORGB
"1: \n" READYUV444 YUVTORGB
"subs %4, %4, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
@ -141,7 +141,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
@ -167,7 +167,7 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %5, %5, #8 \n"
"vld1.8 {d23}, [%3]! \n"
"vst4.8 {d20, d21, d22, d23}, [%4]! \n"
@ -194,7 +194,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vmov.u8 d19, #255 \n" // d19 modified by
// YUVTORGB
@ -221,7 +221,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
@ -253,7 +253,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n" ARGBTORGB565
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
@ -287,7 +287,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" ARGBTOARGB1555
"vst1.8 {q0}, [%3]! \n" // store 8 pixels
@ -325,7 +325,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
YUVTORGB_SETUP
"vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic.
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 YUVTORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n" ARGBTOARGB4444
"vst1.8 {q0}, [%3]! \n" // store 8 pixels
@ -348,7 +348,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUV400 YUVTORGB
"1: \n" READYUV400 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
@ -366,7 +366,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile(
"vmov.u8 d23, #255 \n"
"1: \n"
"1: \n"
"vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n"
"vmov d22, d20 \n"
@ -385,23 +385,22 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void NV21ToARGBRow_NEON(const uint8* src_y,
@ -409,23 +408,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void NV12ToRGB565Row_NEON(const uint8* src_y,
@ -435,7 +433,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n" ARGBTORGB565
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
@ -455,44 +453,42 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15");
}
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
@ -501,7 +497,7 @@ void SplitUVRow_NEON(const uint8* src_uv,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%1]! \n" // store U
@ -522,11 +518,11 @@ void MergeUVRow_NEON(const uint8* src_u,
uint8* dst_uv,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load U
"vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@ -537,10 +533,60 @@ void MergeUVRow_NEON(const uint8* src_u,
);
}
// Reads 16 packed RGB pixels and writes to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile(
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
"vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
"subs %4, %4, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%1]! \n" // store R
"vst1.8 {q1}, [%2]! \n" // store G
"vst1.8 {q2}, [%3]! \n" // store B
"bgt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "d0", "d1", "d2" // Clobber List
);
}
// Reads 16 planar R, G and B values and writes out 16 packed RGB pixels at a time.
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
"vld1.8 {q2}, [%2]! \n" // load B
"subs %4, %4, #16 \n" // 16 processed per loop
"vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
"vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
// Copy multiple of 32. vld1.8 with four registers allows unaligned access and is fastest on A15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
@ -557,7 +603,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile(
"vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
@ -571,7 +617,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile(
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"1: \n"
"subs %1, %1, #4 \n" // 4 pixels per loop
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
@ -588,7 +634,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2 \n"
"sub %0, #16 \n"
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n"
@ -612,7 +658,7 @@ void MirrorUVRow_NEON(const uint8* src_uv,
"add %0, %0, %3, lsl #1 \n"
"sub %0, #16 \n"
"1: \n"
"1: \n"
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n"
@ -634,7 +680,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2, lsl #2 \n"
"sub %0, #16 \n"
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n"
@ -651,7 +697,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile(
"vmov.u8 d4, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
@ -667,7 +713,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile(
"vmov.u8 d4, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
@ -683,7 +729,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
@ -713,7 +759,7 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
@ -759,7 +805,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
int width) {
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
@ -788,7 +834,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
int width) {
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
@ -804,7 +850,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
@ -820,7 +866,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
@ -836,7 +882,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
@ -851,7 +897,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
@ -869,7 +915,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.8 {d1}, [%1]! \n" // store 8 U.
@ -889,7 +935,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"vst1.8 {d0}, [%1]! \n" // store 8 U.
@ -911,7 +957,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2,
int width) {
asm volatile(
"add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
@ -938,7 +984,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy,
int width) {
asm volatile(
"add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
@ -965,7 +1011,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
int width) {
asm volatile(
"vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
@ -986,7 +1032,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
uint8* dst_yuy2,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
"vld1.8 {d1}, [%1]! \n" // load 8 Us
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
@ -1008,7 +1054,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
uint8* dst_uyvy,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
"vld1.8 {d0}, [%1]! \n" // load 8 Us
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
@ -1026,7 +1072,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
@ -1045,13 +1091,14 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
int width) {
asm volatile(
"vdup.32 d2, %2 \n" // dither4
"1: \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d20, d20, d2 \n"
"vqadd.u8 d21, d21, d2 \n"
"vqadd.u8 d22, d22, d2 \n" ARGBTORGB565
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
"vqadd.u8 d22, d22, d2 \n" // add for dither
ARGBTORGB565
"vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
"bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
@ -1064,12 +1111,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
uint8* dst_argb1555,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
"vst1.8 {q0}, [%1]! \n" // store 8 pixels
// ARGB1555.
"vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
@ -1084,12 +1130,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
asm volatile(
"vmov.u8 d4, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
"vst1.8 {q0}, [%1]! \n" // store 8 pixels
// ARGB4444.
"vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
@ -1104,7 +1149,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@ -1123,7 +1168,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
"subs %2, %2, #16 \n" // 16 processed per loop
@ -1142,7 +1187,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@ -1171,7 +1216,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
"vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@ -1199,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"q15");
}
// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
"vmul.s16 q8, " #QB \
", q10 \n" /* B */ \
"vmls.s16 q8, " #QG \
", q11 \n" /* G */ \
"vmls.s16 q8, " #QR \
", q12 \n" /* R */ \
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
"vmls.s16 q8, " #QR ", q12 \n" /* R */ \
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
"vmul.s16 q9, " #QR \
", q10 \n" /* R */ \
"vmls.s16 q9, " #QG \
", q14 \n" /* G */ \
"vmls.s16 q9, " #QB \
", q13 \n" /* B */ \
"vmul.s16 q9, " #QR ", q10 \n" /* R */ \
"vmls.s16 q9, " #QG ", q14 \n" /* G */ \
"vmls.s16 q9, " #QB ", q13 \n" /* B */ \
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
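// [Editor's sketch] The RGBTOUV macro above evaluates the BT.601 chroma
// equations on box-filtered B/G/R values; the "/ 2" vector constants set up
// by the callers compensate for inputs that arrive pre-scaled by the 2x2
// pairwise adds. A scalar rendering of the same arithmetic, for reference
// only (names are illustrative, not part of this change):
static inline uint8 RGBToU_Sketch(int b, int g, int r) {
  int u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;  // +128 bias, then >>8
  return (uint8)(u < 0 ? 0 : (u > 255 ? 255 : u));    // saturate like vqshrn
}
static inline uint8 RGBToV_Sketch(int b, int g, int r) {
  int v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}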
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb,
@ -1232,7 +1273,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
@ -1278,7 +1319,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb,
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
@ -1323,7 +1364,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
@ -1368,7 +1409,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
@ -1413,7 +1454,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
@ -1458,7 +1499,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
@ -1503,7 +1544,7 @@ void RAWToUVRow_NEON(const uint8* src_raw,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
@ -1550,7 +1591,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
@ -1616,7 +1657,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
@ -1682,7 +1723,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
@ -1739,7 +1780,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
@ -1763,7 +1804,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
@ -1787,7 +1828,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
@ -1811,7 +1852,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R
@ -1834,7 +1875,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R
@ -1857,7 +1898,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B
@ -1880,7 +1921,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
@ -1903,7 +1944,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
@ -1938,7 +1979,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"1: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
@ -1953,7 +1994,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"50: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
@ -1963,13 +2004,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"100: \n"
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@ -1988,7 +2029,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"subs %3, #8 \n"
"blt 89f \n"
// Blend 8 pixels.
"8: \n"
"8: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop.
@ -2006,12 +2047,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
"bge 8b \n"
"89: \n"
"89: \n"
"adds %3, #8-1 \n"
"blt 99f \n"
// Blend 1 pixels.
"1: \n"
"1: \n"
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
@ -2043,7 +2084,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile(
// Attenuate 8 pixels.
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a
@ -2075,7 +2116,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255)
@ -2116,7 +2157,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
@ -2148,7 +2189,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@ -2181,7 +2222,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vmov.u8 d28, #24 \n" // BB coefficient
"vmov.u8 d29, #98 \n" // BG coefficient
"vmov.u8 d30, #50 \n" // BR coefficient
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B
@ -2217,7 +2258,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
"1: \n"
"1: \n"
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
@ -2273,10 +2314,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B
"vmull.u8 q1, d2, d3 \n" // multiply G
@ -2288,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@ -2304,16 +2343,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@ -2329,16 +2366,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@ -2359,7 +2394,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
asm volatile(
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
"vld1.8 {d1}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
@ -2383,7 +2418,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
int width) {
asm volatile(
// 16 pixel loop.
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
"vld1.8 {q1}, [%1]! \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
@ -2410,7 +2445,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
asm volatile(
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
"1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
"vld1.8 {d0}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
@ -2435,7 +2470,7 @@ void SobelXRow_NEON(const uint8* src_y0,
uint8* dst_sobelx,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {d0}, [%0],%5 \n" // top
"vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n"
@ -2473,7 +2508,7 @@ void SobelYRow_NEON(const uint8* src_y0,
uint8* dst_sobely,
int width) {
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {d0}, [%0],%4 \n" // left
"vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n"
@ -2505,7 +2540,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
asm volatile(
"vdup.32 q0, %3 \n"
"1: \n"
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's
@ -2530,7 +2565,7 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile(
"vdup.32 q0, %3 \n"
"1: \n"
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's


@ -273,7 +273,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB(
"1: \n" READYUV422 YUVTORGB(
v22, v21,
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
@ -310,7 +310,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v23.8b, #255 \n"
"1: \n" READYUV422 YUVTORGB(
"1: \n" READYUV422 YUVTORGB(
v22, v21,
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
@ -395,7 +395,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile(
"movi v23.8b, #255 \n"
"1: \n"
"1: \n"
"ld1 {v20.8b}, [%0], #8 \n"
"orr v21.8b, v20.8b, v20.8b \n"
"orr v22.8b, v20.8b, v20.8b \n"
@ -470,7 +470,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB(
"1: \n" READNV12 YUVTORGB(
v22, v21,
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
@ -544,7 +544,7 @@ void SplitUVRow_NEON(const uint8* src_uv,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store U
@ -565,7 +565,7 @@ void MergeUVRow_NEON(const uint8* src_u,
uint8* dst_uv,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load U
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
@ -580,19 +580,67 @@ void MergeUVRow_NEON(const uint8* src_u,
);
}
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile(
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile(
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Copy multiple of 32.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile(
"1: \n"
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 processed per loop
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
"stp q0, q1, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
@ -600,7 +648,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile(
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
@ -613,7 +661,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile(
"dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
@ -628,7 +676,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// Start at end of source row.
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #16 \n"
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n"
@ -650,7 +698,7 @@ void MirrorUVRow_NEON(const uint8* src_uv,
// Start at end of source row.
"add %0, %0, %w3, sxtw #1 \n"
"sub %0, %0, #16 \n"
"1: \n"
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %w3, %w3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n"
@ -671,7 +719,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// Start at end of source row.
"add %0, %0, %w2, sxtw #2 \n"
"sub %0, %0, #16 \n"
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
"rev64 v0.4s, v0.4s \n"
@ -688,11 +736,10 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
asm volatile(
"movi v4.8b, #255 \n" // Alpha
"1: \n"
"1: \n"
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
@ -705,7 +752,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
asm volatile(
"movi v5.8b, #255 \n" // Alpha
"1: \n"
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
@ -722,7 +769,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
@ -753,12 +800,11 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile(
"movi v3.8b, #255 \n" // Alpha
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
@ -810,7 +856,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
int width) {
asm volatile(
"movi v3.8b, #255 \n" // Alpha
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
@ -841,7 +887,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
uint8* dst_argb,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
@ -858,9 +904,8 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24.
@ -875,7 +920,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g
@ -892,7 +937,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
@ -907,7 +952,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
@ -925,9 +970,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
@ -946,9 +990,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy,
uint8* dst_v,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
// pixels
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
@ -969,7 +1012,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2,
int width) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
@ -996,7 +1039,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy,
int width) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
@ -1023,7 +1066,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb,
int width) {
asm volatile(
"ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
@ -1043,7 +1086,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
uint8* dst_yuy2,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"orr v2.8b, v1.8b, v1.8b \n"
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
@ -1066,7 +1109,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
uint8* dst_uyvy,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
"orr v3.8b, v2.8b, v2.8b \n"
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
@ -1085,7 +1128,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
@ -1104,7 +1147,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
int width) {
asm volatile(
"dup v1.4s, %w2 \n" // dither4
"1: \n"
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v20.8b, v20.8b, v1.8b \n"
@ -1123,7 +1166,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb,
uint8* dst_argb1555,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
@ -1143,7 +1186,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb,
asm volatile(
"movi v4.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
@ -1163,9 +1206,8 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
@ -1183,7 +1225,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
// pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
@ -1202,9 +1244,8 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
@ -1232,7 +1273,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"movi v27.8b, #18 \n" // VB -0.1406 coefficient
"movi v28.8b, #94 \n" // VG -0.7344 coefficient
"movi v29.16b,#0x80 \n" // 128.5
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
@ -1270,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb,
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// clang-format off
#define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB \
",v20.8h \n" /* B */ \
"mul v4.8h, " #QR \
",v20.8h \n" /* R */ \
"mls v3.8h, " #QG \
",v21.8h \n" /* G */ \
"mls v4.8h, " #QG \
",v24.8h \n" /* G */ \
"mls v3.8h, " #QR \
",v22.8h \n" /* R */ \
"mls v4.8h, " #QB \
",v23.8h \n" /* B */ \
"mul v3.8h, " #QB ",v20.8h \n" /* B */ \
"mul v4.8h, " #QR ",v20.8h \n" /* R */ \
"mls v3.8h, " #QG ",v21.8h \n" /* G */ \
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
@ -1578,9 +1615,8 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565,
"movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
"movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
"movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
"movi v27.16b, #0x80 \n" // 128.5 (0x8080 in
// 16-bit)
"1: \n"
"movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
@ -1645,7 +1681,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
@ -1710,7 +1746,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
@ -1774,7 +1810,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
@ -1799,7 +1835,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
@ -1823,7 +1859,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
@ -1847,7 +1883,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R
@ -1870,7 +1906,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R
@ -1893,7 +1929,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B
@ -1916,7 +1952,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
@ -1939,7 +1975,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
@ -1974,7 +2010,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"dup v5.16b, %w4 \n"
"dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
@ -1989,7 +2025,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"50: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
@ -1999,13 +2035,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"100: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n"
"99: \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_ptr1), // %2
@ -2025,7 +2061,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"subs %w3, %w3, #8 \n"
"b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
"8: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
// pixels
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
@ -2048,12 +2084,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
// pixels
"b.ge 8b \n"
"89: \n"
"89: \n"
"adds %w3, %w3, #8-1 \n"
"b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
"1: \n"
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %w3, %w3, #1 \n" // 1 processed per loop.
@ -2073,7 +2109,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
"st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
"b.ge 1b \n"
"99: \n"
"99: \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@ -2088,9 +2124,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0,
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile(
// Attenuate 8 pixels.
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a
@ -2122,9 +2157,8 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of
// ARGB.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n"
@ -2142,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb,
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@ -2165,9 +2198,8 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
"1: \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n"
@ -2182,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb,
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2199,9 +2230,8 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"movi v24.8b, #15 \n" // B * 0.11400 coefficient
"movi v25.8b, #75 \n" // G * 0.58700 coefficient
"movi v26.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G
@ -2234,7 +2264,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"movi v28.8b, #24 \n" // BB coefficient
"movi v29.8b, #98 \n" // BG coefficient
"movi v30.8b, #50 \n" // BR coefficient
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
@ -2270,9 +2300,8 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
// pixels.
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g
@ -2310,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb,
"sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8
// pixels.
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2329,11 +2357,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
@ -2344,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@ -2362,20 +2386,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@ -2391,20 +2411,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
// pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
// pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
@ -2425,7 +2441,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
asm volatile(
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
"1: \n"
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
@ -2433,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@ -2450,7 +2465,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
int width) {
asm volatile(
// 16 pixel loop.
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %w3, %w3, #16 \n" // 16 processed per loop.
@ -2477,13 +2492,12 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
asm volatile(
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
"1: \n"
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
// pixels
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@ -2503,7 +2517,7 @@ void SobelXRow_NEON(const uint8* src_y0,
uint8* dst_sobelx,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.8b}, [%0],%5 \n" // top
"ld1 {v1.8b}, [%0],%6 \n"
"usubl v0.8h, v0.8b, v1.8b \n"
@ -2541,7 +2555,7 @@ void SobelYRow_NEON(const uint8* src_y0,
uint8* dst_sobely,
int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.8b}, [%0],%4 \n" // left
"ld1 {v1.8b}, [%1],%4 \n"
"usubl v0.8h, v0.8b, v1.8b \n"
@ -2572,7 +2586,7 @@ void SobelYRow_NEON(const uint8* src_y0,
// Caveat - rounds float to half float whereas scaling version truncates.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
@ -2592,7 +2606,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
@ -2612,6 +2626,158 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
: "cc", "memory", "v1", "v2", "v3");
}
float ScaleMaxSamples_NEON(const float* src,
float* dst,
float scale,
int width) {
float fmax;
asm volatile(
"movi v5.4s, #0 \n" // max
"movi v6.4s, #0 \n"
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v3.4s, v1.4s, %4.s[0] \n" // scale
"fmul v4.4s, v2.4s, %4.s[0] \n" // scale
"fmax v5.4s, v5.4s, v1.4s \n" // max
"fmax v6.4s, v6.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
"=w"(fmax) // %3
: "w"(scale) // %4
: "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
return fmax;
}
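// [Editor's sketch] Scalar reference for the routine above, illustration
// only: dst receives src scaled by 'scale', while the return value is the
// maximum of the *unscaled* samples (the NEON loop applies fmax to v1/v2
// before the fmul results are stored).
static float ScaleMaxSamples_Sketch(const float* src,
                                    float* dst,
                                    float scale,
                                    int width) {
  int i;
  float fmax = 0.f;  // matches the zero-initialised v5/v6 accumulators
  for (i = 0; i < width; ++i) {
    float v = src[i];
    dst[i] = v * scale;
    if (v > fmax) fmax = v;
  }
  return fmax;
}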
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width) {
float fsum;
asm volatile(
"movi v5.4s, #0 \n" // max
"movi v6.4s, #0 \n" // max
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v3.4s, v1.4s, %4.s[0] \n" // scale
"fmul v4.4s, v2.4s, %4.s[0] \n"
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
"faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
"=w"(fsum) // %3
: "w"(scale) // %4
: "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
return fsum;
}
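// [Editor's sketch] Despite the name, the routine above accumulates the sum
// of squares of the unscaled input (fmla v, v1, v1), not a plain sum, while
// writing the scaled samples to dst. Scalar reference, illustration only:
static float ScaleSumSamples_Sketch(const float* src,
                                    float* dst,
                                    float scale,
                                    int width) {
  int i;
  float fsum = 0.f;
  for (i = 0; i < width; ++i) {
    float v = src[i];
    dst[i] = v * scale;
    fsum += v * v;  // sum of squares, as in the fmla accumulators
  }
  return fsum;
}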
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale) // %3
: "cc", "memory", "v1", "v2");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_NEON(const uint16* src0,
const uint16* src1,
const uint16* src2,
const uint16* src3,
const uint16* src4,
uint32* dst,
int width) {
asm volatile(
"movi v6.8h, #4 \n" // constant 4
"movi v7.8h, #6 \n" // constant 6
"1: \n"
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
"ld1 {v2.8h}, [%4], #16 \n"
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
"ld1 {v2.8h}, [%1], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"ld1 {v2.8h}, [%2], #16 \n"
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
"ld1 {v2.8h}, [%3], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(src4), // %4
"+r"(dst), // %5
"+r"(width) // %6
:
: "cc", "memory", "v0", "v1", "v2", "v6", "v7");
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
const uint32* src1 = src + 1;
const uint32* src2 = src + 2;
const uint32* src3 = src + 3;
asm volatile(
"movi v6.4s, #4 \n" // constant 4
"movi v7.4s, #6 \n" // constant 6
"1: \n"
"ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
"add v0.4s, v0.4s, v1.4s \n" // * 1
"add v1.4s, v1.4s, v2.4s \n" // * 1
"ld1 {v2.4s,v3.4s}, [%2], #32 \n"
"mla v0.4s, v2.4s, v7.4s \n" // * 6
"mla v1.4s, v3.4s, v7.4s \n" // * 6
"ld1 {v2.4s,v3.4s}, [%1], #32 \n"
"ld1 {v4.4s,v5.4s}, [%3], #32 \n"
"add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
"add v3.4s, v3.4s, v5.4s \n"
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"subs %w5, %w5, #8 \n" // 8 processed per loop
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
"uqrshrn2 v0.8h, v1.4s, #8 \n"
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
: "r"(32LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus


@ -1410,9 +1410,9 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1426,7 +1426,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -1482,9 +1482,9 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
psraw xmm1, 8
packsswb xmm0, xmm1
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -1549,9 +1549,9 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, its 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
@ -1565,7 +1565,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
@ -1617,9 +1617,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, its 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
@ -1634,7 +1634,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
vpermq ymm0, ymm0, 0xd8 // For vpacksswb
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
// step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
@ -1750,9 +1750,9 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1766,7 +1766,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -1822,9 +1822,9 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1838,7 +1838,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -1894,9 +1894,9 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1910,7 +1910,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -2927,7 +2927,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
psrlw xmm0, 6
packuswb xmm0, xmm0 // G
// Step 2: Weave into ARGB
punpcklbw xmm0, xmm0 // GG
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
@ -2975,8 +2975,8 @@ __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
vpsrlw ymm0, ymm0, 6
vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
// TODO(fbarchard): Weave alpha with unpack.
// Step 2: Weave into ARGB
vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
vpermq ymm1, ymm1, 0xd8
vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
@ -4067,7 +4067,7 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
sub edx, esi
sub edi, esi
// 8 pixel loop.
convertloop8:
movq xmm0, qword ptr [esi] // alpha
punpcklbw xmm0, xmm0
@ -4123,7 +4123,7 @@ __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
sub edx, esi
sub edi, esi
// 32 pixel loop.
convertloop32:
vmovdqu ymm0, [esi] // alpha
vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
@ -4183,7 +4183,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
sub ecx, 4
jl convertloop4b // less than 4 pixels?
// 4 pixel loop.
convertloop4:
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
@ -4212,7 +4212,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
add ecx, 4 - 1
jl convertloop1b
// 1 pixel loop.
convertloop1:
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
@ -5256,7 +5256,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
cvtps2dq xmm5, xmm5 // 0.16 fixed point
packssdw xmm5, xmm5 // 16 bit shorts
// 4 pixel loop small blocks.
s4:
// top left
movdqu xmm0, [eax]
@ -5298,7 +5298,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
jmp l4b
// 4 pixel loop
l4:
// top left
movdqu xmm0, [eax]
@ -5350,7 +5350,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@ -5392,7 +5392,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
test edx, 15
jne l4b
// 4 pixel loop
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@ -5438,7 +5438,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
lea eax, [eax + 4]
@ -5481,7 +5481,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
sub ecx, 4
jl l4b
// setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1
@ -5493,7 +5493,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
@ -5524,7 +5524,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
@ -5598,7 +5598,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
jg xloop
jmp xloop99
// Blend 50 / 50.
xloop50:
vmovdqu ymm0, [esi]
vpavgb ymm0, ymm0, [esi + edx]
@ -5608,7 +5608,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
jg xloop50
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
xloop100:
rep movsb
@ -5638,7 +5638,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
// Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 /256. Blend 100 / 0.
cmp eax, 128
@ -5678,7 +5678,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
jg xloop
jmp xloop99
// Blend 50 / 50.
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
@ -5689,7 +5689,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
jg xloop50
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
xloop100:
movdqu xmm0, [esi]
movdqu [esi + edi], xmm0
@ -5784,7 +5784,7 @@ __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
cmp ebx, 0x02010003
je shuf_2103
// TODO(fbarchard): Use one source pointer and 3 offsets.
shuf_any1:
movzx ebx, byte ptr [esi]
movzx ebx, byte ptr [eax + ebx]
@ -5971,7 +5971,7 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
@ -6072,7 +6072,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
pxor xmm5, xmm5
sub edx, eax
// 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
add eax, 16
@ -6110,7 +6110,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
vpxor ymm5, ymm5, ymm5
sub edx, eax
// 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
add eax, 32
@ -6144,7 +6144,7 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
mov ecx, [esp + 16] /* width */
sub edx, eax
// 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
@ -6252,7 +6252,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
psllw xmm4, 8
pxor xmm5, xmm5
// 4 pixel loop.
convertloop:
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3

View File

@ -371,6 +371,26 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_Any_MSA;
ScaleRowDown34_1 = ScaleRowDown34_Any_MSA;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA;
}
if (dst_width % 48 == 0) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_MSA;
ScaleRowDown34_1 = ScaleRowDown34_MSA;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA;
}
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) {
@ -802,11 +822,12 @@ static void ScaleAddCols2_16_C(int dst_width,
static void ScaleAddCols0_C(int dst_width,
int boxheight,
int x,
int,
int dx,
const uint16* src_ptr,
uint8* dst_ptr) {
int scaleval = 65536 / boxheight;
int i;
(void)dx;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = src_ptr[i] * scaleval >> 16;
@ -1078,6 +1099,14 @@ void ScalePlaneBilinearDown(int src_width,
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEFILTERCOLS_MSA)
if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_MSA;
if (IS_ALIGNED(dst_width, 16)) {
ScaleFilterCols = ScaleFilterCols_MSA;
}
}
#endif
if (y > max_y) {
y = max_y;
@ -1276,6 +1305,14 @@ void ScalePlaneBilinearUp(int src_width,
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEFILTERCOLS_MSA)
if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_MSA;
if (IS_ALIGNED(dst_width, 16)) {
ScaleFilterCols = ScaleFilterCols_MSA;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
@ -1663,7 +1700,7 @@ void ScalePlane_16(const uint16* src,
CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
return;
}
if (dst_width == src_width) {
if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height);
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
@ -1692,7 +1729,7 @@ void ScalePlane_16(const uint16* src,
return;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
filtering != kFilterBilinear) {
(filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);

View File

@ -33,9 +33,15 @@ extern "C" {
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEFILTERCOLS_MSA
CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_MSA
CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON,
ScaleARGBFilterCols_NEON,
@ -43,6 +49,13 @@ CANY(ScaleARGBFilterCols_Any_NEON,
4,
3)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_MSA
CANY(ScaleARGBFilterCols_Any_MSA,
ScaleARGBFilterCols_MSA,
ScaleARGBFilterCols_C,
4,
7)
#endif
#undef CANY
// Fixed scale down.
@ -228,6 +241,26 @@ SDANY(ScaleRowDown34_1_Box_Any_NEON,
1,
23)
#endif
#ifdef HAS_SCALEROWDOWN34_MSA
SDANY(ScaleRowDown34_Any_MSA,
ScaleRowDown34_MSA,
ScaleRowDown34_C,
4 / 3,
1,
47)
SDANY(ScaleRowDown34_0_Box_Any_MSA,
ScaleRowDown34_0_Box_MSA,
ScaleRowDown34_0_Box_C,
4 / 3,
1,
47)
SDANY(ScaleRowDown34_1_Box_Any_MSA,
ScaleRowDown34_1_Box_MSA,
ScaleRowDown34_1_Box_C,
4 / 3,
1,
47)
#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3,
ScaleRowDown38_SSSE3,

View File

@ -335,6 +335,14 @@ static void ScaleARGBBilinearDown(int src_width,
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
}
}
#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
@ -442,6 +450,14 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
if (filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@ -454,6 +470,14 @@ static void ScaleARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBCols_MSA;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@ -643,6 +667,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
if (filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@ -655,6 +687,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (!filtering && TestCpuFlag(kCpuHasMSA)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBCols_MSA;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@ -778,6 +818,14 @@ static void ScaleARGBSimple(int src_width,
ScaleARGBCols = ScaleARGBCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBCols = ScaleARGBCols_Any_MSA;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBCols = ScaleARGBCols_MSA;
}
}
#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;

View File

@ -1306,6 +1306,35 @@ void ScaleSlope(int src_width,
}
#undef CENTERSTART
// Read 8x2 upsample with filtering and write 16x1.
// actually reads an extra pixel, so 9x2.
void ScaleRowUp2_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
const uint16* src2 = src_ptr + src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
uint16 p0 = src_ptr[0];
uint16 p1 = src_ptr[1];
uint16 p2 = src2[0];
uint16 p3 = src2[1];
dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
++src_ptr;
++src2;
dst += 2;
}
if (dst_width & 1) {
uint16 p0 = src_ptr[0];
uint16 p1 = src_ptr[1];
uint16 p2 = src2[0];
uint16 p3 = src2[1];
dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
}
}
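The 9/3/3/1 weights above are plain bilinear weights: each output pixel sits a quarter pixel from the nearest source pixel in both directions, so the four contributions are (3/4)(3/4), (3/4)(1/4), (1/4)(3/4) and (1/4)(1/4), i.e. 9, 3, 3 and 1 sixteenths, and the +8 rounds before the >> 4. A small illustrative check (not part of the library):
--------------------------------------------------------------------------------------
#include <assert.h>
// Illustrative check of the upsample weights: they sum to 16, so with a
// flat input the filter is an exact no-op after the rounding shift.
static void ScaleRowUp2_16_weight_check(void) {
  uint16 p = 1234;
  assert(9 + 3 + 3 + 1 == 16);
  assert(((p * 9 + p * 3 + p * 3 + p + 8) >> 4) == p);
}
--------------------------------------------------------------------------------------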
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -21,6 +21,14 @@ namespace libyuv {
extern "C" {
#endif
#define LOAD_INDEXED_DATA(srcp, indx0, out0) \
{ \
out0[0] = srcp[indx0[0]]; \
out0[1] = srcp[indx0[1]]; \
out0[2] = srcp[indx0[2]]; \
out0[3] = srcp[indx0[3]]; \
}
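LOAD_INDEXED_DATA is a 4-lane gather: it reads srcp at the four indices held in indx0 and drops the results into the lanes of out0. In plain C, for 32-bit elements, the equivalent is roughly (illustrative names):
--------------------------------------------------------------------------------------
// Plain-C picture of the 4-lane gather; the macro itself works for any
// element type srcp points at (bytes in ScaleFilterCols_MSA, whole ARGB
// words in ScaleARGBCols_MSA).
static inline void load_indexed_data_c(const uint32_t* srcp,
                                       const int32_t indx0[4],
                                       uint32_t out0[4]) {
  out0[0] = srcp[indx0[0]];
  out0[1] = srcp[indx0[1]];
  out0[2] = srcp[indx0[2]];
  out0[3] = srcp[indx0[3]];
}
--------------------------------------------------------------------------------------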
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@ -545,6 +553,394 @@ void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
}
}
void ScaleFilterCols_MSA(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int j;
v4i32 vec_x = __msa_fill_w(x);
v4i32 vec_dx = __msa_fill_w(dx);
v4i32 vec_const = {0, 1, 2, 3};
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v8u16 reg0, reg1;
v16u8 dst0;
v4i32 const_0xFFFF = __msa_fill_w(0xFFFF);
v4i32 const_0x40 = __msa_fill_w(0x40);
vec0 = vec_dx * vec_const;
vec1 = vec_dx * 4;
vec_x += vec0;
for (j = 0; j < dst_width - 1; j += 16) {
vec2 = vec_x >> 16;
vec6 = vec_x & const_0xFFFF;
vec_x += vec1;
vec3 = vec_x >> 16;
vec7 = vec_x & const_0xFFFF;
vec_x += vec1;
vec4 = vec_x >> 16;
vec8 = vec_x & const_0xFFFF;
vec_x += vec1;
vec5 = vec_x >> 16;
vec9 = vec_x & const_0xFFFF;
vec_x += vec1;
vec6 >>= 9;
vec7 >>= 9;
vec8 >>= 9;
vec9 >>= 9;
LOAD_INDEXED_DATA(src_ptr, vec2, tmp0);
LOAD_INDEXED_DATA(src_ptr, vec3, tmp1);
LOAD_INDEXED_DATA(src_ptr, vec4, tmp2);
LOAD_INDEXED_DATA(src_ptr, vec5, tmp3);
vec2 += 1;
vec3 += 1;
vec4 += 1;
vec5 += 1;
LOAD_INDEXED_DATA(src_ptr, vec2, tmp4);
LOAD_INDEXED_DATA(src_ptr, vec3, tmp5);
LOAD_INDEXED_DATA(src_ptr, vec4, tmp6);
LOAD_INDEXED_DATA(src_ptr, vec5, tmp7);
tmp4 -= tmp0;
tmp5 -= tmp1;
tmp6 -= tmp2;
tmp7 -= tmp3;
tmp4 *= vec6;
tmp5 *= vec7;
tmp6 *= vec8;
tmp7 *= vec9;
tmp4 += const_0x40;
tmp5 += const_0x40;
tmp6 += const_0x40;
tmp7 += const_0x40;
tmp4 >>= 7;
tmp5 >>= 7;
tmp6 >>= 7;
tmp7 >>= 7;
tmp0 += tmp4;
tmp1 += tmp5;
tmp2 += tmp6;
tmp3 += tmp7;
reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
__msa_st_b(dst0, dst_ptr, 0);
dst_ptr += 16;
}
}
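Per output pixel, the vector code above implements a 16.16 fixed-point bilinear column filter: the integer part of x picks a source pixel pair and the top 7 bits of the fraction blend them, 16 outputs per iteration (odd widths go through the _Any_ wrapper). A scalar sketch of the same per-pixel arithmetic (hypothetical name):
--------------------------------------------------------------------------------------
// Scalar sketch of the blend done per output pixel above:
//   xi = x >> 16            integer source position
//   f  = (x >> 9) & 0x7f    top 7 bits of the 16-bit fraction
//   dst = a + (((b - a) * f + 64) >> 7)   with a = src[xi], b = src[xi + 1]
static void ScaleFilterCols_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)(a + (((b - a) * f + 64) >> 7));
    x += dx;
  }
}
--------------------------------------------------------------------------------------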
void ScaleARGBCols_MSA(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
v4i32 x_vec = __msa_fill_w(x);
v4i32 dx_vec = __msa_fill_w(dx);
v4i32 const_vec = {0, 1, 2, 3};
v4i32 vec0, vec1, vec2;
v4i32 dst0;
vec0 = dx_vec * const_vec;
vec1 = dx_vec * 4;
x_vec += vec0;
for (j = 0; j < dst_width; j += 4) {
vec2 = x_vec >> 16;
x_vec += vec1;
LOAD_INDEXED_DATA(src, vec2, dst0);
__msa_st_w(dst0, dst, 0);
dst += 4;
}
}
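ScaleARGBCols_MSA is a point-sampling column scale: each output is a whole ARGB pixel copied from src[x >> 16]. Scalar sketch (hypothetical name):
--------------------------------------------------------------------------------------
// Scalar sketch of the nearest-neighbour ARGB column scale: one 32-bit
// copy per output pixel, stepping x by dx in 16.16 fixed point.
static void ScaleARGBCols_C_sketch(uint8* dst_argb, const uint8* src_argb,
                                   int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}
--------------------------------------------------------------------------------------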
void ScaleARGBFilterCols_MSA(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint32* src = (const uint32*)(src_argb);
int j;
v4u32 src0, src1, src2, src3;
v4u32 vec0, vec1, vec2, vec3;
v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 mult0, mult1, mult2, mult3;
v8u16 tmp0, tmp1, tmp2, tmp3;
v16u8 dst0, dst1;
v4u32 vec_x = (v4u32)__msa_fill_w(x);
v4u32 vec_dx = (v4u32)__msa_fill_w(dx);
v4u32 vec_const = {0, 1, 2, 3};
v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f);
vec0 = vec_dx * vec_const;
vec1 = vec_dx * 4;
vec_x += vec0;
for (j = 0; j < dst_width - 1; j += 8) {
vec2 = vec_x >> 16;
reg0 = (v16u8)(vec_x >> 9);
vec_x += vec1;
vec3 = vec_x >> 16;
reg1 = (v16u8)(vec_x >> 9);
vec_x += vec1;
reg0 = reg0 & const_0x7f;
reg1 = reg1 & const_0x7f;
reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0);
reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0);
reg2 = reg0 ^ const_0x7f;
reg3 = reg1 ^ const_0x7f;
mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2);
mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2);
mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3);
mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3);
LOAD_INDEXED_DATA(src, vec2, src0);
LOAD_INDEXED_DATA(src, vec3, src1);
vec2 += 1;
vec3 += 1;
LOAD_INDEXED_DATA(src, vec2, src2);
LOAD_INDEXED_DATA(src, vec3, src3);
reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
tmp0 = __msa_dotp_u_h(reg4, mult0);
tmp1 = __msa_dotp_u_h(reg5, mult1);
tmp2 = __msa_dotp_u_h(reg6, mult2);
tmp3 = __msa_dotp_u_h(reg7, mult3);
tmp0 >>= 7;
tmp1 >>= 7;
tmp2 >>= 7;
tmp3 >>= 7;
dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
__msa_st_b(dst0, dst_argb, 0);
__msa_st_b(dst1, dst_argb, 16);
dst_argb += 32;
}
}
void ScaleRowDown34_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
(void)src_stride;
v16u8 src0, src1, src2, src3;
v16u8 vec0, vec1, vec2;
v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20};
v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25};
v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20,
21, 23, 24, 25, 27, 28, 29, 31};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1);
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2);
__msa_st_b((v16i8)vec0, dst, 0);
__msa_st_b((v16i8)vec1, dst, 16);
__msa_st_b((v16i8)vec2, dst, 32);
src_ptr += 64;
dst += 48;
}
}
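The unfiltered 3/4 row scale keeps source pixels 0, 1 and 3 of every group of four and drops pixel 2 (that is what the shuffle masks encode). Scalar sketch (hypothetical name):
--------------------------------------------------------------------------------------
// Scalar sketch: 4 source pixels -> 3 output pixels, no filtering.
static void ScaleRowDown34_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst, int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  }
}
--------------------------------------------------------------------------------------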
void ScaleRowDown34_0_Box_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
16, 17, 17, 18, 18, 19, 20, 21};
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
reg0 = __msa_srar_h(reg0, shft0);
reg1 = __msa_srar_h(reg1, shft1);
reg2 = __msa_srar_h(reg2, shft2);
reg3 = __msa_srar_h(reg3, shft0);
reg4 = __msa_srar_h(reg4, shft1);
reg5 = __msa_srar_h(reg5, shft2);
reg6 = __msa_srar_h(reg6, shft0);
reg7 = __msa_srar_h(reg7, shft1);
reg8 = __msa_srar_h(reg8, shft2);
reg9 = __msa_srar_h(reg9, shft0);
reg10 = __msa_srar_h(reg10, shft1);
reg11 = __msa_srar_h(reg11, shft2);
reg0 = reg0 * 3 + reg6;
reg1 = reg1 * 3 + reg7;
reg2 = reg2 * 3 + reg8;
reg3 = reg3 * 3 + reg9;
reg4 = reg4 * 3 + reg10;
reg5 = reg5 * 3 + reg11;
reg0 = __msa_srari_h(reg0, 2);
reg1 = __msa_srari_h(reg1, 2);
reg2 = __msa_srari_h(reg2, 2);
reg3 = __msa_srari_h(reg3, 2);
reg4 = __msa_srari_h(reg4, 2);
reg5 = __msa_srari_h(reg5, 2);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
__msa_st_b((v16i8)dst0, d, 0);
__msa_st_b((v16i8)dst1, d, 16);
__msa_st_b((v16i8)dst2, d, 32);
s += 64;
t += 64;
d += 48;
}
}
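The _0_Box variant filters two source rows down to one output row at the same 3/4 horizontal scale: per group of four source columns it takes three horizontal averages per row with weights 3:1, 1:1 and 1:3 (encoded by const0/const1/const2 and the per-lane rounding shifts), then blends the upper and lower rows 3:1. A scalar sketch of one output triple with the same rounding; the _1_Box variant below is identical except the two rows are blended 1:1:
--------------------------------------------------------------------------------------
// Scalar sketch of one group of 3 output pixels for the "0" box filter.
// s = upper source row, t = lower source row, d = destination.
static void ScaleRowDown34_0_Box_group_sketch(const uint8* s, const uint8* t,
                                              uint8* d) {
  uint8 a0 = (uint8)((s[0] * 3 + s[1] + 2) >> 2);
  uint8 a1 = (uint8)((s[1] + s[2] + 1) >> 1);
  uint8 a2 = (uint8)((s[2] + s[3] * 3 + 2) >> 2);
  uint8 b0 = (uint8)((t[0] * 3 + t[1] + 2) >> 2);
  uint8 b1 = (uint8)((t[1] + t[2] + 1) >> 1);
  uint8 b2 = (uint8)((t[2] + t[3] * 3 + 2) >> 2);
  d[0] = (uint8)((a0 * 3 + b0 + 2) >> 2);  // rows blended 3:1
  d[1] = (uint8)((a1 * 3 + b1 + 2) >> 2);
  d[2] = (uint8)((a2 * 3 + b2 + 2) >> 2);
}
--------------------------------------------------------------------------------------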
void ScaleRowDown34_1_Box_MSA(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
16, 17, 17, 18, 18, 19, 20, 21};
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 48) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
reg0 = __msa_srar_h(reg0, shft0);
reg1 = __msa_srar_h(reg1, shft1);
reg2 = __msa_srar_h(reg2, shft2);
reg3 = __msa_srar_h(reg3, shft0);
reg4 = __msa_srar_h(reg4, shft1);
reg5 = __msa_srar_h(reg5, shft2);
reg6 = __msa_srar_h(reg6, shft0);
reg7 = __msa_srar_h(reg7, shft1);
reg8 = __msa_srar_h(reg8, shft2);
reg9 = __msa_srar_h(reg9, shft0);
reg10 = __msa_srar_h(reg10, shft1);
reg11 = __msa_srar_h(reg11, shft2);
reg0 += reg6;
reg1 += reg7;
reg2 += reg8;
reg3 += reg9;
reg4 += reg10;
reg5 += reg11;
reg0 = __msa_srari_h(reg0, 1);
reg1 = __msa_srari_h(reg1, 1);
reg2 = __msa_srari_h(reg2, 1);
reg3 = __msa_srari_h(reg3, 1);
reg4 = __msa_srari_h(reg4, 1);
reg5 = __msa_srari_h(reg5, 1);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
__msa_st_b((v16i8)dst0, d, 0);
__msa_st_b((v16i8)dst1, d, 16);
__msa_st_b((v16i8)dst2, d, 32);
s += 64;
t += 64;
d += 48;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
// load even pixels into q0, odd into q1
"vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
@ -50,15 +50,10 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post
// inc
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // add adjacent
"vpaddl.u8 q1, q1 \n"
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and
// pack
"vrshrn.u16 d1, q1, #1 \n"
"vrhadd.u8 q0, q0, q1 \n" // rounding half add
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -77,7 +72,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
asm volatile(
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
@ -106,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.8 {d2}, [%1]! \n"
@ -126,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [%3]! \n"
"vld1.8 {q2}, [%4]! \n"
@ -160,12 +155,12 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -180,7 +175,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
asm volatile(
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
@ -237,7 +232,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile(
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
@ -285,7 +280,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
(void)src_stride;
asm volatile(
"vld1.8 {q3}, [%3] \n"
"1: \n"
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
@ -312,7 +307,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vld1.8 {q14}, [%6] \n"
"vld1.8 {q15}, [%7] \n"
"add %3, %0 \n"
"1: \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
@ -421,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
@ -513,12 +508,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
int src_height) {
const uint8* src_tmp;
asm volatile(
"1: \n"
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
"veor q2, q2, q2 \n"
"veor q3, q3, q3 \n"
"2: \n"
"2: \n"
// load 16 pixels into q0
"vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n"
@ -540,15 +535,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
);
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// clang-format on
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
"vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
@ -639,7 +632,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"1: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
@ -654,7 +647,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 25 / 75.
"25: \n"
"25: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
@ -665,7 +658,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"50: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
@ -675,7 +668,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 75 / 25.
"75: \n"
"75: \n"
"vld1.8 {q1}, [%1]! \n"
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
@ -686,13 +679,13 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"100: \n"
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"99: \n"
"vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@ -709,13 +702,12 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
// load even pixels into q0, odd into q1
"vld2.32 {q0, q1}, [%0]! \n"
"vld2.32 {q2, q3}, [%0]! \n"
"1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"vst1.8 {q3}, [%1]! \n"
"vmov q2, q1 \n" // load next 8 ARGB
"vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@ -725,27 +717,26 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
);
}
// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
// 4a: 3e04 subs r6, #4
// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
// 50: ef64 21f4 vorr q9, q10, q10
// 54: f942 038d vst2.32 {d16-d19}, [r2]!
// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
// pixels.
"1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
"subs %2, %2, #8 \n" // 8 processed per loop
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and
// pack
"vrshrn.u16 d1, q1, #1 \n"
"vrshrn.u16 d2, q2, #1 \n"
"vrshrn.u16 d3, q3, #1 \n"
"vst4.8 {d0, d1, d2, d3}, [%1]! \n"
"vrhadd.u8 q0, q0, q1 \n" // rounding half add
"vrhadd.u8 q1, q2, q3 \n" // rounding half add
"vst2.32 {q0, q1}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -762,25 +753,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
// pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
// pixels.
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
// pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
"vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
"vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
"vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n"
@ -804,7 +791,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
(void)src_stride;
asm volatile(
"mov r12, %3, lsl #2 \n"
"1: \n"
"1: \n"
"vld1.32 {d0[0]}, [%0], r12 \n"
"vld1.32 {d0[1]}, [%0], r12 \n"
"vld1.32 {d1[0]}, [%0], r12 \n"
@ -829,9 +816,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile(
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
"1: \n"
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks ->
// 2x1
"1: \n"
"vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
"vld1.8 {d1}, [%1], r12 \n"
"vld1.8 {d2}, [%0], r12 \n"
"vld1.8 {d3}, [%1], r12 \n"
@ -860,15 +846,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
: "memory", "cc", "r12", "q0", "q1", "q2", "q3");
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"vld1.32 {" #dn "[" #n "]}, [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb,
@ -878,15 +862,20 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
int tmp;
const uint8* src_tmp = src_argb;
asm volatile(
"1: \n" LOAD1_DATA32_LANE(
d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE(
d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1)
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per
// loop
"bgt 1b \n"
"1: \n"
// clang-format off
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@ -900,15 +889,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
#undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb,

File diff suppressed because it is too large

View File

@ -17,7 +17,7 @@ extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
@ -816,7 +816,7 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5
// sum rows
xloop:
movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
@ -847,7 +847,7 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5
// sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
@ -939,7 +939,7 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions.
@ -1194,7 +1194,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
sub ecx, 4
jl xloop49
// 4 Pixel loop.
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@ -1218,7 +1218,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
test ecx, 2
je xloop29
// 2 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
pextrw eax, xmm2, 5 // get x2 integer.
@ -1231,7 +1231,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
test ecx, 1
je xloop99
// 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
xloop99:
@ -1309,7 +1309,7 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
psrlw xmm2, 9 // 7 bit fractions.
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
pshufb xmm2, xmm5 // 00000000