Re #2004: Update libyuv to the version from the libyuv git master repo dated 27 July 2017; the compile-error issue on old gcc versions still persists, though.

git-svn-id: https://svn.pjsip.org/repos/pjproject/trunk@5633 74dad513-b988-da41-8d7b-12977e46ad98
Nanang Izzuddin 2017-07-28 02:51:44 +00:00
parent e33d5ff658
commit 4b6c5064c3
69 changed files with 26282 additions and 16359 deletions

View File

@ -48,14 +48,14 @@ export YUV_OBJS = \
rotate.o \
rotate_common.o \
rotate_gcc.o \
rotate_mips.o \
rotate_dspr2.o \
rotate_neon64.o \
rotate_neon.o \
rotate_win.o \
row_any.o \
row_common.o \
row_gcc.o \
row_mips.o \
row_dspr2.o \
row_neon64.o \
row_neon.o \
row_win.o \
@ -64,7 +64,7 @@ export YUV_OBJS = \
scale.o \
scale_common.o \
scale_gcc.o \
scale_mips.o \
scale_dspr2.o \
scale_neon64.o \
scale_neon.o \
scale_win.o \

View File

@ -1,5 +1,7 @@
Notes:
* Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 23 June 2016.
* Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 27 July 2017.
* All code is compilable, except for compare_win.cc
- Use older version (https://chromium.googlesource.com/libyuv/libyuv/+/baf6a3c1bd385e7ffe6b7634560e71fb49e4f589%5E%21/)
Since there's a compiler error on:
@ -15,7 +17,14 @@ Notes:
__declspec(naked)
--------------------------------------------------------------------------------------
* Disable some compiler warnings which appear a lot:
- warning C4100: unreferenced formal parameter
- warning C4127: conditional expression is constant
- warning C4244: '=' : conversion from 'uint32' to 'uint8', possible loss of data
* Added these lines to file include/libyuv/basic_types.h:
--
#if _MSC_VER==1400
# include <stdint.h> // for uint8_t
#endif
...
#if defined(_MSC_VER)
# pragma warning(disable:4996) // This function or variable may be unsafe.
#endif
--

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_H_
#define INCLUDE_LIBYUV_H_
#include "libyuv/basic_types.h"
@ -29,4 +29,4 @@
#include "libyuv/version.h"
#include "libyuv/video_common.h"
#endif // INCLUDE_LIBYUV_H_ NOLINT
#endif // INCLUDE_LIBYUV_H_

View File

@ -8,17 +8,24 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
#define INCLUDE_LIBYUV_BASIC_TYPES_H_
#include <stddef.h> // for NULL, size_t
#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
#if defined(_MSC_VER) && (_MSC_VER < 1600)
#if _MSC_VER==1400
# include <stdint.h> // for uint8_t
#endif
#include <sys/types.h> // for uintptr_t on x86
#else
#include <stdint.h> // for uintptr_t
#endif
#if defined(_MSC_VER)
# pragma warning(disable:4996) // This function or variable may be unsafe.
#endif
#ifndef GG_LONGLONG
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
@ -26,31 +33,31 @@
typedef unsigned __int64 uint64;
typedef __int64 int64;
#ifndef INT64_C
#define INT64_C(x) x ## I64
#define INT64_C(x) x##I64
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UI64
#define UINT64_C(x) x##UI64
#endif
#define INT64_F "I64"
#else // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64; // NOLINT
typedef long int64; // NOLINT
typedef long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## L
#define INT64_C(x) x##L
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UL
#define UINT64_C(x) x##UL
#endif
#define INT64_F "l"
#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64; // NOLINT
typedef long long int64; // NOLINT
typedef long long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## LL
#define INT64_C(x) x##LL
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## ULL
#define UINT64_C(x) x##ULL
#endif
#define INT64_F "ll"
#endif // __LP64__
@ -58,15 +65,15 @@ typedef long long int64; // NOLINT
typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16; // NOLINT
typedef short int16; // NOLINT
typedef short int16; // NOLINT
typedef unsigned char uint8;
typedef signed char int8;
#endif // INT_TYPES_DEFINED
#endif // GG_LONGLONG
// Detect compiler is for x86 or x64.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86)
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86)
#define CPU_X86 1
#endif
// Detect compiler is for ARM.
@ -76,12 +83,12 @@ typedef signed char int8;
#ifndef ALIGNP
#ifdef __cplusplus
#define ALIGNP(p, t) \
(reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
((t) - 1)) & ~((t) - 1))))
#define ALIGNP(p, t) \
reinterpret_cast<uint8*>( \
((reinterpret_cast<uintptr_t>(p) + ((t)-1)) & ~((t)-1)))
#else
#define ALIGNP(p, t) \
((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */
(uint8*)((((uintptr_t)(p) + ((t)-1)) & ~((t)-1))) /* NOLINT */
#endif
#endif
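The ALIGNP macro rounds a pointer up to the next multiple of t (which must be a power of two). A quick worked example with illustrative addresses and a placeholder buffer pointer:
// (0x1003 + 15) & ~15 == 0x1010  -> rounded up to the next 16-byte boundary
// (0x1010 + 15) & ~15 == 0x1010  -> already aligned, unchanged
uint8* aligned = ALIGNP(raw_buf, 16);  // raw_buf is a placeholder pointer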
@ -95,9 +102,9 @@ typedef signed char int8;
#define LIBYUV_API
#endif // LIBYUV_BUILDING_SHARED_LIBRARY
#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
(defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
defined(LIBYUV_USING_SHARED_LIBRARY))
#define LIBYUV_API __attribute__ ((visibility ("default")))
(defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
defined(LIBYUV_USING_SHARED_LIBRARY))
#define LIBYUV_API __attribute__((visibility("default")))
#else
#define LIBYUV_API
#endif // __GNUC__
@ -108,11 +115,10 @@ typedef signed char int8;
#define LIBYUV_TRUE 1
// Visual C x86 or GCC little endian.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86) || \
defined(__arm__) || defined(_M_ARM) || \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define LIBYUV_LITTLE_ENDIAN
#endif
#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT
#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_COMPARE_H_
#define INCLUDE_LIBYUV_COMPARE_H_
#include "libyuv/basic_types.h"
@ -22,6 +22,12 @@ extern "C" {
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
// Hamming Distance
LIBYUV_API
uint64 ComputeHammingDistance(const uint8* src_a,
const uint8* src_b,
int count);
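A minimal usage sketch (buf_a, buf_b and num_bytes are placeholder names); the result is the total number of bit positions that differ between the two buffers:
// 0 means the buffers are bit-identical over the first num_bytes bytes.
uint64 bits_differ = ComputeHammingDistance(buf_a, buf_b, num_bytes);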
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
@ -29,13 +35,15 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
const uint8* src_b, int count);
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count);
LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height);
uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height);
static const int kMaxPsnr = 128;
@ -43,36 +51,56 @@ LIBYUV_API
double SumSquareErrorToPsnr(uint64 sse, uint64 count);
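These two calls compose naturally: compute the SSE over a frame, then map it to PSNR. A minimal sketch (frame_a, frame_b and num_bytes are placeholders):
uint64 sse = ComputeSumSquareError(frame_a, frame_b, num_bytes);
double psnr = SumSquareErrorToPsnr(sse, num_bytes);  // higher is better; identical frames hit the kMaxPsnr cap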
LIBYUV_API
double CalcFramePsnr(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height);
double CalcFramePsnr(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height);
LIBYUV_API
double I420Psnr(const uint8* src_y_a, int stride_y_a,
const uint8* src_u_a, int stride_u_a,
const uint8* src_v_a, int stride_v_a,
const uint8* src_y_b, int stride_y_b,
const uint8* src_u_b, int stride_u_b,
const uint8* src_v_b, int stride_v_b,
int width, int height);
double I420Psnr(const uint8* src_y_a,
int stride_y_a,
const uint8* src_u_a,
int stride_u_a,
const uint8* src_v_a,
int stride_v_a,
const uint8* src_y_b,
int stride_y_b,
const uint8* src_u_b,
int stride_u_b,
const uint8* src_v_b,
int stride_v_b,
int width,
int height);
LIBYUV_API
double CalcFrameSsim(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height);
double CalcFrameSsim(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height);
LIBYUV_API
double I420Ssim(const uint8* src_y_a, int stride_y_a,
const uint8* src_u_a, int stride_u_a,
const uint8* src_v_a, int stride_v_a,
const uint8* src_y_b, int stride_y_b,
const uint8* src_u_b, int stride_u_b,
const uint8* src_v_b, int stride_v_b,
int width, int height);
double I420Ssim(const uint8* src_y_a,
int stride_y_a,
const uint8* src_u_a,
int stride_u_a,
const uint8* src_v_a,
int stride_v_a,
const uint8* src_y_b,
int stride_y_b,
const uint8* src_u_b,
int stride_u_b,
const uint8* src_v_b,
int stride_v_b,
int width,
int height);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT
#endif // INCLUDE_LIBYUV_COMPARE_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_
#define INCLUDE_LIBYUV_COMPARE_ROW_H_
#include "libyuv/basic_types.h"
@ -30,8 +30,8 @@ extern "C" {
#endif
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
_MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
@ -42,16 +42,17 @@ extern "C" {
#endif // clang >= 3.4
#endif // __clang__
#if !defined(LIBYUV_DISABLE_X86) && \
defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
#endif
// The following are available for Visual C and GCC:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
#define HAS_HASHDJB2_SSE41
#define HAS_SUMSQUAREERROR_SSE2
#define HAS_HAMMINGDISTANCE_X86
#endif
// The following are available for Visual C and clangcl 32 bit:
@ -65,8 +66,13 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
#define HAS_HAMMINGDISTANCE_NEON
#endif
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
@ -81,4 +87,4 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ NOLINT
#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_

View File

@ -8,13 +8,18 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_CONVERT_H_
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
#include "libyuv/rotate.h" // For enum RotationMode.
// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@ -22,184 +27,295 @@ extern "C" {
// Convert I444 to I420.
LIBYUV_API
int I444ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I444ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I422 to I420.
LIBYUV_API
int I422ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert I411 to I420.
LIBYUV_API
int I411ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I422ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Copy I420 to I420.
#define I420ToI420 I420Copy
LIBYUV_API
int I420Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420Copy(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I400 (grey) to I420.
LIBYUV_API
int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I400ToI420(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int NV12ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert NV21 to I420.
LIBYUV_API
int NV21ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int NV21ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_vu,
int src_stride_vu,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int YUY2ToI420(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert UYVY to I420.
LIBYUV_API
int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int UYVYToI420(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert M420 to I420.
LIBYUV_API
int M420ToI420(const uint8* src_m420, int src_stride_m420,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int M420ToI420(const uint8* src_m420,
int src_stride_m420,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert Android420 to I420.
LIBYUV_API
int Android420ToI420(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
int pixel_stride_uv,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// BGRA little endian (argb in memory) to I420.
LIBYUV_API
int BGRAToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int BGRAToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// ABGR little endian (rgba in memory) to I420.
LIBYUV_API
int ABGRToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ABGRToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGBA little endian (abgr in memory) to I420.
LIBYUV_API
int RGBAToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int RGBAToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB little endian (bgr in memory) to I420.
LIBYUV_API
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int RGB24ToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB big endian (rgb in memory) to I420.
LIBYUV_API
int RAWToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int RAWToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB16 (RGBP fourcc) little endian to I420.
LIBYUV_API
int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int RGB565ToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB15 (RGBO fourcc) little endian to I420.
LIBYUV_API
int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGB1555ToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// RGB12 (R444 fourcc) little endian to I420.
LIBYUV_API
int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGB4444ToI420(const uint8* src_frame,
int src_stride_frame,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
#ifdef HAVE_JPEG
// src_width/height provided by capture.
// dst_width/height for clipping determine final size.
LIBYUV_API
int MJPGToI420(const uint8* sample, size_t sample_size,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int src_width, int src_height,
int dst_width, int dst_height);
int MJPGToI420(const uint8* sample,
size_t sample_size,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int src_width,
int src_height,
int dst_width,
int dst_height);
// Query size of MJPG in pixels.
LIBYUV_API
int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height);
int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height);
#endif
// Convert camera sample to I420 with cropping, rotation and vertical flip.
@ -225,13 +341,20 @@ int MJPGSize(const uint8* sample, size_t sample_size,
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToI420(const uint8* src_frame, size_t src_size,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
int ConvertToI420(const uint8* src_frame,
size_t src_size,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int crop_x,
int crop_y,
int src_width,
int src_height,
int crop_width,
int crop_height,
enum RotationMode rotation,
uint32 format);
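A usage sketch for the prototype above (cam_buf, w and h are placeholders; FOURCC_YUY2 from video_common.h and kRotate0 from rotate.h are assumed): convert a full YUY2 camera frame with no cropping or rotation.
int r = ConvertToI420(cam_buf, cam_buf_size,
                      dst_y, w,            // Y plane, stride = width
                      dst_u, (w + 1) / 2,  // U plane, half-width stride
                      dst_v, (w + 1) / 2,  // V plane, half-width stride
                      0, 0,                // crop_x, crop_y
                      w, h,                // src_width, src_height
                      w, h,                // crop_width, crop_height
                      kRotate0, FOURCC_YUY2);
// r is 0 on success, -1 on invalid parameter.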
@ -240,4 +363,4 @@ int ConvertToI420(const uint8* src_frame, size_t src_size,
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT
#endif // INCLUDE_LIBYUV_CONVERT_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
#include "libyuv/basic_types.h"
@ -30,246 +30,384 @@ extern "C" {
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopy(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Duplicate prototype for function in convert_from.h for remoting.
LIBYUV_API
int I420ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I422 to ARGB.
LIBYUV_API
int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I422ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I444 to ARGB.
LIBYUV_API
int I444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I444ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J444ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I444 to ABGR.
LIBYUV_API
int I444ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert I411 to ARGB.
LIBYUV_API
int I411ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I444ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert I420 with Alpha to preattenuated ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int attenuate);
int I420AlphaToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
const uint8* src_a,
int src_stride_a,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height,
int attenuate);
// Convert I420 with Alpha to preattenuated ABGR.
LIBYUV_API
int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height, int attenuate);
int I420AlphaToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
const uint8* src_a,
int src_stride_a,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height,
int attenuate);
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I400ToARGB(const uint8* src_y,
int src_stride_y,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J400 (jpeg grey) to ARGB.
LIBYUV_API
int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J400ToARGB(const uint8* src_y,
int src_stride_y,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Alias.
#define YToARGB I400ToARGB
// Convert NV12 to ARGB.
LIBYUV_API
int NV12ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int NV12ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert NV21 to ARGB.
LIBYUV_API
int NV21ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int NV21ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_vu,
int src_stride_vu,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert M420 to ARGB.
LIBYUV_API
int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int M420ToARGB(const uint8* src_m420,
int src_stride_m420,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int YUY2ToARGB(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert UYVY to ARGB.
LIBYUV_API
int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int UYVYToARGB(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J420ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J422ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int J420ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int J422ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert H420 to ARGB.
LIBYUV_API
int H420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int H420ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert H422 to ARGB.
LIBYUV_API
int H422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int H422ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert H420 to ABGR.
LIBYUV_API
int H420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int H420ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert H422 to ABGR.
LIBYUV_API
int H422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int H422ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int BGRAToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// ABGR little endian (rgba in memory) to ARGB.
LIBYUV_API
int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ABGRToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGBA little endian (abgr in memory) to ARGB.
LIBYUV_API
int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int RGBAToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Deprecated function name.
#define BG24ToARGB RGB24ToARGB
// RGB little endian (bgr in memory) to ARGB.
LIBYUV_API
int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int RGB24ToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB big endian (rgb in memory) to ARGB.
LIBYUV_API
int RAWToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int RAWToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB16 (RGBP fourcc) little endian to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int RGB565ToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB15 (RGBO fourcc) little endian to ARGB.
LIBYUV_API
int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGB1555ToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB12 (R444 fourcc) little endian to ARGB.
LIBYUV_API
int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGB4444ToARGB(const uint8* src_frame,
int src_stride_frame,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
#ifdef HAVE_JPEG
// src_width/height provided by capture
// dst_width/height for clipping determine final size.
LIBYUV_API
int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* dst_argb, int dst_stride_argb,
int src_width, int src_height,
int dst_width, int dst_height);
int MJPGToARGB(const uint8* sample,
size_t sample_size,
uint8* dst_argb,
int dst_stride_argb,
int src_width,
int src_height,
int dst_width,
int dst_height);
#endif
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
@ -295,11 +433,16 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToARGB(const uint8* src_frame, size_t src_size,
uint8* dst_argb, int dst_stride_argb,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
int ConvertToARGB(const uint8* src_frame,
size_t src_size,
uint8* dst_argb,
int dst_stride_argb,
int crop_x,
int crop_y,
int src_width,
int src_height,
int crop_width,
int crop_height,
enum RotationMode rotation,
uint32 format);
@ -308,4 +451,4 @@ int ConvertToARGB(const uint8* src_frame, size_t src_size,
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT
#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_
#define INCLUDE_LIBYUV_CONVERT_FROM_H_
#include "libyuv/basic_types.h"
@ -24,151 +24,249 @@ extern "C" {
// I420Copy in convert to I420ToI420.
LIBYUV_API
int I420ToI422(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420ToI422(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
LIBYUV_API
int I420ToI444(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
LIBYUV_API
int I420ToI411(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420ToI444(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
LIBYUV_API
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
int I400Copy(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
int I420ToNV12(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
LIBYUV_API
int I420ToNV21(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height);
int I420ToNV21(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_vu,
int dst_stride_vu,
int width,
int height);
LIBYUV_API
int I420ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToYUY2(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToUYVY(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToARGB(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
LIBYUV_API
int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToBGRA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
LIBYUV_API
int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
LIBYUV_API
int I420ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
int I420ToRGBA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgba,
int dst_stride_rgba,
int width,
int height);
LIBYUV_API
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToRGB24(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToRAW(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToRAW(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToRGB565(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I422ToRGB565(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
const uint8* dither4x4, int width, int height);
int I420ToRGB565Dither(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
const uint8* dither4x4,
int width,
int height);
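An illustrative dither matrix for the call above (a Bayer-style 4x4 pattern scaled into the recommended 0..7 range; buffer and stride names are placeholders):
static const uint8 kDither4x4[16] = {  // first byte is the upper-left cell
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};
I420ToRGB565Dither(src_y, stride_y, src_u, stride_u, src_v, stride_v,
                   dst_rgb565, width * 2,  // RGB565 is 2 bytes per pixel
                   kDither4x4, width, height);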
LIBYUV_API
int I420ToARGB1555(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToARGB1555(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
LIBYUV_API
int I420ToARGB4444(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I420ToARGB4444(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
LIBYUV_API
int ConvertFromI420(const uint8* y, int y_stride,
const uint8* u, int u_stride,
const uint8* v, int v_stride,
uint8* dst_sample, int dst_sample_stride,
int width, int height,
int ConvertFromI420(const uint8* y,
int y_stride,
const uint8* u,
int u_stride,
const uint8* v,
int v_stride,
uint8* dst_sample,
int dst_sample_stride,
int width,
int height,
uint32 format);
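A brief sketch of the stride convention described above (argb_buf and the plane pointers are placeholders; FOURCC_ARGB from video_common.h is assumed): passing 0 for dst_sample_stride tells libyuv the destination rows are contiguous.
ConvertFromI420(y, y_stride, u, u_stride, v, v_stride,
                argb_buf, 0,  // 0 stride => contiguous rows (width * 4 bytes for ARGB)
                width, height, FOURCC_ARGB);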
#ifdef __cplusplus
@ -176,4 +274,4 @@ int ConvertFromI420(const uint8* y, int y_stride,
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT
#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
#include "libyuv/basic_types.h"
@ -21,45 +21,66 @@ extern "C" {
// Copy ARGB to ARGB.
#define ARGBToARGB ARGBCopy
LIBYUV_API
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopy(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert ARGB To BGRA.
LIBYUV_API
int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height);
int ARGBToBGRA(const uint8* src_argb,
int src_stride_argb,
uint8* dst_bgra,
int dst_stride_bgra,
int width,
int height);
// Convert ARGB To ABGR.
LIBYUV_API
int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int ARGBToABGR(const uint8* src_argb,
int src_stride_argb,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert ARGB To RGBA.
LIBYUV_API
int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
int ARGBToRGBA(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgba,
int dst_stride_rgba,
int width,
int height);
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height);
int ARGBToRGB24(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb24,
int dst_stride_rgb24,
int width,
int height);
// Convert ARGB To RAW.
LIBYUV_API
int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb, int dst_stride_rgb,
int width, int height);
int ARGBToRAW(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb,
int dst_stride_rgb,
int width,
int height);
// Convert ARGB To RGB565.
LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
int ARGBToRGB565(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb565,
int dst_stride_rgb565,
int width,
int height);
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
@ -67,124 +88,178 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
// const uint8(*dither)[4][4];
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither4x4, int width, int height);
int ARGBToRGB565Dither(const uint8* src_argb,
int src_stride_argb,
uint8* dst_rgb565,
int dst_stride_rgb565,
const uint8* dither4x4,
int width,
int height);
// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb1555, int dst_stride_argb1555,
int width, int height);
int ARGBToARGB1555(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb1555,
int dst_stride_argb1555,
int width,
int height);
// Convert ARGB To ARGB4444.
LIBYUV_API
int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb4444, int dst_stride_argb4444,
int width, int height);
int ARGBToARGB4444(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb4444,
int dst_stride_argb4444,
int width,
int height);
// Convert ARGB To I444.
LIBYUV_API
int ARGBToI444(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToI444(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB To I422.
LIBYUV_API
int ARGBToI422(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToI422(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB To I420. (also in convert.h)
LIBYUV_API
int ARGBToI420(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToI420(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J420. (JPeg full range I420).
LIBYUV_API
int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToJ420(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J422.
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert ARGB To I411.
LIBYUV_API
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int ARGBToJ422(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J400. (JPeg full range).
LIBYUV_API
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
int width, int height);
int ARGBToJ400(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yj,
int dst_stride_yj,
int width,
int height);
// Convert ARGB to I400.
LIBYUV_API
int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
int ARGBToI400(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
LIBYUV_API
int ARGBToG(const uint8* src_argb, int src_stride_argb,
uint8* dst_g, int dst_stride_g,
int width, int height);
int ARGBToG(const uint8* src_argb,
int src_stride_argb,
uint8* dst_g,
int dst_stride_g,
int width,
int height);
// Convert ARGB To NV12.
LIBYUV_API
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
int ARGBToNV12(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert ARGB To NV21.
LIBYUV_API
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height);
int ARGBToNV21(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_vu,
int dst_stride_vu,
int width,
int height);
// Convert ARGB To NV21.
LIBYUV_API
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_vu, int dst_stride_vu,
int width, int height);
int ARGBToNV21(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
uint8* dst_vu,
int dst_stride_vu,
int width,
int height);
// Convert ARGB To YUY2.
LIBYUV_API
int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height);
int ARGBToYUY2(const uint8* src_argb,
int src_stride_argb,
uint8* dst_yuy2,
int dst_stride_yuy2,
int width,
int height);
// Convert ARGB To UYVY.
LIBYUV_API
int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height);
int ARGBToUYVY(const uint8* src_argb,
int src_stride_argb,
uint8* dst_uyvy,
int dst_stride_uyvy,
int width,
int height);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_CPU_ID_H_
#define INCLUDE_LIBYUV_CPU_ID_H_
#include "libyuv/basic_types.h"
@ -31,50 +31,61 @@ static const int kCpuHasX86 = 0x10;
static const int kCpuHasSSE2 = 0x20;
static const int kCpuHasSSSE3 = 0x40;
static const int kCpuHasSSE41 = 0x80;
static const int kCpuHasSSE42 = 0x100;
static const int kCpuHasSSE42 = 0x100; // unused at this time.
static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasAVX3 = 0x2000;
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
static const int kCpuHasF16C = 0x4000;
// 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasDSPR2 = 0x20000;
static const int kCpuHasMSA = 0x40000;
// Internal function used to auto-init.
// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
LIBYUV_API
int InitCpuFlags(void);
// Detect CPU has SSE2 etc.
// Test_flag parameter should be one of kCpuHas constants above.
// Returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API extern int cpu_info_;
#ifdef __ATOMIC_RELAXED
int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
#else
int cpu_info = cpu_info_;
#endif
return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
}
// Internal function for parsing /proc/cpuinfo.
LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
// Detect CPU has SSE2 etc.
// Test_flag parameter should be one of kCpuHas constants above.
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API extern int cpu_info_;
return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
}
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// MaskCpuFlags(-1) to enable all cpu specific optimizations.
// MaskCpuFlags(1) to disable all cpu specific optimizations.
// MaskCpuFlags(0) to reset state so next call will auto init.
// Returns cpu_info flags.
LIBYUV_API
void MaskCpuFlags(int enable_flags);
int MaskCpuFlags(int enable_flags);
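Putting TestCpuFlag and MaskCpuFlags together, following the comments above (a minimal sketch):
if (TestCpuFlag(kCpuHasAVX2)) {
  // libyuv will pick AVX2 row functions automatically; nothing extra to do.
}
MaskCpuFlags(1);   // testing only: force the portable C paths
MaskCpuFlags(-1);  // re-enable all cpu specific optimizations
MaskCpuFlags(0);   // reset so the next TestCpuFlag call auto-inits again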
// Low level cpuid for X86. Returns zeros on other CPUs.
// eax is the info type that you want.
// ecx is typically the cpu number, and should normally be zero.
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
void CpuId(int eax, int ecx, int* cpu_info);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT
#endif // INCLUDE_LIBYUV_CPU_ID_H_

View File

@ -0,0 +1,233 @@
/*
* Copyright 2016 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
#define INCLUDE_LIBYUV_MACROS_MSA_H_
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include <msa.h>
#include <stdint.h>
#if (__mips_isa_rev >= 6)
#define LW(psrc) \
({ \
uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
uint32 val_m; \
asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
: [val_m] "=r"(val_m) \
: [psrc_lw_m] "m"(*psrc_lw_m)); \
val_m; \
})
#if (__mips == 64)
#define LD(psrc) \
({ \
uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
uint64 val_m = 0; \
asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
: [val_m] "=r"(val_m) \
: [psrc_ld_m] "m"(*psrc_ld_m)); \
val_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
({ \
uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
uint32 val0_m, val1_m; \
uint64 val_m = 0; \
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
val_m = (uint64)(val1_m); /* NOLINT */ \
val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
val_m; \
})
#endif // (__mips == 64)
#define SW(val, pdst) \
({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \
asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \
})
#if (__mips == 64)
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint64_t val_m = (val); \
asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
: [pdst_sd_m] "=m"(*pdst_sd_m) \
: [val_m] "r"(val_m)); \
})
#else // !(__mips == 64)
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val0_m, val1_m; \
val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
SW(val0_m, pdst_sd_m); \
SW(val1_m, pdst_sd_m + 4); \
})
#endif // !(__mips == 64)
#else // !(__mips_isa_rev >= 6)
#define LW(psrc) \
({ \
uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
uint32 val_m; \
asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
: [val_m] "=r"(val_m) \
: [psrc_lw_m] "m"(*psrc_lw_m)); \
val_m; \
})
#if (__mips == 64)
#define LD(psrc) \
({ \
uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
uint64 val_m = 0; \
asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
: [val_m] "=r"(val_m) \
: [psrc_ld_m] "m"(*psrc_ld_m)); \
val_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
({ \
uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
uint32 val0_m, val1_m; \
uint64 val_m = 0; \
val0_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \
val_m = (uint64)(val1_m); /* NOLINT */ \
val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
val_m; \
})
#endif // (__mips == 64)
#define SW(val, pdst) \
({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \
asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \
})
#define SD(val, pdst) \
({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val0_m, val1_m; \
val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
SW(val0_m, pdst_sd_m); \
SW(val1_m, pdst_sd_m + 4); \
})
#endif // (__mips_isa_rev >= 6)
// TODO(fbarchard): Consider removing __VAR_ARGS versions.
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
/* Description : Load two vectors with 16 'byte' sized elements
Arguments : Inputs - psrc, stride
Outputs - out0, out1
Return Type - as per RTYPE
Details : Load 16 byte elements in 'out0' from (psrc)
Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
{ \
out0 = LD_B(RTYPE, (psrc)); \
out1 = LD_B(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
LD_B2(RTYPE, (psrc), stride, out0, out1); \
LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
/* Description : Store two vectors with stride each having 16 'byte' sized
elements
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 16 byte elements from 'in0' to (pdst)
Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_B(RTYPE, in0, (pdst)); \
ST_B(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
ST_B2(RTYPE, in0, in1, (pdst), stride); \
ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 8 halfword elements from 'in0' to (pdst)
Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_H(RTYPE, in0, (pdst)); \
ST_H(RTYPE, in1, (pdst) + stride); \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
/* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Byte elements from 'in0' & 'in1' are copied selectively to
'out0' as per control vector 'mask0'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Right half of byte elements from 'in0' and 'in1' are
interleaved and written to 'out0'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
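As a hedged illustration (not part of the header), these helpers combine into small row operations; the hypothetical sketch below merges 16 U and 16 V bytes into interleaved UV, assuming 16-byte-aligned pointers and an MSA build.
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
static void MergeUV16_MSA(const uint8_t* src_u, const uint8_t* src_v,
                          uint8_t* dst_uv) {
  v16u8 u = LD_UB(src_u);       // 16 U bytes
  v16u8 v = LD_UB(src_v);       // 16 V bytes
  v16u8 uv0, uv1;
  ILVRL_B2_UB(v, u, uv0, uv1);  // interleave to U0 V0 U1 V1 ...
  ST_UB(uv0, dst_uv);           // first 16 interleaved bytes
  ST_UB(uv1, dst_uv + 16);      // remaining 16 bytes
}
#endif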
#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
#endif // INCLUDE_LIBYUV_MACROS_MSA_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_
#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
#include "libyuv/basic_types.h"
@ -37,7 +37,6 @@ static const uint32 kUnknownDataSize = 0xFFFFFFFF;
enum JpegSubsamplingType {
kJpegYuv420,
kJpegYuv422,
kJpegYuv411,
kJpegYuv444,
kJpegYuv400,
kJpegUnknown
@ -145,12 +144,16 @@ class LIBYUV_API MJpegDecoder {
// callback function. Each call will get the data for a whole number of
// image scanlines.
// TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
int dst_width, int dst_height);
LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
void* opaque,
int dst_width,
int dst_height);
// The helper function which recognizes the jpeg sub-sampling type.
static JpegSubsamplingType JpegSubsamplingTypeHelper(
int* subsample_x, int* subsample_y, int number_of_components);
int* subsample_x,
int* subsample_y,
int number_of_components);
private:
void AllocOutputBuffers(int num_outbufs);
@ -189,4 +192,4 @@ class LIBYUV_API MJpegDecoder {
} // namespace libyuv
#endif // __cplusplus
#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT
#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
#include "libyuv/basic_types.h"
@ -24,86 +24,164 @@ extern "C" {
// Copy a plane of data.
LIBYUV_API
void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
void CopyPlane(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
LIBYUV_API
void CopyPlane_16(const uint16* src_y, int src_stride_y,
uint16* dst_y, int dst_stride_y,
int width, int height);
void CopyPlane_16(const uint16* src_y,
int src_stride_y,
uint16* dst_y,
int dst_stride_y,
int width,
int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
void SetPlane(uint8* dst_y,
int dst_stride_y,
int width,
int height,
uint32 value);
// Split interleaved UV plane into separate U and V planes.
LIBYUV_API
void SplitUVPlane(const uint8* src_uv,
int src_stride_uv,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Merge separate U and V planes into one interleaved UV plane.
LIBYUV_API
void MergeUVPlane(const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
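For illustration (an assumption, not part of this change), SplitUVPlane is the natural building block for de-interleaving NV12 chroma into I420 U and V planes.
// Sketch: split NV12 chroma into I420 U/V planes.
#include "libyuv/planar_functions.h"
static void Nv12ChromaToI420(const uint8* src_uv, int src_stride_uv,
                             uint8* dst_u, int dst_stride_u,
                             uint8* dst_v, int dst_stride_v,
                             int width, int height) {
  int halfwidth = (width + 1) >> 1;    // chroma is subsampled 2x2
  int halfheight = (height + 1) >> 1;
  SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u,
               dst_v, dst_stride_v, halfwidth, halfheight);
}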
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
int I400ToI400(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
#define J400ToJ400 I400ToI400
// Copy I422 to I422.
#define I422ToI422 I422Copy
LIBYUV_API
int I422Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I422Copy(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Copy I444 to I444.
#define I444ToI444 I444Copy
LIBYUV_API
int I444Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I444Copy(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int YUY2ToI422(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Convert UYVY to I422.
LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int UYVYToI422(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
int YUY2ToNV12(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
LIBYUV_API
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
int UYVYToNV12(const uint8* src_uyvy,
int src_stride_uyvy,
uint8* dst_y,
int dst_stride_y,
uint8* dst_uv,
int dst_stride_uv,
int width,
int height);
LIBYUV_API
int YUY2ToY(const uint8* src_yuy2,
int src_stride_yuy2,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
int width, int height);
int I420ToI400(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Alias
#define J420ToJ400 I420ToI400
@ -111,13 +189,20 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
// I420 mirror.
LIBYUV_API
int I420Mirror(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420Mirror(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Alias
#define I400ToI400Mirror I400Mirror
@ -125,87 +210,139 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
// I400 mirror. A single plane is mirrored horizontally.
// Pass negative height to achieve 180 degree rotation.
LIBYUV_API
int I400Mirror(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
int I400Mirror(const uint8* src_y,
int src_stride_y,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBMirror(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
int NV12ToRGB565(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_rgb565,
int dst_stride_rgb565,
int width,
int height);
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
int I422ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height);
int I422ToBGRA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_bgra,
int dst_stride_bgra,
int width,
int height);
// Convert I422 to ABGR.
LIBYUV_API
int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
int I422ToABGR(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert I422 to RGBA.
LIBYUV_API
int I422ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
int I422ToRGBA(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_rgba,
int dst_stride_rgba,
int width,
int height);
// Alias
#define RGB24ToRAW RAWToRGB24
LIBYUV_API
int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height);
int RAWToRGB24(const uint8* src_raw,
int src_stride_raw,
uint8* dst_rgb24,
int dst_stride_rgb24,
int width,
int height);
// Draw a rectangle into I420.
LIBYUV_API
int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int x, int y, int width, int height,
int value_y, int value_u, int value_v);
int I420Rect(uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int x,
int y,
int width,
int height,
int value_y,
int value_u,
int value_v);
// Draw a rectangle into ARGB.
LIBYUV_API
int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height, uint32 value);
int ARGBRect(uint8* dst_argb,
int dst_stride_argb,
int x,
int y,
int width,
int height,
uint32 value);
// Convert ARGB to gray scale ARGB.
LIBYUV_API
int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBGrayTo(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Make a rectangle of ARGB gray scale.
LIBYUV_API
int ARGBGray(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height);
int ARGBGray(uint8* dst_argb,
int dst_stride_argb,
int x,
int y,
int width,
int height);
// Make a rectangle of ARGB Sepia tone.
LIBYUV_API
int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height);
int ARGBSepia(uint8* dst_argb,
int dst_stride_argb,
int x,
int y,
int width,
int height);
// Apply a matrix rotation to each ARGB pixel.
// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
@ -214,10 +351,13 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
// The next 4 coefficients apply to B, G, R, A and produce R of the output.
// The last 4 coefficients apply to B, G, R, A and produce A of the output.
LIBYUV_API
int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int ARGBColorMatrix(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
const int8* matrix_argb,
int width, int height);
int width,
int height);
// Deprecated. Use ARGBColorMatrix instead.
// Apply a matrix rotation to each ARGB pixel.
@ -226,32 +366,47 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
// The next 4 coefficients apply to B, G, R, A and produce G of the output.
// The last 4 coefficients apply to B, G, R, A and produce R of the output.
LIBYUV_API
int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
int RGBColorMatrix(uint8* dst_argb,
int dst_stride_argb,
const int8* matrix_rgb,
int x, int y, int width, int height);
int x,
int y,
int width,
int height);
// Apply a color table each ARGB pixel.
// Table contains 256 ARGB values.
LIBYUV_API
int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
int ARGBColorTable(uint8* dst_argb,
int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
int x,
int y,
int width,
int height);
// Apply a color table to each ARGB pixel but preserve destination alpha.
// Table contains 256 ARGB values.
LIBYUV_API
int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
int RGBColorTable(uint8* dst_argb,
int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
int x,
int y,
int width,
int height);
// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
// RGB (YJ style) and C is an 8 bit color component (R, G or B).
LIBYUV_API
int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int ARGBLumaColorTable(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
const uint8* luma_rgb_table,
int width, int height);
int width,
int height);
// Apply a 3 term polynomial to ARGB values.
// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
@ -262,46 +417,80 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
// A polynomial approximation can be derived using software such as 'R'.
LIBYUV_API
int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int ARGBPolynomial(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
const float* poly,
int width, int height);
int width,
int height);
// Convert plane of 16 bit shorts to half floats.
// Source values are multiplied by scale before storing as half float.
LIBYUV_API
int HalfFloatPlane(const uint16* src_y,
int src_stride_y,
uint16* dst_y,
int dst_stride_y,
float scale,
int width,
int height);
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
// interval_offset should be a value between 0 and 255.
LIBYUV_API
int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
int scale, int interval_size, int interval_offset,
int x, int y, int width, int height);
int ARGBQuantize(uint8* dst_argb,
int dst_stride_argb,
int scale,
int interval_size,
int interval_offset,
int x,
int y,
int width,
int height);
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopy(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Copy Alpha channel of ARGB to alpha of ARGB.
LIBYUV_API
int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopyAlpha(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Extract the alpha channel from ARGB.
LIBYUV_API
int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_a, int dst_stride_a,
int width, int height);
int ARGBExtractAlpha(const uint8* src_argb,
int src_stride_argb,
uint8* dst_a,
int dst_stride_a,
int width,
int height);
// Copy Y channel to Alpha of ARGB.
LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBCopyYToAlpha(const uint8* src_y,
int src_stride_y,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
typedef void (*ARGBBlendRow)(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb,
int width);
// Get function to Alpha Blend ARGB pixels and store to destination.
LIBYUV_API
@ -311,92 +500,143 @@ ARGBBlendRow GetARGBBlend();
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBBlend(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Alpha Blend plane and store to destination.
// Source is not pre-multiplied by alpha.
LIBYUV_API
int BlendPlane(const uint8* src_y0, int src_stride_y0,
const uint8* src_y1, int src_stride_y1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height);
int BlendPlane(const uint8* src_y0,
int src_stride_y0,
const uint8* src_y1,
int src_stride_y1,
const uint8* alpha,
int alpha_stride,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Alpha Blend YUV images and store to destination.
// Source is not pre-multiplied by alpha.
// Alpha is full width x height and subsampled to half size to apply to UV.
LIBYUV_API
int I420Blend(const uint8* src_y0, int src_stride_y0,
const uint8* src_u0, int src_stride_u0,
const uint8* src_v0, int src_stride_v0,
const uint8* src_y1, int src_stride_y1,
const uint8* src_u1, int src_stride_u1,
const uint8* src_v1, int src_stride_v1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
int I420Blend(const uint8* src_y0,
int src_stride_y0,
const uint8* src_u0,
int src_stride_u0,
const uint8* src_v0,
int src_stride_v0,
const uint8* src_y1,
int src_stride_y1,
const uint8* src_u1,
int src_stride_u1,
const uint8* src_v1,
int src_stride_v1,
const uint8* alpha,
int alpha_stride,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height);
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBMultiply(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Add ARGB image with ARGB image. Saturates to 255.
LIBYUV_API
int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBAdd(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
LIBYUV_API
int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBSubtract(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I422 to YUY2.
LIBYUV_API
int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I422ToYUY2(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
// Convert I422 to UYVY.
LIBYUV_API
int I422ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
int I422ToUYVY(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_frame,
int dst_stride_frame,
int width,
int height);
// Convert unattenuated ARGB to preattenuated ARGB.
LIBYUV_API
int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBAttenuate(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert preattenuated ARGB to unattenuated ARGB.
LIBYUV_API
int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBUnattenuate(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height);
int ARGBComputeCumulativeSum(const uint8* src_argb,
int src_stride_argb,
int32* dst_cumsum,
int dst_stride32_cumsum,
int width,
int height);
// Blur ARGB image.
// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
@ -405,49 +645,79 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5.
// Blur is optimized for radius of 5 (11x11) or less.
LIBYUV_API
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height, int radius);
int ARGBBlur(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int32* dst_cumsum,
int dst_stride32_cumsum,
int width,
int height,
int radius);
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
int ARGBShade(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height,
uint32 value);
// Interpolate between two images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
// and 255 means 1% src0 and 99% src1.
LIBYUV_API
int InterpolatePlane(const uint8* src0, int src_stride0,
const uint8* src1, int src_stride1,
uint8* dst, int dst_stride,
int width, int height, int interpolation);
int InterpolatePlane(const uint8* src0,
int src_stride0,
const uint8* src1,
int src_stride1,
uint8* dst,
int dst_stride,
int width,
int height,
int interpolation);
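For example (a sketch, not part of the diff), a 50/50 cross-fade of two same-size planes passes interpolation = 128, halfway between 0 (100% src0) and 255 (~100% src1).
#include "libyuv/planar_functions.h"
// Sketch: halfway blend of two Y planes.
static void HalfBlendPlane(const uint8* src0, int stride0,
                           const uint8* src1, int stride1,
                           uint8* dst, int dst_stride, int width, int height) {
  InterpolatePlane(src0, stride0, src1, stride1, dst, dst_stride,
                   width, height, 128);
}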
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation);
int ARGBInterpolate(const uint8* src_argb0,
int src_stride_argb0,
const uint8* src_argb1,
int src_stride_argb1,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height,
int interpolation);
// Interpolate between two YUV images using specified amount of interpolation
// Internally calls InterpolatePlane on each plane where the U and V planes
// are half width and half height.
LIBYUV_API
int I420Interpolate(const uint8* src0_y, int src0_stride_y,
const uint8* src0_u, int src0_stride_u,
const uint8* src0_v, int src0_stride_v,
const uint8* src1_y, int src1_stride_y,
const uint8* src1_u, int src1_stride_u,
const uint8* src1_v, int src1_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height, int interpolation);
int I420Interpolate(const uint8* src0_y,
int src0_stride_y,
const uint8* src0_u,
int src0_stride_u,
const uint8* src0_v,
int src0_stride_v,
const uint8* src1_y,
int src1_stride_y,
const uint8* src1_u,
int src1_stride_u,
const uint8* src1_v,
int src1_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height,
int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
@ -468,40 +738,59 @@ int I420Interpolate(const uint8* src0_y, int src0_stride_y,
// Row function for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
void ARGBAffineRow_C(const uint8* src_argb,
int src_argb_stride,
uint8* dst_argb,
const float* uv_dudv,
int width);
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
void ARGBAffineRow_SSE2(const uint8* src_argb,
int src_argb_stride,
uint8* dst_argb,
const float* uv_dudv,
int width);
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
// shuffler is 16 bytes and must be aligned.
LIBYUV_API
int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_argb, int dst_stride_argb,
const uint8* shuffler, int width, int height);
int ARGBShuffle(const uint8* src_bgra,
int src_stride_bgra,
uint8* dst_argb,
int dst_stride_argb,
const uint8* shuffler,
int width,
int height);
// Sobel ARGB effect with planar output.
LIBYUV_API
int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
int ARGBSobelToPlane(const uint8* src_argb,
int src_stride_argb,
uint8* dst_y,
int dst_stride_y,
int width,
int height);
// Sobel ARGB effect.
LIBYUV_API
int ARGBSobel(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBSobel(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
LIBYUV_API
int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int ARGBSobelXY(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT
#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_ROTATE_H_
#define INCLUDE_LIBYUV_ROTATE_H_
#include "libyuv/basic_types.h"
@ -20,8 +20,8 @@ extern "C" {
// Supported rotation.
typedef enum RotationMode {
kRotate0 = 0, // No rotation.
kRotate90 = 90, // Rotate 90 degrees clockwise.
kRotate0 = 0, // No rotation.
kRotate90 = 90, // Rotate 90 degrees clockwise.
kRotate180 = 180, // Rotate 180 degrees.
kRotate270 = 270, // Rotate 270 degrees clockwise.
@ -33,85 +33,132 @@ typedef enum RotationMode {
// Rotate I420 frame.
LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int src_width, int src_height, enum RotationMode mode);
int I420Rotate(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int src_width,
int src_height,
enum RotationMode mode);
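As a hedged sketch (assumption, not from this change): for kRotate90 and kRotate270 the output image is height x width, so the destination buffers and strides follow the rotated dimensions.
#include "libyuv/rotate.h"
// Sketch: rotate an I420 frame 90 degrees clockwise.
static int RotateI420_90(const uint8* src_y, int src_stride_y,
                         const uint8* src_u, int src_stride_u,
                         const uint8* src_v, int src_stride_v,
                         uint8* dst_y, uint8* dst_u, uint8* dst_v,
                         int width, int height) {
  int dst_stride_y = height;               // rotated luma row length
  int dst_stride_uv = (height + 1) >> 1;   // rotated chroma row length
  return I420Rotate(src_y, src_stride_y, src_u, src_stride_u,
                    src_v, src_stride_v, dst_y, dst_stride_y,
                    dst_u, dst_stride_uv, dst_v, dst_stride_uv,
                    width, height, kRotate90);
}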
// Rotate NV12 input and store in I420.
LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int src_width, int src_height, enum RotationMode mode);
int NV12ToI420Rotate(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int src_width,
int src_height,
enum RotationMode mode);
// Rotate a plane by 0, 90, 180, or 270.
LIBYUV_API
int RotatePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int src_width, int src_height, enum RotationMode mode);
int RotatePlane(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int src_width,
int src_height,
enum RotationMode mode);
// Rotate planes by 90, 180, 270. Deprecated.
LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height);
void RotatePlane90(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height);
void RotatePlane180(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height);
void RotatePlane270(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void RotateUV90(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
// Rotations for when U and V are interleaved.
// These functions take one input pointer and
// split the data into two buffers while
// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void RotateUV180(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void RotateUV270(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
// The 90 and 270 functions are based on transposes.
// Doing a transpose with reversing the read/write
// order will result in a rotation by +- 90 degrees.
// Deprecated.
LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height);
void TransposePlane(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void TransposeUV(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT
#endif // INCLUDE_LIBYUV_ROTATE_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_
#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
#include "libyuv/basic_types.h"
@ -21,13 +21,17 @@ extern "C" {
// Rotate ARGB frame
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int src_width, int src_height, enum RotationMode mode);
int ARGBRotate(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int src_width,
int src_height,
enum RotationMode mode);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT
#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_
#define INCLUDE_LIBYUV_ROTATE_ROW_H_
#include "libyuv/basic_types.h"
@ -36,7 +36,8 @@ extern "C" {
// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
(defined(__i386__) || \
(defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
@ -53,69 +54,175 @@ extern "C" {
#define HAS_TRANSPOSEUVWX8_NEON
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSEWX8_DSPR2
#define HAS_TRANSPOSEUVWX8_DSPR2
#endif // defined(__mips__)
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height);
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_TRANSPOSEWX16_MSA
#define HAS_TRANSPOSEUVWX16_MSA
#endif
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWxH_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height);
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx16_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_NEON(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Fast_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Fast_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx16_MSA(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void TransposeWx8_Any_NEON(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Any_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx8_Any_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeWx16_Any_MSA(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width);
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWxH_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx16_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_SSE2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_NEON(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_DSPR2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx16_MSA(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_Any_SSE2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_Any_NEON(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx8_Any_DSPR2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
void TransposeUVWx16_Any_MSA(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT
#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_

File diff suppressed because it is too large

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_SCALE_H_
#define INCLUDE_LIBYUV_SCALE_H_
#include "libyuv/basic_types.h"
@ -20,25 +20,33 @@ extern "C" {
// Supported filtering.
typedef enum FilterMode {
kFilterNone = 0, // Point sample; Fastest.
kFilterLinear = 1, // Filter horizontally only.
kFilterNone = 0, // Point sample; Fastest.
kFilterLinear = 1, // Filter horizontally only.
kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
kFilterBox = 3 // Highest quality.
kFilterBox = 3 // Highest quality.
} FilterModeEnum;
// Scale a YUV plane.
LIBYUV_API
void ScalePlane(const uint8* src, int src_stride,
int src_width, int src_height,
uint8* dst, int dst_stride,
int dst_width, int dst_height,
void ScalePlane(const uint8* src,
int src_stride,
int src_width,
int src_height,
uint8* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering);
LIBYUV_API
void ScalePlane_16(const uint16* src, int src_stride,
int src_width, int src_height,
uint16* dst, int dst_stride,
int dst_width, int dst_height,
void ScalePlane_16(const uint16* src,
int src_stride,
int src_width,
int src_height,
uint16* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering);
// Scales a YUV 4:2:0 image from the src width and height to the
@ -52,42 +60,73 @@ void ScalePlane_16(const uint16* src, int src_stride,
// Returns 0 if successful.
LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int dst_width, int dst_height,
int I420Scale(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
int src_width,
int src_height,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering);
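For illustration (an assumption, not part of the diff), halving a 1280x720 I420 frame with box filtering; the strides here simply equal the plane widths.
#include "libyuv/scale.h"
// Sketch: downscale 1280x720 I420 to 640x360 with the highest-quality filter.
static int Halve720p(const uint8* src_y, const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  return I420Scale(src_y, 1280, src_u, 640, src_v, 640, 1280, 720,
                   dst_y, 640, dst_u, 320, dst_v, 320, 640, 360,
                   kFilterBox);
}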
LIBYUV_API
int I420Scale_16(const uint16* src_y, int src_stride_y,
const uint16* src_u, int src_stride_u,
const uint16* src_v, int src_stride_v,
int src_width, int src_height,
uint16* dst_y, int dst_stride_y,
uint16* dst_u, int dst_stride_u,
uint16* dst_v, int dst_stride_v,
int dst_width, int dst_height,
int I420Scale_16(const uint16* src_y,
int src_stride_y,
const uint16* src_u,
int src_stride_u,
const uint16* src_v,
int src_stride_v,
int src_width,
int src_height,
uint16* dst_y,
int dst_stride_y,
uint16* dst_u,
int dst_stride_u,
uint16* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, uint8* dst_u, uint8* dst_v,
int dst_stride_y, int dst_stride_u, int dst_stride_v,
int dst_width, int dst_height,
int Scale(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
int src_stride_y,
int src_stride_u,
int src_stride_v,
int src_width,
int src_height,
uint8* dst_y,
uint8* dst_u,
uint8* dst_v,
int dst_stride_y,
int dst_stride_u,
int dst_stride_v,
int dst_width,
int dst_height,
LIBYUV_BOOL interpolate);
// Legacy API. Deprecated.
LIBYUV_API
int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
int ScaleOffset(const uint8* src_i420,
int src_width,
int src_height,
uint8* dst_i420,
int dst_width,
int dst_height,
int dst_yoffset,
LIBYUV_BOOL interpolate);
// For testing, allow disabling of specialized scalers.
@ -100,4 +139,4 @@ void SetUseReferenceImpl(LIBYUV_BOOL use);
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT
#endif // INCLUDE_LIBYUV_SCALE_H_

View File

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
#define INCLUDE_LIBYUV_SCALE_ARGB_H_
#include "libyuv/basic_types.h"
@ -20,32 +20,52 @@ extern "C" {
#endif
LIBYUV_API
int ARGBScale(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
int ARGBScale(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
enum FilterMode filtering);
// Clipped scale takes destination rectangle coordinates for clip values.
LIBYUV_API
int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
int ARGBScaleClip(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering);
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
int YUVToARGBScaleClip(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering);
#ifdef __cplusplus
@ -53,4 +73,4 @@ int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT
#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_

File diff suppressed because it is too large

View File

@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1598
#define LIBYUV_VERSION 1662
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -10,7 +10,7 @@
// Common definitions for video, including fourcc and VideoFormat.
#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_
#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
#include "libyuv/basic_types.h"
@ -28,13 +28,13 @@ extern "C" {
// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
// constants are used in a switch.
#ifdef __cplusplus
#define FOURCC(a, b, c, d) ( \
(static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
(static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
#define FOURCC(a, b, c, d) \
((static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
(static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
#else
#define FOURCC(a, b, c, d) ( \
((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
#define FOURCC(a, b, c, d) \
(((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
#endif
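As a worked example of the macro above, the first character lands in the low byte:
// FOURCC('I', '4', '2', '0') == 0x30323449
//   'I' = 0x49 (byte 0), '4' = 0x34 (byte 1), '2' = 0x32 (byte 2), '0' = 0x30 (byte 3)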
// Some pages discussing FourCC codes:
@ -49,18 +49,18 @@ extern "C" {
// Secondary formats are converted in 2 steps.
// Auxiliary formats call primary converters.
enum FourCC {
// 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
// 8 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
FOURCC_I411 = FOURCC('I', '4', '1', '1'),
FOURCC_I411 = FOURCC('I', '4', '1', '1'), // deprecated.
FOURCC_I400 = FOURCC('I', '4', '0', '0'),
FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
// 2 Secondary YUV formats: row biplanar.
// 1 Secondary YUV format: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
@ -69,7 +69,7 @@ enum FourCC {
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
@ -137,7 +137,7 @@ enum FourCCBpp {
FOURCC_BPP_ABGR = 32,
FOURCC_BPP_RGBA = 32,
FOURCC_BPP_24BG = 24,
FOURCC_BPP_RAW = 24,
FOURCC_BPP_RAW = 24,
FOURCC_BPP_RGBP = 16,
FOURCC_BPP_RGBO = 16,
FOURCC_BPP_R444 = 16,
@ -170,7 +170,7 @@ enum FourCCBpp {
FOURCC_BPP_CM24 = 24,
// Match any fourcc.
FOURCC_BPP_ANY = 0, // 0 means unknown.
FOURCC_BPP_ANY = 0, // 0 means unknown.
};
// Converts fourcc aliases into canonical ones.
@ -181,4 +181,4 @@ LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT
#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_

View File

@ -32,8 +32,7 @@ LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
HashDjb2_C;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
@ -50,13 +49,13 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
src += kBlockSize;
count -= kBlockSize;
}
remainder = (int)(count) & ~15;
remainder = (int)count & ~15;
if (remainder) {
seed = HashDjb2_SSE(src, remainder, seed);
src += remainder;
count -= remainder;
}
remainder = (int)(count) & 15;
remainder = (int)count & 15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
}
@ -111,9 +110,55 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
return fourcc;
}
LIBYUV_API
uint64 ComputeHammingDistance(const uint8* src_a,
const uint8* src_b,
int count) {
const int kBlockSize = 65536;
int remainder = count & (kBlockSize - 1) & ~31;
uint64 diff = 0;
int i;
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
HammingDistance_C;
#if defined(HAS_HAMMINGDISTANCE_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
HammingDistance = HammingDistance_NEON;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_X86)
if (TestCpuFlag(kCpuHasX86)) {
HammingDistance = HammingDistance_X86;
}
#endif
#if defined(HAS_HAMMINGDISTANCE_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
HammingDistance = HammingDistance_AVX2;
}
#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
#endif
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
}
src_a += count & ~(kBlockSize - 1);
src_b += count & ~(kBlockSize - 1);
if (remainder) {
diff += HammingDistance(src_a, src_b, remainder);
src_a += remainder;
src_b += remainder;
}
remainder = count & 31;
if (remainder) {
diff += HammingDistance_C(src_a, src_b, remainder);
}
return diff;
}
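For illustration (a sketch, not from this change), the function accepts buffers of any size, e.g. comparing two 8-byte perceptual frame hashes, where a result of 0 means identical.
// Sketch: treat a small bit difference between two 64-bit hashes as "similar";
// the 10-bit threshold is an arbitrary illustrative value.
static int HashesSimilar(const uint8* hash_a, const uint8* hash_b) {
  return ComputeHammingDistance(hash_a, hash_b, 8) <= 10;
}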
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
uint64 ComputeSumSquareError(const uint8* src_a,
const uint8* src_b,
int count) {
// SumSquareError returns values 0 to 65535 for each squared difference.
// Up to 65536 of those can be summed and remain within a uint32.
@ -142,7 +187,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
#pragma omp parallel for reduction(+ : sse)
#endif
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
@ -162,14 +207,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height) {
uint64 sse = 0;
int h;
// Coalesce rows.
if (stride_a == width &&
stride_b == width) {
if (stride_a == width && stride_b == width) {
width *= height;
height = 1;
stride_a = stride_b = 0;
@ -186,10 +233,10 @@ LIBYUV_API
double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
double psnr;
if (sse > 0) {
double mse = (double)(count) / (double)(sse);
double mse = (double)count / (double)sse;
psnr = 10.0 * log10(255.0 * 255.0 * mse);
} else {
psnr = kMaxPsnr; // Limit to prevent divide by 0
psnr = kMaxPsnr; // Limit to prevent divide by 0
}
if (psnr > kMaxPsnr)
@ -199,45 +246,53 @@ double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
}
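For reference, this is the usual PSNR definition with a peak value of 255; note the local variable named mse actually holds count/sse, the reciprocal of the mean squared error, so the value computed above is

  PSNR = 10 * log10(255^2 * count / sse) = 20 * log10(255) - 10 * log10(sse / count)

with kMaxPsnr used as the cap when sse is zero (identical inputs).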
LIBYUV_API
double CalcFramePsnr(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
double CalcFramePsnr(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height) {
const uint64 samples = width * height;
const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
src_b, stride_b,
width, height);
const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
stride_b, width, height);
return SumSquareErrorToPsnr(sse, samples);
}
LIBYUV_API
double I420Psnr(const uint8* src_y_a, int stride_y_a,
const uint8* src_u_a, int stride_u_a,
const uint8* src_v_a, int stride_v_a,
const uint8* src_y_b, int stride_y_b,
const uint8* src_u_b, int stride_u_b,
const uint8* src_v_b, int stride_v_b,
int width, int height) {
const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
src_y_b, stride_y_b,
width, height);
double I420Psnr(const uint8* src_y_a,
int stride_y_a,
const uint8* src_u_a,
int stride_u_a,
const uint8* src_v_a,
int stride_v_a,
const uint8* src_y_b,
int stride_y_b,
const uint8* src_u_b,
int stride_u_b,
const uint8* src_v_b,
int stride_v_b,
int width,
int height) {
const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b,
stride_y_b, width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
src_u_b, stride_u_b,
width_uv, height_uv);
const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
src_v_b, stride_v_b,
width_uv, height_uv);
const uint64 sse_u = ComputeSumSquareErrorPlane(
src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
const uint64 sse_v = ComputeSumSquareErrorPlane(
src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
const uint64 samples = width * height + 2 * (width_uv * height_uv);
const uint64 sse = sse_y + sse_u + sse_v;
return SumSquareErrorToPsnr(sse, samples);
}
static const int64 cc1 = 26634; // (64^2*(.01*255)^2
static const int64 cc1 = 26634; // (64^2*(.01*255)^2
static const int64 cc2 = 239708; // (64^2*(.03*255)^2
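These are the standard SSIM stabilizing constants C1 = (0.01*255)^2 and C2 = (0.03*255)^2, pre-scaled by 64^2 because Ssim8x8_C below works with sums over the 8x8 window rather than means; checking the arithmetic:

  64^2 * (0.01*255)^2 = 4096 * 6.5025  = 26634.24  -> 26634
  64^2 * (0.03*255)^2 = 4096 * 58.5225 = 239708.16 -> 239708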
static double Ssim8x8_C(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b) {
static double Ssim8x8_C(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b) {
int64 sum_a = 0;
int64 sum_b = 0;
int64 sum_sq_a = 0;
@ -270,12 +325,12 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
(2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
const int64 sum_a_sq = sum_a*sum_a;
const int64 sum_b_sq = sum_b*sum_b;
const int64 sum_a_sq = sum_a * sum_a;
const int64 sum_b_sq = sum_b * sum_b;
const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
(count * sum_sq_a - sum_a_sq +
count * sum_sq_b - sum_b_sq + c2);
const int64 ssim_d =
(sum_a_sq + sum_b_sq + c1) *
(count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
if (ssim_d == 0.0) {
return DBL_MAX;
@ -288,13 +343,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
// block boundaries to penalize blocking artifacts.
LIBYUV_API
double CalcFrameSsim(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
double CalcFrameSsim(const uint8* src_a,
int stride_a,
const uint8* src_b,
int stride_b,
int width,
int height) {
int samples = 0;
double ssim_total = 0;
double (*Ssim8x8)(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b) = Ssim8x8_C;
double (*Ssim8x8)(const uint8* src_a, int stride_a, const uint8* src_b,
int stride_b) = Ssim8x8_C;
// sample point start with each 4x4 location
int i;
@ -314,22 +372,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a,
}
LIBYUV_API
double I420Ssim(const uint8* src_y_a, int stride_y_a,
const uint8* src_u_a, int stride_u_a,
const uint8* src_v_a, int stride_v_a,
const uint8* src_y_b, int stride_y_b,
const uint8* src_u_b, int stride_u_b,
const uint8* src_v_b, int stride_v_b,
int width, int height) {
const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
src_y_b, stride_y_b, width, height);
double I420Ssim(const uint8* src_y_a,
int stride_y_a,
const uint8* src_u_a,
int stride_u_a,
const uint8* src_v_a,
int stride_v_a,
const uint8* src_y_b,
int stride_y_b,
const uint8* src_u_b,
int stride_u_b,
const uint8* src_v_b,
int stride_v_b,
int width,
int height) {
const double ssim_y =
CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
src_u_b, stride_u_b,
const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
width_uv, height_uv);
const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
src_v_b, stride_v_b,
const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
width_uv, height_uv);
return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
}

View File

@ -17,6 +17,50 @@ namespace libyuv {
extern "C" {
#endif
#if ORIGINAL_OPT
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff = 0u;
int i;
for (i = 0; i < count; ++i) {
int x = src_a[i] ^ src_b[i];
if (x & 1)
++diff;
if (x & 2)
++diff;
if (x & 4)
++diff;
if (x & 8)
++diff;
if (x & 16)
++diff;
if (x & 32)
++diff;
if (x & 64)
++diff;
if (x & 128)
++diff;
}
return diff;
}
#endif
// Hakmem method for hamming distance.
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff = 0u;
int i;
for (i = 0; i < count - 3; i += 4) {
uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b);
uint32 u = x - ((x >> 1) & 0x55555555);
u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
src_a += 4;
src_b += 4;
}
return diff;
}
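The loop above XORs the two inputs four bytes at a time, so differing bits show up as ones, and then counts those ones with the classic SWAR population-count trick; note it only consumes whole 4-byte groups. As a standalone reference (illustrative sketch, not from the diff, assuming libyuv's uint32 typedef), the same steps on a single 32-bit word look like:

uint32 PopCount32(uint32 x) {
  x = x - ((x >> 1) & 0x55555555);                 // 16 two-bit partial sums
  x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  // 8 four-bit partial sums
  x = (x + (x >> 4)) & 0x0f0f0f0f;                 // 4 per-byte sums
  return (x * 0x01010101) >> 24;                   // total collected in the top byte
}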
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse = 0u;
int i;

View File

@ -22,6 +22,19 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff = 0u;
int i;
for (i = 0; i < count - 7; i += 8) {
uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b);
src_a += 8;
src_b += 8;
diff += __builtin_popcountll(x);
}
return diff;
}
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
@ -62,30 +75,30 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
static uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
static uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
static uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
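These tables are simply powers of 33: the scalar djb2 step is hash = hash * 33 + byte, so folding 16 bytes per iteration computes hash' = hash * 33^16 + sum(src[i] * 33^(15 - i)) with everything taken modulo 2^32, which is what kHash16x33 and kHashMul0..kHashMul3 encode. A scalar sketch of the same hash, matching what the HashDjb2_C fallback computes (illustrative only, assuming libyuv's uint8/uint32 typedefs):

uint32 HashDjb2_Scalar(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];  // one djb2 step per byte
  }
  return hash;
}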
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
@ -148,4 +161,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
} // extern "C"
} // namespace libyuv
#endif

View File

@ -21,8 +21,42 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
// 256 bits at a time
// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
@ -30,9 +64,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q11, #0 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"

View File

@ -20,8 +20,38 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// 256 bits at a time
// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"movi v4.8h, #0 \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
@ -29,9 +59,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


View File

@ -37,13 +37,9 @@ static void JpegCopyI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I420Copy(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@ -55,13 +51,9 @@ static void JpegI422ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I422ToI420(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@ -73,31 +65,9 @@ static void JpegI444ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I444ToI420(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
dest->h -= rows;
}
static void JpegI411ToI420(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I411ToI420(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@ -109,11 +79,8 @@ static void JpegI400ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I400ToI420(data[0], strides[0],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@ -122,8 +89,7 @@ static void JpegI400ToI420(void* opaque,
// Query size of MJPG in pixels.
LIBYUV_API
int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height) {
int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) {
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret) {
@ -139,11 +105,16 @@ int MJPGSize(const uint8* sample, size_t sample_size,
LIBYUV_API
int MJPGToI420(const uint8* sample,
size_t sample_size,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int w, int h,
int dw, int dh) {
uint8* y,
int y_stride,
uint8* u,
int u_stride,
uint8* v,
int v_stride,
int w,
int h,
int dw,
int dh) {
if (sample_size == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
@ -152,17 +123,16 @@ int MJPGToI420(const uint8* sample,
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret && (mjpeg_decoder.GetWidth() != w ||
mjpeg_decoder.GetHeight() != h)) {
if (ret &&
(mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
I420Buffers bufs = {y, y_stride, u, u_stride, v, v_stride, dw, dh};
// YUV420
if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@ -171,7 +141,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
// YUV422
// YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@ -182,7 +152,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
// YUV444
// YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@ -193,18 +163,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
// YUV411
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
// YUV400
// YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
@ -213,7 +172,7 @@ int MJPGToI420(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// factors that occur in practice.
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
@ -231,15 +190,12 @@ struct ARGBBuffers {
};
static void JpegI420ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I420ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@ -249,11 +205,8 @@ static void JpegI422ToARGB(void* opaque,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I422ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@ -263,25 +216,8 @@ static void JpegI444ToARGB(void* opaque,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I444ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
static void JpegI411ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I411ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@ -291,9 +227,7 @@ static void JpegI400ToARGB(void* opaque,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I400ToARGB(data[0], strides[0],
dest->argb, dest->argb_stride,
dest->w, rows);
I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@ -303,9 +237,12 @@ static void JpegI400ToARGB(void* opaque,
LIBYUV_API
int MJPGToARGB(const uint8* sample,
size_t sample_size,
uint8* argb, int argb_stride,
int w, int h,
int dw, int dh) {
uint8* argb,
int argb_stride,
int w,
int h,
int dw,
int dh) {
if (sample_size == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
@ -314,17 +251,16 @@ int MJPGToARGB(const uint8* sample,
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret && (mjpeg_decoder.GetWidth() != w ||
mjpeg_decoder.GetHeight() != h)) {
if (ret &&
(mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
ARGBBuffers bufs = { argb, argb_stride, dw, dh };
ARGBBuffers bufs = {argb, argb_stride, dw, dh};
// YUV420
if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@ -333,7 +269,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
// YUV422
// YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@ -344,7 +280,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
// YUV444
// YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@ -355,18 +291,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
// YUV411
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
// YUV400
// YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
@ -375,7 +300,7 @@ int MJPGToARGB(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// factors that occur in practice.
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
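The sampling-factor checks in MJPGToI420 and MJPGToARGB above map libjpeg's per-component sampling factors onto the chroma layouts libyuv can convert; roughly (summary for orientation, with some of the Y-factor checks elided by the hunks shown):

  Y 2x2, U 1x1, V 1x1        -> 4:2:0  (JpegCopyI420 / JpegI420ToARGB)
  Y 2x1, U 1x1, V 1x1        -> 4:2:2  (JpegI422ToI420 / JpegI422ToARGB)
  Y 1x1, U 1x1, V 1x1        -> 4:4:4  (JpegI444ToI420 / JpegI444ToARGB)
  single grayscale component -> 4:0:0  (JpegI400ToI420 / JpegI400ToARGB)

The deleted branches correspond to the 4:1:1 (I411) path that this update removes along with the JpegI411To* callbacks.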

View File

@ -29,11 +29,16 @@ extern "C" {
// sample_size is measured in bytes and is the size of the frame.
// With MJPEG it is the compressed size of the frame.
LIBYUV_API
int ConvertToARGB(const uint8* sample, size_t sample_size,
uint8* crop_argb, int argb_stride,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
int ConvertToARGB(const uint8* sample,
size_t sample_size,
uint8* crop_argb,
int argb_stride,
int crop_x,
int crop_y,
int src_width,
int src_height,
int crop_width,
int crop_height,
enum RotationMode rotation,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
@ -45,20 +50,19 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
int r = 0;
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
// to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
// and then rotate the ARGB to the final destination buffer.
// For in-place conversion, if destination crop_argb is same as source sample,
// also enable temporary buffer.
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
crop_argb == sample;
LIBYUV_BOOL need_buf =
(rotation && format != FOURCC_ARGB) || crop_argb == sample;
uint8* dest_argb = crop_argb;
int dest_argb_stride = argb_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (crop_argb == NULL || sample == NULL ||
src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
if (crop_argb == NULL || sample == NULL || src_width <= 0 ||
crop_width <= 0 || src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
@ -67,7 +71,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
int argb_size = crop_width * 4 * abs_crop_height;
rotate_buffer = (uint8*)malloc(argb_size);
rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
@ -79,100 +83,85 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = YUY2ToARGB(src, aligned_src_width * 2,
crop_argb, argb_stride,
r = YUY2ToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
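Each packed-pixel case computes its crop origin the same way: sample + (row stride in pixels * crop_y + crop_x) * bytes per pixel. A worked example with made-up numbers: a 640-pixel-wide YUY2 frame (2 bytes per pixel) cropped at crop_x = 16, crop_y = 8 starts reading at

  sample + (640 * 8 + 16) * 2 = sample + 10272 bytes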
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = UYVYToARGB(src, aligned_src_width * 2,
crop_argb, argb_stride,
r = UYVYToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RGB24ToARGB(src, src_width * 3,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = RGB24ToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RAWToARGB(src, src_width * 3,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = RAWToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_ARGB:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ARGBToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
if (!need_buf && !rotation) {
src = sample + (src_width * crop_y + crop_x) * 4;
r = ARGBToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height);
}
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = BGRAToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = BGRAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ABGRToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = ABGRToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = RGBAToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = RGBAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = RGB565ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB1555ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = ARGB1555ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB4444ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = ARGB4444ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = I400ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
r = NV12ToARGB(src, src_width,
src_uv, aligned_src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
argb_stride, crop_width, inv_crop_height);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
// Call NV12 but with u and v parameters swapped.
r = NV21ToARGB(src, src_width,
src_uv, aligned_src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
argb_stride, crop_width, inv_crop_height);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
r = M420ToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = M420ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
@ -184,20 +173,17 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
src_v = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
r = I420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
@ -208,14 +194,11 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
@ -226,21 +209,18 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
r = I422ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
case FOURCC_I444:
@ -255,32 +235,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
r = I444ToARGB(src_y, src_width,
src_u, src_width,
src_v, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I411: {
int quarterwidth = (src_width + 3) / 4;
const uint8* src_y = sample + src_width * crop_y + crop_x;
const uint8* src_u = sample + src_width * abs_src_height +
quarterwidth * crop_y + crop_x / 4;
const uint8* src_v = sample + src_width * abs_src_height +
quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
r = I411ToARGB(src_y, src_width,
src_u, quarterwidth,
src_v, quarterwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToARGB(sample, sample_size,
crop_argb, argb_stride,
src_width, abs_src_height, crop_width, inv_crop_height);
r = MJPGToARGB(sample, sample_size, crop_argb, argb_stride, src_width,
abs_src_height, crop_width, inv_crop_height);
break;
#endif
default:
@ -289,11 +251,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
if (!r) {
r = ARGBRotate(crop_argb, argb_stride,
dest_argb, dest_argb_stride,
r = ARGBRotate(crop_argb, argb_stride, dest_argb, dest_argb_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);
} else if (rotation) {
src = sample + (src_width * crop_y + crop_x) * 4;
r = ARGBRotate(src, src_width * 4, crop_argb, argb_stride, crop_width,
inv_crop_height, rotation);
}
return r;

View File

@ -27,12 +27,18 @@ extern "C" {
LIBYUV_API
int ConvertToI420(const uint8* sample,
size_t sample_size,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
uint8* y,
int y_stride,
uint8* u,
int u_stride,
uint8* v,
int v_stride,
int crop_x,
int crop_y,
int src_width,
int src_height,
int crop_width,
int crop_height,
enum RotationMode rotation,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
@ -43,9 +49,10 @@ int ConvertToI420(const uint8* sample,
// TODO(nisse): Why allow crop_height < 0?
const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
format != FOURCC_NV12 && format != FOURCC_NV21 &&
format != FOURCC_YV12) || y == sample;
LIBYUV_BOOL need_buf =
(rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
format != FOURCC_NV21 && format != FOURCC_YV12) ||
y == sample;
uint8* tmp_y = y;
uint8* tmp_u = u;
uint8* tmp_v = v;
@ -56,8 +63,7 @@ int ConvertToI420(const uint8* sample,
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
if (!y || !u || !v || !sample ||
src_width <= 0 || crop_width <= 0 ||
if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
return -1;
}
@ -70,7 +76,7 @@ int ConvertToI420(const uint8* sample,
if (need_buf) {
int y_size = crop_width * abs_crop_height;
int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
@ -85,130 +91,85 @@ int ConvertToI420(const uint8* sample,
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = YUY2ToI420(src, aligned_src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = YUY2ToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = UYVYToI420(src, aligned_src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = UYVYToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = RGB565ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB1555ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = ARGB1555ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB4444ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = ARGB4444ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
v_stride, crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RGB24ToI420(src, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
r = RGB24ToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RAWToI420(src, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
r = RAWToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ARGB:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ARGBToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
r = ARGBToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = BGRAToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
r = BGRAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ABGRToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
r = ABGRToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = RGBAToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
r = RGBAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
r = I400ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height, rotation);
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
y_stride, u, u_stride, v, v_stride, crop_width,
inv_crop_height, rotation);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
// Call NV12 but with u and v parameters swapped.
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
v, v_stride,
u, u_stride,
crop_width, inv_crop_height, rotation);
r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
y_stride, v, v_stride, u, u_stride, crop_width,
inv_crop_height, rotation);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
r = M420ToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
r = M420ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
// Triplanar formats
@ -221,22 +182,18 @@ int ConvertToI420(const uint8* sample,
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
src_v = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
r = I420Rotate(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height, rotation);
r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
y_stride, u, u_stride, v, v_stride, crop_width,
inv_crop_height, rotation);
break;
}
case FOURCC_I422:
@ -246,23 +203,19 @@ int ConvertToI420(const uint8* sample,
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
r = I422ToI420(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
y_stride, u, u_stride, v, v_stride, crop_width,
inv_crop_height);
break;
}
case FOURCC_I444:
@ -277,37 +230,14 @@ int ConvertToI420(const uint8* sample,
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
r = I444ToI420(src_y, src_width,
src_u, src_width,
src_v, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I411: {
int quarterwidth = (src_width + 3) / 4;
const uint8* src_y = sample + src_width * crop_y + crop_x;
const uint8* src_u = sample + src_width * abs_src_height +
quarterwidth * crop_y + crop_x / 4;
const uint8* src_v = sample + src_width * abs_src_height +
quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
r = I411ToI420(src_y, src_width,
src_u, quarterwidth,
src_v, quarterwidth,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, y,
y_stride, u, u_stride, v, v_stride, crop_width,
inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToI420(sample, sample_size,
y, y_stride,
u, u_stride,
v, v_stride,
r = MJPGToI420(sample, sample_size, y, y_stride, u, u_stride, v, v_stride,
src_width, abs_src_height, crop_width, inv_crop_height);
break;
#endif
@ -317,13 +247,9 @@ int ConvertToI420(const uint8* sample,
if (need_buf) {
if (!r) {
r = I420Rotate(y, y_stride,
u, u_stride,
v, v_stride,
tmp_y, tmp_y_stride,
tmp_u, tmp_u_stride,
tmp_v, tmp_v_stride,
crop_width, abs_crop_height, rotation);
r = I420Rotate(y, y_stride, u, u_stride, v, v_stride, tmp_y, tmp_y_stride,
tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width,
abs_crop_height, rotation);
}
free(rotate_buffer);
}

View File

@ -13,7 +13,7 @@
#if defined(_MSC_VER)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
@ -43,16 +43,20 @@ extern "C" {
#define SAFEBUFFERS
#endif
// cpu_info_ variable for SIMD instruction sets detected.
LIBYUV_API int cpu_info_ = 0;
// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
void CpuId(int info_eax, int info_ecx, int* cpu_info) {
#if defined(_MSC_VER)
// Visual C version uses intrinsic or inline x86 assembly.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
__cpuidex(cpu_info, info_eax, info_ecx);
#elif defined(_M_IX86)
__asm {
mov eax, info_eax
@ -66,26 +70,26 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
}
#else // Visual C but not x86
if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax);
__cpuid(cpu_info, info_eax);
} else {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
}
#endif
// GCC version uses inline x86 assembly.
#else // defined(_MSC_VER)
uint32 info_ebx, info_edx;
asm volatile (
#if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=D" (info_ebx),
int info_ebx, info_edx;
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
: "=b" (info_ebx),
"cpuid \n"
: "=b"(info_ebx),
#endif // defined( __i386__) && defined(__PIC__)
"+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
"+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
cpu_info[0] = info_eax;
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
@ -94,7 +98,9 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
}
#else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
void CpuId(int eax, int ecx, int* cpu_info) {
(void)eax;
(void)ecx;
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif
@ -111,20 +117,22 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int GetXCR0() {
uint32 xcr0 = 0u;
int xcr0 = 0;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
xcr0 = _xgetbv(0); // VS2010 SP1 required.
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return xcr0;
}
#else
// xgetbv unavailable to query for OSSave support. Return 0.
#define GetXCR0() 0
#endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
#if defined(_M_IX86) && (_MSC_VER < 1900)
@ -133,8 +141,7 @@ int GetXCR0() {
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS
int ArmCpuCaps(const char* cpuinfo_name) {
LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
@ -151,7 +158,7 @@ int ArmCpuCaps(const char* cpuinfo_name) {
}
// aarch64 uses asimd for Neon.
p = strstr(cpuinfo_line, " asimd");
if (p && (p[6] == ' ' || p[6] == '\n')) {
if (p) {
fclose(f);
return kCpuHasNEON;
}
@ -161,13 +168,38 @@ int ArmCpuCaps(const char* cpuinfo_name) {
return 0;
}
// CPU detect function for SIMD instruction sets.
LIBYUV_API
int cpu_info_ = 0; // cpu_info is not initialized yet.
// TODO(fbarchard): Consider read_msa_ir().
// TODO(fbarchard): Add unittest.
LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
const char ase[]) {
char cpuinfo_line[512];
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
// ase enabled if /proc/cpuinfo is unavailable.
if (strcmp(ase, " msa") == 0) {
return kCpuHasMSA;
}
return kCpuHasDSPR2;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
char* p = strstr(cpuinfo_line, ase);
if (p) {
fclose(f);
if (strcmp(ase, " msa") == 0) {
return kCpuHasMSA;
}
return kCpuHasDSPR2;
}
}
}
fclose(f);
return 0;
}
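Both the Arm and the new Mips detection work by scanning /proc/cpuinfo text. For illustration only (example lines; exact formatting varies by kernel), ArmCpuCaps looks for " neon" (or " asimd" on aarch64) in a line such as

  Features        : half thumb fastmult vfp edsp neon vfpv3

while MipsCpuCaps matches the requested ASE (" msa", or the DSPr2 variant) inside

  ASEs implemented : mips16 dsp msa

and, when the file cannot be opened at all, assumes the requested feature is available.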
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
#if !defined(__native_client__) && !defined(_M_ARM) && !defined(_MSC_VER)
#if !defined(__native_client__) && !defined(_M_ARM)
static LIBYUV_BOOL TestEnv(const char* name) {
const char* var = getenv(name);
@ -184,39 +216,35 @@ static LIBYUV_BOOL TestEnv(const char*) {
}
#endif
LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
// TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
static SAFEBUFFERS int GetCpuFlags(void) {
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
int cpu_info0[4] = {0, 0, 0, 0};
int cpu_info1[4] = {0, 0, 0, 0};
int cpu_info7[4] = {0, 0, 0, 0};
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
}
cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
#ifdef HAS_XGETBV
// AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
// AVX requires OS saves YMM registers.
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
}
}
#endif
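For reference, the xgetbv masks used above check which register state the operating system actually saves and restores (summary of the XCR0 bit layout, added here for clarity):

  XCR0 bit 1 (0x02)    -> XMM (SSE) state
  XCR0 bit 2 (0x04)    -> YMM (AVX) state, so (xcr0 & 6) == 6 gates AVX/AVX2/FMA3/F16C
  XCR0 bits 5-7 (0xe0) -> opmask + ZMM (AVX-512) state, so (xcr0 & 0xe0) == 0xe0 gates kCpuHasAVX3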
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
@ -249,15 +277,25 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
cpu_info &= ~kCpuHasAVX3;
}
if (TestEnv("LIBYUV_DISABLE_F16C")) {
cpu_info &= ~kCpuHasF16C;
}
#endif
#if defined(__mips__) && defined(__linux__)
#if defined(__mips_dspr2)
cpu_info |= kCpuHasDSPR2;
#endif
#if defined(__mips_msa)
cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
#endif
cpu_info |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_DSPR2")) {
cpu_info &= ~kCpuHasDSPR2;
}
if (getenv("LIBYUV_DISABLE_MSA")) {
cpu_info &= ~kCpuHasMSA;
}
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
@ -283,15 +321,25 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
}
cpu_info |= kCpuInitialized;
cpu_info_ = cpu_info;
cpu_info |= kCpuInitialized;
return cpu_info;
}
// Note that use of this function is not thread safe.
LIBYUV_API
void MaskCpuFlags(int enable_flags) {
cpu_info_ = InitCpuFlags() & enable_flags;
int MaskCpuFlags(int enable_flags) {
int cpu_info = GetCpuFlags() & enable_flags;
#ifdef __ATOMIC_RELAXED
__atomic_store_n(&cpu_info_, cpu_info, __ATOMIC_RELAXED);
#else
cpu_info_ = cpu_info;
#endif
return cpu_info;
}
LIBYUV_API
int InitCpuFlags(void) {
return MaskCpuFlags(-1);
}
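A minimal usage sketch of the reworked flag API (hypothetical caller code; assumes the usual declarations from libyuv/cpu_id.h): InitCpuFlags() is now just MaskCpuFlags(-1), i.e. full detection, while passing a mask re-runs detection with selected features forced off:

  #include "libyuv/cpu_id.h"

  int flags = libyuv::InitCpuFlags();          // full detection, same as MaskCpuFlags(-1)
  libyuv::MaskCpuFlags(~libyuv::kCpuHasAVX2);  // re-detect, but keep AVX2 code paths disabled

The LIBYUV_DISABLE_* environment variables tested above give the same kind of control without recompiling or changing the caller.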
#ifdef __cplusplus

View File

@ -13,6 +13,10 @@
#ifdef HAVE_JPEG
#include <assert.h>
#ifdef __cplusplus
#include <new>
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// Must be included before jpeglib.
@ -21,7 +25,7 @@
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#pragma warning(disable : 4324)
#endif
#endif
@ -62,6 +66,7 @@ void init_source(jpeg_decompress_struct* cinfo);
void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
void term_source(jpeg_decompress_struct* cinfo);
void ErrorHandler(jpeg_common_struct* cinfo);
void OutputHandler(jpeg_common_struct* cinfo);
MJpegDecoder::MJpegDecoder()
: has_scanline_padding_(LIBYUV_FALSE),
@ -77,6 +82,7 @@ MJpegDecoder::MJpegDecoder()
decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
// Override standard exit()-based error handler.
error_mgr_->base.error_exit = &ErrorHandler;
error_mgr_->base.output_message = &OutputHandler;
#endif
decompress_struct_->client_data = NULL;
source_mgr_->init_source = &init_source;
@ -127,7 +133,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
if (scanlines_[i]) {
delete scanlines_[i];
}
scanlines_[i] = new uint8* [scanlines_size];
scanlines_[i] = new uint8*[scanlines_size];
scanlines_sizes_[i] = scanlines_size;
}
@ -193,13 +199,11 @@ int MJpegDecoder::GetVertSampFactor(int component) {
}
int MJpegDecoder::GetHorizSubSampFactor(int component) {
return decompress_struct_->max_h_samp_factor /
GetHorizSampFactor(component);
return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
}
int MJpegDecoder::GetVertSubSampFactor(int component) {
return decompress_struct_->max_v_samp_factor /
GetVertSampFactor(component);
return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
}
int MJpegDecoder::GetImageScanlinesPerImcuRow() {
@ -243,10 +247,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
}
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
uint8** planes, int dst_width, int dst_height) {
if (dst_width != GetWidth() ||
dst_height > GetHeight()) {
LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8** planes,
int dst_width,
int dst_height) {
if (dst_width != GetWidth() || dst_height > GetHeight()) {
// ERROR: Bad dimensions
return LIBYUV_FALSE;
}
@ -287,14 +291,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
for (int i = 0; i < num_outbufs_; ++i) {
// TODO(fbarchard): Compute skip to avoid this
assert(skip % GetVertSubSampFactor(i) == 0);
int rows_to_skip =
DivideAndRoundDown(skip, GetVertSubSampFactor(i));
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
rows_to_skip;
int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
int scanlines_to_copy =
GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
int data_to_skip = rows_to_skip * GetComponentStride(i);
CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
GetComponentWidth(i), GetComponentWidth(i),
scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@ -303,16 +306,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
// Read full MCUs but cropped horizontally
for (; lines_left > GetImageScanlinesPerImcuRow();
lines_left -= GetImageScanlinesPerImcuRow()) {
lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
CopyPlane(databuf_[i], GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
@ -326,19 +328,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy =
DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
CopyPlane(databuf_[i], GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
return FinishDecode();
}
LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
int dst_width, int dst_height) {
if (dst_width != GetWidth() ||
dst_height > GetHeight()) {
LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
void* opaque,
int dst_width,
int dst_height) {
if (dst_width != GetWidth() || dst_height > GetHeight()) {
// ERROR: Bad dimensions
return LIBYUV_FALSE;
}
@ -393,7 +395,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
}
// Read full MCUs until we get to the crop point.
for (; lines_left >= GetImageScanlinesPerImcuRow();
lines_left -= GetImageScanlinesPerImcuRow()) {
lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
@ -433,22 +435,22 @@ void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
}
void term_source(j_decompress_ptr cinfo) {
// Nothing to do.
(void)cinfo; // Nothing to do.
}
#ifdef HAVE_SETJMP
void ErrorHandler(j_common_ptr cinfo) {
// This is called when a jpeglib command experiences an error. Unfortunately
// jpeglib's error handling model is not very flexible, because it expects the
// error handler to not return--i.e., it wants the program to terminate. To
// recover from errors we use setjmp() as shown in their example. setjmp() is
// C's implementation for the "call with current continuation" functionality
// seen in some functional programming languages.
// A formatted message can be output, but is unsafe for release.
// This is called when a jpeglib command experiences an error. Unfortunately
// jpeglib's error handling model is not very flexible, because it expects the
// error handler to not return--i.e., it wants the program to terminate. To
// recover from errors we use setjmp() as shown in their example. setjmp() is
// C's implementation for the "call with current continuation" functionality
// seen in some functional programming languages.
// A formatted message can be output, but is unsafe for release.
#ifdef DEBUG
char buf[JMSG_LENGTH_MAX];
(*cinfo->err->format_message)(cinfo, buf);
// ERROR: Error in jpeglib: buf
// ERROR: Error in jpeglib: buf
#endif
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
@ -456,7 +458,13 @@ void ErrorHandler(j_common_ptr cinfo) {
// and causes it to return (for a second time) with value 1.
longjmp(mgr->setjmp_buffer, 1);
}
#endif
// Suppress fprintf warnings.
void OutputHandler(j_common_ptr cinfo) {
(void)cinfo;
}
#endif // HAVE_SETJMP
void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
if (num_outbufs != num_outbufs_) {
@ -465,9 +473,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
// it.
DestroyOutputBuffers();
scanlines_ = new uint8** [num_outbufs];
scanlines_ = new uint8**[num_outbufs];
scanlines_sizes_ = new int[num_outbufs];
databuf_ = new uint8* [num_outbufs];
databuf_ = new uint8*[num_outbufs];
databuf_strides_ = new int[num_outbufs];
for (int i = 0; i < num_outbufs; ++i) {
@ -483,13 +491,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
void MJpegDecoder::DestroyOutputBuffers() {
for (int i = 0; i < num_outbufs_; ++i) {
delete [] scanlines_[i];
delete [] databuf_[i];
delete[] scanlines_[i];
delete[] databuf_[i];
}
delete [] scanlines_;
delete [] databuf_;
delete [] scanlines_sizes_;
delete [] databuf_strides_;
delete[] scanlines_;
delete[] databuf_;
delete[] scanlines_sizes_;
delete[] databuf_strides_;
scanlines_ = NULL;
databuf_ = NULL;
scanlines_sizes_ = NULL;
@ -535,26 +543,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) {
inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
jpeg_read_raw_data(decompress_struct_,
scanlines_,
GetImageScanlinesPerImcuRow());
jpeg_read_raw_data(decompress_struct_, scanlines_,
GetImageScanlinesPerImcuRow());
}
// The helper function which recognizes the jpeg sub-sampling type.
JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
int* subsample_x, int* subsample_y, int number_of_components) {
int* subsample_x,
int* subsample_y,
int number_of_components) {
if (number_of_components == 3) { // Color images.
if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 2 && subsample_y[1] == 2 &&
subsample_x[2] == 2 && subsample_y[2] == 2) {
if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
return kJpegYuv420;
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 2 && subsample_y[1] == 1 &&
subsample_x[2] == 2 && subsample_y[2] == 1) {
subsample_x[1] == 2 && subsample_y[1] == 1 &&
subsample_x[2] == 2 && subsample_y[2] == 1) {
return kJpegYuv422;
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 1 && subsample_y[1] == 1 &&
subsample_x[2] == 1 && subsample_y[2] == 1) {
subsample_x[1] == 1 && subsample_y[1] == 1 &&
subsample_x[2] == 1 && subsample_y[2] == 1) {
return kJpegYuv444;
}
} else if (number_of_components == 1) { // Grey-scale images.
@ -567,4 +575,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
} // namespace libyuv
#endif // HAVE_JPEG
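
DecodeToBuffers() above fills one caller-provided buffer per JPEG component and lets the caller crop the output height. A minimal, illustrative wrapper for a 4:2:0 stream is sketched below; it assumes the frame has already been loaded into the decoder (LoadFrame() elsewhere in this class), and the helper name and buffer layout are hypothetical.

// Sketch only: decode a loaded MJPEG frame into caller-owned I420 planes.
// Assumes a 4:2:0 stream, so U and V are half width / half height, and that
// each destination buffer is large enough for its component.
#include "libyuv/mjpeg_decoder.h"

bool DecodeLoadedFrameToI420(libyuv::MJpegDecoder* decoder,
                             uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  uint8* planes[3] = {dst_y, dst_u, dst_v};  // one pointer per component
  // dst_width must equal GetWidth(); dst_height may be smaller to crop rows.
  return decoder->DecodeToBuffers(planes, decoder->GetWidth(),
                                  decoder->GetHeight()) != 0;
}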

View File

@ -24,7 +24,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
const uint8* it = sample;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
if (it == NULL) {
break;
}
@ -68,4 +68,3 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
} // extern "C"
} // namespace libyuv
#endif

File diff suppressed because it is too large

View File

@ -10,8 +10,8 @@
#include "libyuv/rotate.h"
#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
@ -22,12 +22,20 @@ extern "C" {
#endif
LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void TransposePlane(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
int i = height;
void (*TransposeWx8)(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSEWX16_MSA)
void (*TransposeWx16)(const uint8* src, int src_stride, uint8* dst,
int dst_stride, int width) = TransposeWx16_C;
#else
void (*TransposeWx8)(const uint8* src, int src_stride, uint8* dst,
int dst_stride, int width) = TransposeWx8_C;
#endif
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
@ -51,22 +59,40 @@ void TransposePlane(const uint8* src, int src_stride,
#endif
#if defined(HAS_TRANSPOSEWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) &&
IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_Fast_DSPR2;
} else {
TransposeWx8 = TransposeWx8_DSPR2;
}
}
#endif
#if defined(HAS_TRANSPOSEWX16_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
TransposeWx16 = TransposeWx16_Any_MSA;
if (IS_ALIGNED(width, 16)) {
TransposeWx16 = TransposeWx16_MSA;
}
}
#endif
#if defined(HAS_TRANSPOSEWX16_MSA)
// Work across the source in 16x16 tiles
while (i >= 16) {
TransposeWx16(src, src_stride, dst, dst_stride, width);
src += 16 * src_stride; // Go down 16 rows.
dst += 16; // Move over 16 columns.
i -= 16;
}
#else
// Work across the source in 8x8 tiles
while (i >= 8) {
TransposeWx8(src, src_stride, dst, dst_stride, width);
src += 8 * src_stride; // Go down 8 rows.
dst += 8; // Move over 8 columns.
src += 8 * src_stride; // Go down 8 rows.
dst += 8; // Move over 8 columns.
i -= 8;
}
#endif
if (i > 0) {
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
@ -74,9 +100,12 @@ void TransposePlane(const uint8* src, int src_stride,
}
LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void RotatePlane90(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Rotate by 90 is a transpose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@ -86,9 +115,12 @@ void RotatePlane90(const uint8* src, int src_stride,
}
LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void RotatePlane270(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Rotate by 270 is a transpose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@ -98,9 +130,12 @@ void RotatePlane270(const uint8* src, int src_stride,
}
LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void RotatePlane180(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width);
const uint8* src_bot = src + src_stride * (height - 1);
@ -135,12 +170,20 @@ void RotatePlane180(const uint8* src, int src_stride,
#endif
// TODO(fbarchard): Make Mirror on MIPS handle unaligned memory.
#if defined(HAS_MIRRORROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) &&
IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_DSPR2;
}
#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
if (IS_ALIGNED(width, 64)) {
MirrorRow = MirrorRow_MSA;
}
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@ -181,15 +224,24 @@ void RotatePlane180(const uint8* src, int src_stride,
}
LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void TransposeUV(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int i = height;
void (*TransposeUVWx8)(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
#if defined(HAS_TRANSPOSEUVWX16_MSA)
void (*TransposeUVWx16)(const uint8* src, int src_stride, uint8* dst_a,
int dst_stride_a, uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx16_C;
#else
void (*TransposeUVWx8)(const uint8* src, int src_stride, uint8* dst_a,
int dst_stride_a, uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#endif
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
@ -204,68 +256,92 @@ void TransposeUV(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) &&
IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_DSPR2;
}
#endif
#if defined(HAS_TRANSPOSEUVWX16_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
TransposeUVWx16 = TransposeUVWx16_Any_MSA;
if (IS_ALIGNED(width, 8)) {
TransposeUVWx16 = TransposeUVWx16_MSA;
}
}
#endif
#if defined(HAS_TRANSPOSEUVWX16_MSA)
// Work through the source in 16x16 tiles.
while (i >= 16) {
TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width);
src += 16 * src_stride; // Go down 16 rows.
dst_a += 16;  // Move over 16 columns.
dst_b += 16;  // Move over 16 columns.
i -= 16;
}
#else
// Work through the source in 8x8 tiles.
while (i >= 8) {
TransposeUVWx8(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width);
src += 8 * src_stride; // Go down 8 rows.
dst_a += 8; // Move over 8 columns.
dst_b += 8; // Move over 8 columns.
src += 8 * src_stride; // Go down 8 rows.
dst_a += 8; // Move over 8 columns.
dst_b += 8; // Move over 8 columns.
i -= 8;
}
#endif
if (i > 0) {
TransposeUVWxH_C(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width, i);
}
}
LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void RotateUV90(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
src += src_stride * (height - 1);
src_stride = -src_stride;
TransposeUV(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, height);
TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
height);
}
LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void RotateUV270(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
dst_a += dst_stride_a * (width - 1);
dst_b += dst_stride_b * (width - 1);
dst_stride_a = -dst_stride_a;
dst_stride_b = -dst_stride_b;
TransposeUV(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, height);
TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
height);
}
// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void RotateUV180(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int i;
void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
MirrorUVRow_C;
@ -280,8 +356,8 @@ void RotateUV180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_MIRRORUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
IS_ALIGNED(src_stride, 4)) {
MirrorUVRow = MirrorUVRow_DSPR2;
}
#endif
@ -298,9 +374,12 @@ void RotateUV180(const uint8* src, int src_stride,
}
LIBYUV_API
int RotatePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height,
int RotatePlane(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height,
enum RotationMode mode) {
if (!src || width <= 0 || height == 0 || !dst) {
return -1;
@ -316,24 +395,16 @@ int RotatePlane(const uint8* src, int src_stride,
switch (mode) {
case kRotate0:
// copy frame
CopyPlane(src, src_stride,
dst, dst_stride,
width, height);
CopyPlane(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate90:
RotatePlane90(src, src_stride,
dst, dst_stride,
width, height);
RotatePlane90(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate270:
RotatePlane270(src, src_stride,
dst, dst_stride,
width, height);
RotatePlane270(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate180:
RotatePlane180(src, src_stride,
dst, dst_stride,
width, height);
RotatePlane180(src, src_stride, dst, dst_stride, width, height);
return 0;
default:
break;
@ -342,18 +413,25 @@ int RotatePlane(const uint8* src, int src_stride,
}
LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
int I420Rotate(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
!dst_y || !dst_u || !dst_v) {
if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
!dst_u || !dst_v) {
return -1;
}
@ -372,45 +450,29 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
switch (mode) {
case kRotate0:
// copy frame
return I420Copy(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height);
return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
dst_v, dst_stride_v, width, height);
case kRotate90:
RotatePlane90(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane90(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane90(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
halfheight);
RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
halfheight);
return 0;
case kRotate270:
RotatePlane270(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane270(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane270(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
halfheight);
RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
halfheight);
return 0;
case kRotate180:
RotatePlane180(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane180(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane180(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
halfheight);
RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
halfheight);
return 0;
default:
break;
@ -419,17 +481,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
int NV12ToI420Rotate(const uint8* src_y,
int src_stride_y,
const uint8* src_uv,
int src_stride_uv,
uint8* dst_y,
int dst_stride_y,
uint8* dst_u,
int dst_stride_u,
uint8* dst_v,
int dst_stride_v,
int width,
int height,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_uv || width <= 0 || height == 0 ||
!dst_y || !dst_u || !dst_v) {
if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
!dst_v) {
return -1;
}
@ -446,38 +514,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
switch (mode) {
case kRotate0:
// copy frame
return NV12ToI420(src_y, src_stride_y,
src_uv, src_stride_uv,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
width, height);
case kRotate90:
RotatePlane90(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV90(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
dst_stride_v, halfwidth, halfheight);
return 0;
case kRotate270:
RotatePlane270(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV270(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
dst_stride_v, halfwidth, halfheight);
return 0;
case kRotate180:
RotatePlane180(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV180(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
dst_stride_v, halfwidth, halfheight);
return 0;
default:
break;
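
For the kRotate90/kRotate270 paths above, every destination plane is written with swapped dimensions, so the caller must allocate and stride the output accordingly. A minimal, illustrative call for a tightly-packed I420 frame (wrapper name is hypothetical):

// Sketch only: rotate a tightly-packed I420 frame by 90 degrees.
// The rotated Y plane is height pixels wide, so its stride is the source
// height; the chroma strides use the half-height value for the same reason.
#include "libyuv/rotate.h"

int RotateI420By90(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                   int width, int height,
                   uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  return libyuv::I420Rotate(src_y, width, src_u, halfwidth, src_v, halfwidth,
                            dst_y, height, dst_u, halfheight,
                            dst_v, halfheight,
                            width, height, libyuv::kRotate90);
}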

View File

@ -18,16 +18,16 @@ namespace libyuv {
extern "C" {
#endif
#define TANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
}
#define TANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, uint8* dst, int dst_stride, \
int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
}
#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
@ -41,22 +41,22 @@ TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#ifdef HAS_TRANSPOSEWX8_DSPR2
TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
#endif
#ifdef HAS_TRANSPOSEWX16_MSA
TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
#endif
#undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst_a, int dst_stride_a, \
uint8* dst_b, int dst_stride_b, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \
n); \
} \
TransposeUVWx8_C(src + n * 2, src_stride, \
dst_a + n * dst_stride_a, dst_stride_a, \
dst_b + n * dst_stride_b, dst_stride_b, r); \
}
void NAMEANY(const uint8* src, int src_stride, uint8* dst_a, \
int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
} \
TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \
dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
}
#ifdef HAS_TRANSPOSEUVWX8_NEON
TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
@ -67,14 +67,12 @@ TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#ifdef HAS_TRANSPOSEUVWX8_DSPR2
TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX16_MSA
TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
#endif
#undef TUVANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
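
The TANY/TUVANY macros above generate the *_Any_* wrappers: the SIMD kernel handles the largest width that is a multiple of the vector size, and the C transpose finishes the remaining columns. An illustrative expansion of one instantiation is below (only meaningful on builds where the NEON kernel is compiled in):

// Sketch of what TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) expands to.
#include "libyuv/rotate_row.h"

void TransposeWx8_Any_Sketch(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  int r = width & 7;  // MASK = 7: leftover columns for the C fallback
  int n = width - r;  // largest multiple of 8 handled by the SIMD kernel
  if (n > 0) {
    TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);
  }
  // Source column n becomes destination row n, hence the n * dst_stride offset.
  TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
}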

View File

@ -10,8 +10,8 @@
#include "libyuv/rotate.h"
#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"
@ -22,29 +22,44 @@ extern "C" {
// ARGBScale has a function to copy pixels to a row, striding each source
// pixel by a constant.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) && !defined(__native_client__)) || \
defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr,
int src_stride,
int src_stepx,
uint8* dst_ptr,
int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr,
int src_stride,
int src_stepx,
uint8* dst_ptr,
int dst_width);
#endif
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
int src_stepx, uint8* dst_ptr, int dst_width);
void ScaleARGBRowDownEven_C(const uint8* src_ptr,
int,
int src_stepx,
uint8* dst_ptr,
int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
static void ARGBTranspose(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
int i;
int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
int src_step, uint8* dst_ptr, int dst_width) =
ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
@ -63,8 +78,12 @@ static void ARGBTranspose(const uint8* src, int src_stride,
}
}
void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
void ARGBRotate90(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@ -73,8 +92,12 @@ void ARGBRotate90(const uint8* src, int src_stride,
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
void ARGBRotate270(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@ -83,8 +106,12 @@ void ARGBRotate270(const uint8* src, int src_stride,
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
void ARGBRotate180(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1);
@ -118,6 +145,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBMirrorRow = ARGBMirrorRow_MSA;
}
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@ -146,9 +181,9 @@ void ARGBRotate180(const uint8* src, int src_stride,
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
src += src_stride;
dst += dst_stride;
src_bot -= src_stride;
@ -158,8 +193,12 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, int width, int height,
int ARGBRotate(const uint8* src_argb,
int src_stride_argb,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height,
enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1;
@ -175,23 +214,19 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb,
switch (mode) {
case kRotate0:
// copy frame
return ARGBCopy(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
ARGBRotate90(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
case kRotate270:
ARGBRotate270(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
case kRotate180:
ARGBRotate180(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
default:
break;

View File

@ -8,16 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
void TransposeWx8_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride,
}
}
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
void TransposeUVWx8_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride,
}
}
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
void TransposeWxH_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width,
int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride,
}
}
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
void TransposeUVWxH_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;

View File

@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#include "libyuv/basic_types.h"
@ -18,18 +18,20 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
void TransposeWx8_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
@ -38,8 +40,8 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
// dst + dst_stride word aligned
"1: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
@ -65,8 +67,8 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"bnez %[width], 1b \n"
" addu %[dst], %[dst], %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
// dst + dst_stride unaligned
"11: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
@ -92,23 +94,20 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"swr $s1, 4(%[dst]) \n"
"swl $s1, 7(%[dst]) \n"
"bnez %[width], 11b \n"
"addu %[dst], %[dst], %[dst_stride] \n"
"2: \n"
"addu %[dst], %[dst], %[dst_stride] \n"
"2: \n"
".set pop \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1"
);
: [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
: [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
}
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
void TransposeWx8_Fast_DSPR2(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
__asm__ __volatile__(
".set noat \n"
".set push \n"
".set noreorder \n"
@ -126,67 +125,67 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
// dst + dst_stride word aligned
"1: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
@ -207,67 +206,67 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"bnez $AT, 1b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
// dst + dst_stride unaligned
"11: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
@ -298,34 +297,33 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"2: \n"
".set pop \n"
".set at \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
);
: [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
: [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
"s2", "s3", "s4", "s5", "s6", "s7");
}
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
void TransposeUVWx8_DSPR2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
__asm__ __volatile__ (
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"subu $t7, $t9, %[src_stride] \n"
"srl $t1, %[width], 1 \n"
// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
"andi $t0, %[dst_a], 0x3 \n"
"andi $t8, %[dst_b], 0x3 \n"
"or $t0, $t0, $t8 \n"
@ -335,52 +333,52 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t8 \n"
"bnez $t0, 11f \n"
" nop \n"
// dst + dst_stride word aligned (both, a & b dst addresses)
"1: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
// dst + dst_stride word aligned (both, a & b dst addresses)
"1: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"sw $s3, 0($s5) \n"
"sw $s4, 0($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"sw $s3, 0(%[dst_a]) \n"
"sw $s4, 0(%[dst_b]) \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"sw $s3, 4($s5) \n"
"sw $s4, 4($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
@ -394,59 +392,59 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"b 2f \n"
" nop \n"
// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
"11: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
"11: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"swr $s3, 0($s5) \n"
"swl $s3, 3($s5) \n"
"swr $s4, 0($s6) \n"
"swl $s4, 3($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"swr $s3, 0(%[dst_a]) \n"
"swl $s3, 3(%[dst_a]) \n"
"swr $s4, 0(%[dst_b]) \n"
"swl $s4, 3(%[dst_b]) \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"swr $s3, 4($s5) \n"
"swl $s3, 7($s5) \n"
"swr $s4, 4($s6) \n"
"swl $s4, 7($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
@ -462,18 +460,11 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"2: \n"
".set pop \n"
: [src] "+r" (src),
[dst_a] "+r" (dst_a),
[dst_b] "+r" (dst_b),
[width] "+r" (width),
[src_stride] "+r" (src_stride)
: [dst_stride_a] "r" (dst_stride_a),
[dst_stride_b] "r" (dst_stride_b)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
: [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
[width] "+r"(width), [src_stride] "+r"(src_stride)
: [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
"s2", "s3", "s4", "s5", "s6");
}
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

View File

@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@ -22,342 +22,348 @@ extern "C" {
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
void TransposeWx8_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
void TransposeWx8_Fast_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
"xmm15");
}
#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
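
For reference, the 64-bit fast path above computes the same W x 8 byte transpose as the generic C kernel: each source column becomes one 8-byte destination row. A minimal scalar sketch of that mapping (illustrative only, not part of the patched sources):

// Scalar sketch of the W x 8 transpose the SSSE3 kernels implement.
static void TransposeWx8_Sketch(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
  for (int i = 0; i < width; ++i) {    // source column / destination row
    for (int j = 0; j < 8; ++j) {      // the 8 source rows
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}
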
// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
void TransposeUVWx8_SSE2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7", "xmm8", "xmm9");
}
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif // defined(__x86_64__) || defined(__i386__)
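
The UV variant de-interleaves as it transposes: each source row holds packed U/V byte pairs, and column i of the U bytes lands in dst_a while the matching V bytes land in dst_b. A scalar sketch of that mapping (illustrative only):

// Scalar sketch of TransposeUVWx8: split interleaved UV columns into
// separate U (dst_a) and V (dst_b) rows while transposing.
static void TransposeUVWx8_Sketch(const uint8* src, int src_stride,
                                  uint8* dst_a, int dst_stride_a,
                                  uint8* dst_b, int dst_stride_b, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];  // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];  // V
    }
  }
}
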

250
third_party/yuv/source/rotate_msa.cc vendored Normal file
View File

@ -0,0 +1,250 @@
/*
* Copyright 2016 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/rotate_row.h"
// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \
out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \
out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \
out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \
}
#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \
out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \
out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \
out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \
}
#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \
out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \
out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \
out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \
}
#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \
out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \
out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \
out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
}
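
These helpers wrap the MSA interleave intrinsics at byte, halfword, word and doubleword granularity; repeated interleaving at growing element sizes is what turns row-major input vectors into a transposed block. A rough scalar model of the byte-level pair, assuming element 0 is the lowest-addressed byte and that the "right" form takes the low halves of both inputs (the exact operand ordering here is an assumption, not taken from the MSA documentation):

// Rough scalar model of ILVRL_B for one input pair (assumptions noted above).
static void ilvrl_b_model(const uint8 a[16], const uint8 b[16],
                          uint8 out_r[16], uint8 out_l[16]) {
  for (int i = 0; i < 8; ++i) {
    out_r[2 * i + 0] = a[i];       // interleave the low (right) halves
    out_r[2 * i + 1] = b[i];
    out_l[2 * i + 0] = a[8 + i];   // interleave the high (left) halves
    out_l[2 * i + 1] = b[8 + i];
  }
}
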
void TransposeWx16_C(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
TransposeWx8_C(src, src_stride, dst, dst_stride, width);
TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
width);
}
void TransposeUVWx16_C(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width);
TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
dst_stride_a, (dst_b + 8), dst_stride_b, width);
}
void TransposeWx16_MSA(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
int x;
const uint8* s;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
for (x = 0; x < width; x += 16) {
s = src;
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
dst += dst_stride * 4;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
dst += dst_stride * 4;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
dst += dst_stride * 4;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
src += 16;
dst += dst_stride * 4;
}
}
void TransposeUVWx16_MSA(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
int x;
const uint8* s;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
for (x = 0; x < width; x += 8) {
s = src;
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
s += src_stride;
ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
ST_UB2(dst0, dst2, dst_a, dst_stride_a);
ST_UB2(dst1, dst3, dst_b, dst_stride_b);
dst_a += dst_stride_a * 2;
dst_b += dst_stride_b * 2;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
ST_UB2(dst0, dst2, dst_a, dst_stride_a);
ST_UB2(dst1, dst3, dst_b, dst_stride_b);
dst_a += dst_stride_a * 2;
dst_b += dst_stride_b * 2;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
ST_UB2(dst0, dst2, dst_a, dst_stride_a);
ST_UB2(dst1, dst3, dst_b, dst_stride_b);
dst_a += dst_stride_a * 2;
dst_b += dst_stride_b * 2;
res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
ST_UB2(dst0, dst2, dst_a, dst_stride_a);
ST_UB2(dst1, dst3, dst_b, dst_stride_b);
src += 16;
dst_a += dst_stride_a * 2;
dst_b += dst_stride_b * 2;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

View File

@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#include "libyuv/basic_types.h"
@ -21,38 +21,32 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
void TransposeWx8_NEON(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %5, #8 \n"
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %5, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d7}, [%0] \n"
"vtrn.8 d1, d0 \n"
@ -77,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0] \n"
"add %1, #8 \n" // src += 8
@ -99,180 +85,138 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"subs %5, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %5, #8 \n"
"beq 4f \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %5, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %5, #2 \n"
"blt 3f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %5, #2 \n"
"blt 3f \n"
"cmp %5, #4 \n"
"blt 2f \n"
"cmp %5, #4 \n"
"blt 2f \n"
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[1]}, [%0] \n"
// 4x8 block
"mov %0, %1 \n"
"vld1.32 {d0[0]}, [%0], %2 \n"
"vld1.32 {d0[1]}, [%0], %2 \n"
"vld1.32 {d1[0]}, [%0], %2 \n"
"vld1.32 {d1[1]}, [%0], %2 \n"
"vld1.32 {d2[0]}, [%0], %2 \n"
"vld1.32 {d2[1]}, [%0], %2 \n"
"vld1.32 {d3[0]}, [%0], %2 \n"
"vld1.32 {d3[1]}, [%0] \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
MEMACCESS(6)
"vld1.8 {q3}, [%6] \n"
"vld1.8 {q3}, [%6] \n"
"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
"vtbl.8 d0, {d2, d3}, d6 \n"
"vtbl.8 d1, {d2, d3}, d7 \n"
"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
"vtbl.8 d0, {d2, d3}, d6 \n"
"vtbl.8 d1, {d2, d3}, d7 \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
MEMACCESS(0)
"vst1.32 {d4[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d4[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[1]}, [%0] \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
"vst1.32 {d4[0]}, [%0], %4 \n"
"vst1.32 {d4[1]}, [%0], %4 \n"
"vst1.32 {d5[0]}, [%0], %4 \n"
"vst1.32 {d5[1]}, [%0] \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d0[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d0[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[1]}, [%0] \n"
"add %0, %3, #4 \n"
"vst1.32 {d0[0]}, [%0], %4 \n"
"vst1.32 {d0[1]}, [%0], %4 \n"
"vst1.32 {d1[0]}, [%0], %4 \n"
"vst1.32 {d1[1]}, [%0] \n"
"add %1, #4 \n" // src += 4
"add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
"subs %5, #4 \n" // w -= 4
"beq 4f \n"
"add %1, #4 \n" // src += 4
"add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
"subs %5, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %5, #2 \n"
"blt 3f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %5, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.16 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[3]}, [%0] \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"vld1.16 {d0[0]}, [%0], %2 \n"
"vld1.16 {d1[0]}, [%0], %2 \n"
"vld1.16 {d0[1]}, [%0], %2 \n"
"vld1.16 {d1[1]}, [%0], %2 \n"
"vld1.16 {d0[2]}, [%0], %2 \n"
"vld1.16 {d1[2]}, [%0], %2 \n"
"vld1.16 {d0[3]}, [%0], %2 \n"
"vld1.16 {d1[3]}, [%0] \n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d0, d1 \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0] \n"
"vst1.64 {d0}, [%0], %4 \n"
"vst1.64 {d1}, [%0] \n"
"add %1, #2 \n" // src += 2
"add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
"subs %5, #2 \n" // w -= 2
"beq 4f \n"
"add %1, #2 \n" // src += 2
"add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
"subs %5, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld1.8 {d0[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[7]}, [%1] \n"
// 1x8 block
"3: \n"
"vld1.8 {d0[0]}, [%1], %2 \n"
"vld1.8 {d0[1]}, [%1], %2 \n"
"vld1.8 {d0[2]}, [%1], %2 \n"
"vld1.8 {d0[3]}, [%1], %2 \n"
"vld1.8 {d0[4]}, [%1], %2 \n"
"vld1.8 {d0[5]}, [%1], %2 \n"
"vld1.8 {d0[6]}, [%1], %2 \n"
"vld1.8 {d0[7]}, [%1] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
"vst1.64 {d0}, [%3] \n"
"4: \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst), // %3
"+r"(dst_stride), // %4
"+r"(width) // %5
: "r"(&kVTbl4x4Transpose) // %6
: "memory", "cc", "q0", "q1", "q2", "q3"
);
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst), // %3
"+r"(dst_stride), // %4
"+r"(width) // %5
: "r"(&kVTbl4x4Transpose) // %6
: "memory", "cc", "q0", "q1", "q2", "q3");
}
static uvec8 kVTbl4x4TransposeDi =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
4, 12, 5, 13, 6, 14, 7, 15};
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
void TransposeUVWx8_NEON(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %7, #8 \n"
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %7, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d2, d3}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d4, d5}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d6, d7}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d16, d17}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d18, d19}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d20, d21}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d22, d23}, [%0] \n"
"vtrn.8 q1, q0 \n"
@ -301,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d18}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d16}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d22}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d20}, [%0] \n"
"mov %0, %5 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d19}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d17}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d23}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d21}, [%0] \n"
"add %1, #8*2 \n" // src += 8*2
@ -343,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"subs %7, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %7, #8 \n"
"beq 4f \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %7, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %7, #2 \n"
"blt 3f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %7, #2 \n"
"blt 3f \n"
"cmp %7, #4 \n"
"blt 2f \n"
"cmp %7, #4 \n"
"blt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.64 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d7}, [%0] \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
"vld1.64 {d0}, [%0], %2 \n"
"vld1.64 {d1}, [%0], %2 \n"
"vld1.64 {d2}, [%0], %2 \n"
"vld1.64 {d3}, [%0], %2 \n"
"vld1.64 {d4}, [%0], %2 \n"
"vld1.64 {d5}, [%0], %2 \n"
"vld1.64 {d6}, [%0], %2 \n"
"vld1.64 {d7}, [%0] \n"
MEMACCESS(8)
"vld1.8 {q15}, [%8] \n"
"vld1.8 {q15}, [%8] \n"
"vtrn.8 q0, q1 \n"
"vtrn.8 q2, q3 \n"
"vtrn.8 q0, q1 \n"
"vtrn.8 q2, q3 \n"
"vtbl.8 d16, {d0, d1}, d30 \n"
"vtbl.8 d17, {d0, d1}, d31 \n"
"vtbl.8 d18, {d2, d3}, d30 \n"
"vtbl.8 d19, {d2, d3}, d31 \n"
"vtbl.8 d20, {d4, d5}, d30 \n"
"vtbl.8 d21, {d4, d5}, d31 \n"
"vtbl.8 d22, {d6, d7}, d30 \n"
"vtbl.8 d23, {d6, d7}, d31 \n"
"vtbl.8 d16, {d0, d1}, d30 \n"
"vtbl.8 d17, {d0, d1}, d31 \n"
"vtbl.8 d18, {d2, d3}, d30 \n"
"vtbl.8 d19, {d2, d3}, d31 \n"
"vtbl.8 d20, {d4, d5}, d30 \n"
"vtbl.8 d21, {d4, d5}, d31 \n"
"vtbl.8 d22, {d6, d7}, d30 \n"
"vtbl.8 d23, {d6, d7}, d31 \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.32 {d16[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d16[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[1]}, [%0], %4 \n"
"vst1.32 {d16[0]}, [%0], %4 \n"
"vst1.32 {d16[1]}, [%0], %4 \n"
"vst1.32 {d17[0]}, [%0], %4 \n"
"vst1.32 {d17[1]}, [%0], %4 \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d20[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d20[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[1]}, [%0] \n"
"add %0, %3, #4 \n"
"vst1.32 {d20[0]}, [%0], %4 \n"
"vst1.32 {d20[1]}, [%0], %4 \n"
"vst1.32 {d21[0]}, [%0], %4 \n"
"vst1.32 {d21[1]}, [%0] \n"
"mov %0, %5 \n"
"mov %0, %5 \n"
MEMACCESS(0)
"vst1.32 {d18[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d18[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[1]}, [%0], %6 \n"
"vst1.32 {d18[0]}, [%0], %6 \n"
"vst1.32 {d18[1]}, [%0], %6 \n"
"vst1.32 {d19[0]}, [%0], %6 \n"
"vst1.32 {d19[1]}, [%0], %6 \n"
"add %0, %5, #4 \n"
MEMACCESS(0)
"vst1.32 {d22[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d22[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[1]}, [%0] \n"
"add %0, %5, #4 \n"
"vst1.32 {d22[0]}, [%0], %6 \n"
"vst1.32 {d22[1]}, [%0], %6 \n"
"vst1.32 {d23[0]}, [%0], %6 \n"
"vst1.32 {d23[1]}, [%0] \n"
"add %1, #4*2 \n" // src += 4 * 2
"add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %7, #4 \n" // w -= 4
"beq 4f \n"
"add %1, #4*2 \n" // src += 4 * 2
"add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
// dst_stride_a
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
// dst_stride_b
"subs %7, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %7, #2 \n"
"blt 3f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %7, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[3], d3[3]}, [%0] \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
"vld2.16 {d1[3], d3[3]}, [%0] \n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d2, d3 \n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d2, d3 \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d2}, [%0] \n"
"vst1.64 {d0}, [%0], %4 \n"
"vst1.64 {d2}, [%0] \n"
"mov %0, %5 \n"
"mov %0, %5 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.64 {d3}, [%0] \n"
"vst1.64 {d1}, [%0], %6 \n"
"vst1.64 {d3}, [%0] \n"
"add %1, #2*2 \n" // src += 2 * 2
"add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %7, #2 \n" // w -= 2
"beq 4f \n"
"add %1, #2*2 \n" // src += 2 * 2
"add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
// dst_stride_a
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
// dst_stride_b
"subs %7, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[7], d1[7]}, [%1] \n"
// 1x8 block
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
"vld2.8 {d0[7], d1[7]}, [%1] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
MEMACCESS(5)
"vst1.64 {d1}, [%5] \n"
"vst1.64 {d0}, [%3] \n"
"vst1.64 {d1}, [%5] \n"
"4: \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst_a), // %3
"+r"(dst_stride_a), // %4
"+r"(dst_b), // %5
"+r"(dst_stride_b), // %6
"+r"(width) // %7
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst_a), // %3
"+r"(dst_stride_a), // %4
"+r"(dst_b), // %5
"+r"(dst_stride_b), // %6
"+r"(width) // %7
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
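
Both NEON kernels above follow the same driver pattern: an 8-column main loop, then progressively smaller residual blocks. A compressed skeleton of that control flow (the SIMD bodies are elided and the names are illustrative):

// Control-flow skeleton of the NEON transpose kernels (residual handling).
static void TransposeWx8_Skeleton(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride, int width) {
  int w = width;
  while (w >= 8) {  // 8x8 blocks: the bulk of the plane
    /* vld1 / vtrn / vst1 8x8 transpose here */
    src += 8; dst += 8 * dst_stride; w -= 8;
  }
  if (w >= 4) {     // 4x8 residual block (the vtbl shuffle path)
    src += 4; dst += 4 * dst_stride; w -= 4;
  }
  if (w >= 2) {     // 2x8 residual block
    src += 2; dst += 2 * dst_stride; w -= 2;
  }
  if (w == 1) {     // final 1x8 column
    /* copy the last column */
  }
}
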

View File

@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#include "libyuv/basic_types.h"
@ -21,38 +21,32 @@ extern "C" {
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
void TransposeWx8_NEON(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
const uint8* src_temp;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %3, %3, #8 \n"
"sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
@ -84,62 +78,45 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v17.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v21.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v20.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v23.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
"subs %3, %3, #8 \n" // w -= 8
"subs %w3, %w3, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %3, %3, #8 \n"
"adds %w3, %w3, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %3, #2 \n"
"cmp %w3, #2 \n"
"b.lt 3f \n"
"cmp %3, #4 \n"
"cmp %w3, #4 \n"
"b.lt 2f \n"
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[3], [%0] \n"
"mov %0, %2 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%4] \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
@ -147,53 +124,37 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
MEMACCESS(0)
"st1 {v3.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[3], [%0] \n"
"add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v0.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[3], [%0] \n"
"add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %3, %3, #4 \n" // w -= 4
"subs %w3, %w3, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %3, #2 \n"
"cmp %w3, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[3], [%0] \n"
"trn2 v2.8b, v0.8b, v1.8b \n"
@ -201,36 +162,25 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v3.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v2.8b}, [%0] \n"
"add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %3, %3, #2 \n" // w -= 2
"subs %w3, %w3, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"ld1 {v0.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.8b}, [%2] \n"
"4: \n"
@ -238,7 +188,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width64) // %3
"+r"(width) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
@ -247,41 +197,35 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
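
The arm64 changes in this hunk drop the old `int64 width64` clang workaround and bind the plain `int` counter directly, using the `%w` operand modifier so the arithmetic runs on the 32-bit view of the register (`subs %w3, %w3, #8` and friends). A minimal standalone illustration of that constraint pattern (hypothetical example, not from the patched file):

// Counting down an int in aarch64 inline asm via the %w (32-bit) modifier.
static int count_down_by_8(int w) {
  asm volatile(
      "1:                            \n"
      "subs %w0, %w0, #8             \n"  // 32-bit subtract, sets flags
      "b.ge 1b                       \n"
      : "+r"(w)
      :
      : "cc");
  return w;
}
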
static uint8 kVTbl4x4TransposeDi[32] =
{ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
static uint8 kVTbl4x4TransposeDi[32] = {
0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
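
The 32-byte table drives a `tbl` lookup over the concatenated source registers: each output byte is fetched by index, which is how the 4x8 residual path gathers a transposed, de-interleaved block in one shuffle. A scalar model of that kind of lookup (assuming the usual NEON behaviour that out-of-range indices produce zero):

// Scalar model of a tbl-style byte gather from a concatenated register file.
static void tbl_model(const uint8* regs, int regs_len,
                      const uint8* indices, int n, uint8* out) {
  for (int i = 0; i < n; ++i) {
    out[i] = (indices[i] < regs_len) ? regs[indices[i]] : 0;
  }
}
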
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
void TransposeUVWx8_NEON(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int width) {
const uint8* src_temp;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %4, %4, #8 \n"
"sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.16b}, [%0] \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
@ -313,81 +257,56 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[1], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v20.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v20.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %4, %4, #8 \n" // w -= 8
"subs %w4, %w4, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %4, %4, #8 \n"
"adds %w4, %w4, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %4, #2 \n"
"cmp %w4, #2 \n"
"b.lt 3f \n"
"cmp %4, #4 \n"
"cmp %w4, #4 \n"
"b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n"
MEMACCESS(8)
"ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n"
@ -398,75 +317,51 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[3], [%0], %6 \n"
"add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v18.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[3], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v17.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[3], [%0], %7 \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"st1 {v19.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[3], [%0] \n"
"add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %4, %4, #4 \n" // w -= 4
"subs %w4, %w4, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %4, #2 \n"
"cmp %w4, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[3], [%0] \n"
"trn1 v4.8b, v0.8b, v2.8b \n"
@ -476,46 +371,32 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v4.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v6.d}[0], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v5.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v7.d}[0], [%0] \n"
"add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %4, %4, #2 \n" // w -= 2
"subs %w4, %w4, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.d}[0], [%2] \n"
MEMACCESS(3)
"st1 {v1.d}[0], [%3] \n"
"4: \n"
@ -524,7 +405,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width64) // %4
"+r"(width) // %4
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7

View File

@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@ -19,15 +19,17 @@ extern "C" {
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
int src_stride,
uint8* dst,
int dst_stride,
int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
@ -110,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
}
}
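
A note on the `[esp + 12 + 4]` style operands used throughout these `__declspec(naked)` kernels: the compiler emits no prologue, so after the three explicit pushes the arguments sit 12 bytes further from `esp` than usual. The layout for the SSSE3 kernel above (illustrative):

// Stack seen inside TransposeWx8_SSSE3 after "push edi / push esi / push ebp":
//   [esp +  0]  saved ebp
//   [esp +  4]  saved esi
//   [esp +  8]  saved edi
//   [esp + 12]  return address
//   [esp + 16]  src          == [esp + 12 + 4]
//   [esp + 20]  src_stride   == [esp + 12 + 8]
//   [esp + 24]  dst          == [esp + 12 + 12]
//   [esp + 28]  dst_stride   == [esp + 12 + 16]
//   [esp + 32]  width        == [esp + 12 + 20]
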
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
int src_stride,
uint8* dst_a,
int dst_stride_a,
uint8* dst_b,
int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
@ -133,9 +137,9 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
mov ecx, [ecx + 16 + 28] // w
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
@ -162,7 +166,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
@ -183,10 +187,11 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
@ -200,7 +205,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
@ -209,7 +214,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
@ -218,7 +223,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3

File diff suppressed because it is too large

File diff suppressed because it is too large

1721
third_party/yuv/source/row_dspr2.cc vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,782 +0,0 @@
/*
* Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
#ifdef HAS_COPYROW_MIPS
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
__asm__ __volatile__ (
".set noreorder \n"
".set noat \n"
"slti $at, %[count], 8 \n"
"bne $at ,$zero, $last8 \n"
"xor $t8, %[src], %[dst] \n"
"andi $t8, $t8, 0x3 \n"
"bne $t8, $zero, unaligned \n"
"negu $a3, %[dst] \n"
// make dst/src aligned
"andi $a3, $a3, 0x3 \n"
"beq $a3, $zero, $chk16w \n"
// word-aligned now count is the remining bytes count
"subu %[count], %[count], $a3 \n"
"lwr $t8, 0(%[src]) \n"
"addu %[src], %[src], $a3 \n"
"swr $t8, 0(%[dst]) \n"
"addu %[dst], %[dst], $a3 \n"
// Now the dst/src are mutually word-aligned with word-aligned addresses
"$chk16w: \n"
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
// t8 is the byte count after 64-byte chunks
"beq %[count], $t8, chk8w \n"
// There will be at most 1 32-byte chunk after it
"subu $a3, %[count], $t8 \n" // the reminder
// Here a3 counts bytes in 16w chunks
"addu $a3, %[dst], $a3 \n"
// Now a3 is the final dst after 64-byte chunks
"addu $t0, %[dst], %[count] \n"
// t0 is the "past the end" address
// When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
// the "t0-32" address
// This means: for x=128 the last "safe" a1 address is "t0-160"
// Alternatively, for x=64 the last "safe" a1 address is "t0-96"
// we will use "pref 30,128(a1)", so "t0-160" is the limit
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line of src
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
// In case the a1 > t9 don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $loop16w \n"
"nop \n"
// otherwise, start with using pref30
"pref 30, 64(%[dst]) \n"
"$loop16w: \n"
"pref 0, 96(%[src]) \n"
"lw $t0, 0(%[src]) \n"
"bgtz $v1, $skip_pref30_96 \n" // skip
"lw $t1, 4(%[src]) \n"
"pref 30, 96(%[dst]) \n" // continue
"$skip_pref30_96: \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
"lw $t4, 16(%[src]) \n"
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
"pref 0, 128(%[src]) \n"
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"lw $t0, 32(%[src]) \n"
"bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
"lw $t1, 36(%[src]) \n"
"pref 30, 128(%[dst]) \n" // set dest, addr 128
"$skip_pref30_128: \n"
"lw $t2, 40(%[src]) \n"
"lw $t3, 44(%[src]) \n"
"lw $t4, 48(%[src]) \n"
"lw $t5, 52(%[src]) \n"
"lw $t6, 56(%[src]) \n"
"lw $t7, 60(%[src]) \n"
"pref 0, 160(%[src]) \n"
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
"sw $t2, 40(%[dst]) \n"
"sw $t3, 44(%[dst]) \n"
"sw $t4, 48(%[dst]) \n"
"sw $t5, 52(%[dst]) \n"
"sw $t6, 56(%[dst]) \n"
"sw $t7, 60(%[dst]) \n"
"addiu %[dst], %[dst], 64 \n" // adding 64 to dest
"sgtu $v1, %[dst], $t9 \n"
"bne %[dst], $a3, $loop16w \n"
" addiu %[src], %[src], 64 \n" // adding 64 to src
"move %[count], $t8 \n"
// Here we have src and dest word-aligned but less than 64-bytes to go
"chk8w: \n"
"pref 0, 0x0(%[src]) \n"
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// the t8 is the reminder count past 32-bytes
"beq %[count], $t8, chk1w \n"
// count=t8,no 32-byte chunk
" nop \n"
"lw $t0, 0(%[src]) \n"
"lw $t1, 4(%[src]) \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
"lw $t4, 16(%[src]) \n"
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
"addiu %[src], %[src], 32 \n"
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"addiu %[dst], %[dst], 32 \n"
"chk1w: \n"
"andi %[count], $t8, 0x3 \n"
// now count is the reminder past 1w chunks
"beq %[count], $t8, $last8 \n"
" subu $a3, $t8, %[count] \n"
// a3 is count of bytes in 1w chunks
"addu $a3, %[dst], $a3 \n"
// now a3 is the dst address past the 1w chunks
// copying in words (4-byte chunks)
"$wordCopy_loop: \n"
"lw $t3, 0(%[src]) \n"
// the first t3 may be equal t0 ... optimize?
"addiu %[src], %[src],4 \n"
"addiu %[dst], %[dst],4 \n"
"bne %[dst], $a3,$wordCopy_loop \n"
" sw $t3, -4(%[dst]) \n"
// For the last (<8) bytes
"$last8: \n"
"blez %[count], leave \n"
" addu $a3, %[dst], %[count] \n" // a3 -last dst address
"$last8loop: \n"
"lb $v1, 0(%[src]) \n"
"addiu %[src], %[src], 1 \n"
"addiu %[dst], %[dst], 1 \n"
"bne %[dst], $a3, $last8loop \n"
" sb $v1, -1(%[dst]) \n"
"leave: \n"
" j $ra \n"
" nop \n"
//
// UNALIGNED case
//
"unaligned: \n"
// got here with a3="negu a1"
"andi $a3, $a3, 0x3 \n" // a1 is word aligned?
"beqz $a3, $ua_chk16w \n"
" subu %[count], %[count], $a3 \n"
// bytes left after initial a3 bytes
"lwr $v1, 0(%[src]) \n"
"lwl $v1, 3(%[src]) \n"
"addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
"swr $v1, 0(%[dst]) \n"
"addu %[dst], %[dst], $a3 \n"
// below the dst will be word aligned (NOTE1)
"$ua_chk16w: \n"
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
// t8 is the byte count after 64-byte chunks
"beq %[count], $t8, ua_chk8w \n"
// if a2==t8, no 64-byte chunks
// There will be at most 1 32-byte chunk after it
"subu $a3, %[count], $t8 \n" // the reminder
// Here a3 counts bytes in 16w chunks
"addu $a3, %[dst], $a3 \n"
// Now a3 is the final dst after 64-byte chunks
"addu $t0, %[dst], %[count] \n" // t0 "past the end"
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line addr 32
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
// safe, as we have at least 64 bytes ahead
// In case the a1 > t9 don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $ua_loop16w \n"
// skip "pref 30,64(a1)" for too short arrays
" nop \n"
// otherwise, start with using pref30
"pref 30, 64(%[dst]) \n"
"$ua_loop16w: \n"
"pref 0, 96(%[src]) \n"
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"bgtz $v1, $ua_skip_pref30_96 \n"
" lwl $t1, 7(%[src]) \n"
"pref 30, 96(%[dst]) \n"
// continue setting up the dest, addr 96
"$ua_skip_pref30_96: \n"
"lwr $t2, 8(%[src]) \n"
"lwl $t2, 11(%[src]) \n"
"lwr $t3, 12(%[src]) \n"
"lwl $t3, 15(%[src]) \n"
"lwr $t4, 16(%[src]) \n"
"lwl $t4, 19(%[src]) \n"
"lwr $t5, 20(%[src]) \n"
"lwl $t5, 23(%[src]) \n"
"lwr $t6, 24(%[src]) \n"
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
"pref 0, 128(%[src]) \n"
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"lwr $t0, 32(%[src]) \n"
"lwl $t0, 35(%[src]) \n"
"lwr $t1, 36(%[src]) \n"
"bgtz $v1, ua_skip_pref30_128 \n"
" lwl $t1, 39(%[src]) \n"
"pref 30, 128(%[dst]) \n"
// continue setting up the dest, addr 128
"ua_skip_pref30_128: \n"
"lwr $t2, 40(%[src]) \n"
"lwl $t2, 43(%[src]) \n"
"lwr $t3, 44(%[src]) \n"
"lwl $t3, 47(%[src]) \n"
"lwr $t4, 48(%[src]) \n"
"lwl $t4, 51(%[src]) \n"
"lwr $t5, 52(%[src]) \n"
"lwl $t5, 55(%[src]) \n"
"lwr $t6, 56(%[src]) \n"
"lwl $t6, 59(%[src]) \n"
"lwr $t7, 60(%[src]) \n"
"lwl $t7, 63(%[src]) \n"
"pref 0, 160(%[src]) \n"
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
"sw $t2, 40(%[dst]) \n"
"sw $t3, 44(%[dst]) \n"
"sw $t4, 48(%[dst]) \n"
"sw $t5, 52(%[dst]) \n"
"sw $t6, 56(%[dst]) \n"
"sw $t7, 60(%[dst]) \n"
"addiu %[dst],%[dst],64 \n" // adding 64 to dest
"sgtu $v1,%[dst],$t9 \n"
"bne %[dst],$a3,$ua_loop16w \n"
" addiu %[src],%[src],64 \n" // adding 64 to src
"move %[count],$t8 \n"
// Here we have src and dest word-aligned but less than 64-bytes to go
"ua_chk8w: \n"
"pref 0, 0x0(%[src]) \n"
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// the t8 is the reminder count
"beq %[count], $t8, $ua_chk1w \n"
// when count==t8, no 32-byte chunk
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"lwl $t1, 7(%[src]) \n"
"lwr $t2, 8(%[src]) \n"
"lwl $t2, 11(%[src]) \n"
"lwr $t3, 12(%[src]) \n"
"lwl $t3, 15(%[src]) \n"
"lwr $t4, 16(%[src]) \n"
"lwl $t4, 19(%[src]) \n"
"lwr $t5, 20(%[src]) \n"
"lwl $t5, 23(%[src]) \n"
"lwr $t6, 24(%[src]) \n"
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
"addiu %[src], %[src], 32 \n"
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"addiu %[dst], %[dst], 32 \n"
"$ua_chk1w: \n"
"andi %[count], $t8, 0x3 \n"
// now count is the reminder past 1w chunks
"beq %[count], $t8, ua_smallCopy \n"
"subu $a3, $t8, %[count] \n"
// a3 is count of bytes in 1w chunks
"addu $a3, %[dst], $a3 \n"
// now a3 is the dst address past the 1w chunks
// copying in words (4-byte chunks)
"$ua_wordCopy_loop: \n"
"lwr $v1, 0(%[src]) \n"
"lwl $v1, 3(%[src]) \n"
"addiu %[src], %[src], 4 \n"
"addiu %[dst], %[dst], 4 \n"
// note: dst=a1 is word aligned here, see NOTE1
"bne %[dst], $a3, $ua_wordCopy_loop \n"
" sw $v1,-4(%[dst]) \n"
// Now less than 4 bytes (value in count) left to copy
"ua_smallCopy: \n"
"beqz %[count], leave \n"
" addu $a3, %[dst], %[count] \n" // a3 = last dst address
"$ua_smallCopy_loop: \n"
"lb $v1, 0(%[src]) \n"
"addiu %[src], %[src], 1 \n"
"addiu %[dst], %[dst], 1 \n"
"bne %[dst],$a3,$ua_smallCopy_loop \n"
" sb $v1, -1(%[dst]) \n"
"j $ra \n"
" nop \n"
".set at \n"
".set reorder \n"
: [dst] "+r" (dst), [src] "+r" (src)
: [count] "r" (count)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
"t8", "t9", "a3", "v1", "at"
);
}
#endif // HAS_COPYROW_MIPS
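
The removed CopyRow_MIPS above stages the copy as: byte-align the destination, stream 64-byte chunks with `pref` cache hints, then fall back to 32-byte, word and byte tails, with a separate path for mutually misaligned buffers using `lwr`/`lwl`. A structural sketch of that staging in plain C (illustrative only; it omits the prefetching and the unaligned path):

#include <stdint.h>
#include <string.h>

// Structural sketch of the staged row copy (alignment, big chunks, tails).
static void CopyRow_Sketch(const uint8_t* src, uint8_t* dst, int count) {
  while (count > 0 && ((uintptr_t)dst & 3)) {  // align dst to a word
    *dst++ = *src++;
    --count;
  }
  while (count >= 64) {  // 64-byte chunks; the asm prefetches ahead here
    memcpy(dst, src, 64);
    dst += 64; src += 64; count -= 64;
  }
  while (count >= 4) {   // word-sized tail
    memcpy(dst, src, 4);
    dst += 4; src += 4; count -= 4;
  }
  while (count > 0) {    // final bytes
    *dst++ = *src++;
    --count;
  }
}
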
// DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
"1: \n"
"addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
"lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
"lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
"lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
"lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
"lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
"lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
"lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
"addiu %[src_uv], %[src_uv], 32 \n"
"precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
"precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
"precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
"precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
"precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
"precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
"precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
"precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
"sw $t9, 0(%[dst_v]) \n"
"sw $t0, 0(%[dst_u]) \n"
"sw $t1, 4(%[dst_v]) \n"
"sw $t2, 4(%[dst_u]) \n"
"sw $t3, 8(%[dst_v]) \n"
"sw $t5, 8(%[dst_u]) \n"
"sw $t6, 12(%[dst_v]) \n"
"sw $t7, 12(%[dst_u]) \n"
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz $t4, 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[width], 3f \n"
" nop \n"
"2: \n"
"lbu $t0, 0(%[src_uv]) \n"
"lbu $t1, 1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], 2 \n"
"addiu %[width], %[width], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[width], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[width] "+r" (width),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6", "t7", "t8", "t9"
);
}
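// For reference, a scalar sketch of the deinterleave that the precrq.qb.ph/precr.qb.ph
// pairs above perform four UV pairs at a time. SplitUVRow_Sketch_C is a hypothetical
// name for illustration; libyuv's portable kernel is SplitUVRow_C in row_common.cc.
static void SplitUVRow_Sketch_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                                int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i];      // even bytes are U
    dst_v[i] = src_uv[2 * i + 1];  // odd bytes are V
  }
}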
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"andi $t5, %[width], 0xf \n"
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
"lw $t2, -8(%[src]) \n" // |11|10|9|8|
"lw $t3, -4(%[src]) \n" // |15|14|13|12|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t1, $t1 \n" // |6|7|4|5|
"wsbh $t2, $t2 \n" // |10|11|8|9|
"wsbh $t3, $t3 \n" // |14|15|12|13|
"rotr $t0, $t0, 16 \n" // |0|1|2|3|
"rotr $t1, $t1, 16 \n" // |4|5|6|7|
"rotr $t2, $t2, 16 \n" // |8|9|10|11|
"rotr $t3, $t3, 16 \n" // |12|13|14|15|
"addiu %[src], %[src], -16 \n"
"addiu $t4, $t4, -1 \n"
"sw $t3, 0(%[dst]) \n" // |15|14|13|12|
"sw $t2, 4(%[dst]) \n" // |11|10|9|8|
"sw $t1, 8(%[dst]) \n" // |7|6|5|4|
"sw $t0, 12(%[dst]) \n" // |3|2|1|0|
"bgtz $t4, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"beqz $t5, 3f \n"
" nop \n"
"2: \n"
"lbu $t0, -1(%[src]) \n"
"addiu $t5, $t5, -1 \n"
"addiu %[src], %[src], -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgez $t5, 2b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src] "+r" (src), [dst] "+r" (dst)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4", "t5"
);
}
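// The wsbh + rotr 16 pair above reverses the four bytes of each 32-bit word, so a
// 16-byte block is mirrored by storing the reversed words in the opposite order.
// Scalar model of the pair (reverse_bytes32 is a hypothetical helper, not libyuv API):
static __inline uint32 reverse_bytes32(uint32 w) {
  uint32 h = ((w & 0x00ff00ffu) << 8) | ((w & 0xff00ff00u) >> 8);  // wsbh: swap bytes within halfwords
  return (h << 16) | (h >> 16);                                    // rotr 16: swap the halfwords
}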
void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x;
int y;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"addu $t4, %[width], %[width] \n"
"srl %[x], %[width], 4 \n"
"andi %[y], %[width], 0xf \n"
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
"lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
"lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
"lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
"lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
"lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
"lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
"rotr $t0, $t0, 16 \n" // |1|0|3|2|
"rotr $t1, $t1, 16 \n" // |5|4|7|6|
"rotr $t2, $t2, 16 \n" // |9|8|11|10|
"rotr $t3, $t3, 16 \n" // |13|12|15|14|
"rotr $t4, $t4, 16 \n" // |17|16|19|18|
"rotr $t6, $t6, 16 \n" // |21|20|23|22|
"rotr $t7, $t7, 16 \n" // |25|24|27|26|
"rotr $t8, $t8, 16 \n" // |29|28|31|30|
"precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
"precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
"precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
"precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
"precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
"precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
"precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
"precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
"addiu %[src_uv], %[src_uv], -32 \n"
"addiu %[x], %[x], -1 \n"
"swr $t4, 0(%[dst_u]) \n"
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
"swr $t6, 0(%[dst_v]) \n"
"swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
"swr $t3, 4(%[dst_v]) \n"
"swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
"swr $t0, 8(%[dst_u]) \n"
"swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
"swr $t1, 8(%[dst_v]) \n"
"swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
"swr $t9, 12(%[dst_u]) \n"
"swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
"swr $t5, 12(%[dst_v]) \n"
"swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz %[x], 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[y], 3f \n"
" nop \n"
"b 2f \n"
" nop \n"
"2: \n"
"lbu $t0, -2(%[src_uv]) \n"
"lbu $t1, -1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], -2 \n"
"addiu %[y], %[y], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[y], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "=&r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9"
);
}
// Convert 4 Y and 2 UV samples of I422 and arrange the RGB values into
// t5 = | 0 | B0 | 0 | b0 |
// t4 = | 0 | B1 | 0 | b1 |
// t9 = | 0 | G0 | 0 | g0 |
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define YUVTORGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
"preceu.ph.qbr $t1, $t1 \n" \
"preceu.ph.qbr $t2, $t2 \n" \
"preceu.ph.qbra $t3, $t0 \n" \
"preceu.ph.qbla $t0, $t0 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t3, $t3, $s4 \n" \
"subu.ph $t0, $t0, $s4 \n" \
"mul.ph $t3, $t3, $s0 \n" \
"mul.ph $t0, $t0, $s0 \n" \
"shll.ph $t4, $t1, 0x7 \n" \
"subu.ph $t4, $t4, $t1 \n" \
"mul.ph $t6, $t1, $s1 \n" \
"mul.ph $t1, $t2, $s2 \n" \
"addq_s.ph $t5, $t4, $t3 \n" \
"addq_s.ph $t4, $t4, $t0 \n" \
"shra.ph $t5, $t5, 6 \n" \
"shra.ph $t4, $t4, 6 \n" \
"addiu %[u_buf], 2 \n" \
"addiu %[v_buf], 2 \n" \
"addu.ph $t6, $t6, $t1 \n" \
"mul.ph $t1, $t2, $s3 \n" \
"addu.ph $t9, $t6, $t3 \n" \
"addu.ph $t8, $t6, $t0 \n" \
"shra.ph $t9, $t9, 6 \n" \
"shra.ph $t8, $t8, 6 \n" \
"addu.ph $t2, $t1, $t3 \n" \
"addu.ph $t1, $t1, $t0 \n" \
"shra.ph $t2, $t2, 6 \n" \
"shra.ph $t1, $t1, 6 \n" \
"subu.ph $t5, $t5, $s5 \n" \
"subu.ph $t4, $t4, $s5 \n" \
"subu.ph $t9, $t9, $s5 \n" \
"subu.ph $t8, $t8, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"shll_s.ph $t5, $t5, 8 \n" \
"shll_s.ph $t4, $t4, 8 \n" \
"shll_s.ph $t9, $t9, 8 \n" \
"shll_s.ph $t8, $t8, 8 \n" \
"shll_s.ph $t2, $t2, 8 \n" \
"shll_s.ph $t1, $t1, 8 \n" \
"shra.ph $t5, $t5, 8 \n" \
"shra.ph $t4, $t4, 8 \n" \
"shra.ph $t9, $t9, 8 \n" \
"shra.ph $t8, $t8, 8 \n" \
"shra.ph $t2, $t2, 8 \n" \
"shra.ph $t1, $t1, 8 \n" \
"addu.ph $t5, $t5, $s5 \n" \
"addu.ph $t4, $t4, $s5 \n" \
"addu.ph $t9, $t9, $s5 \n" \
"addu.ph $t8, $t8, $s5 \n" \
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
// TODO(fbarchard): accept yuv conversion constants.
void I422ToARGBRow_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128| // clipping
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
"1: \n"
YUVTORGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
"addiu %[width], -4 \n"
"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
// Bilinear filter 8x2 -> 8x1
void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n"
"1: \n"
"lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n"
"lw $t4, 4(%[src_ptr]) \n"
"lw $t5, 4(%[src_ptr1]) \n"
"muleu_s.ph.qbl $t6, $t2, $t0 \n"
"muleu_s.ph.qbr $t7, $t2, $t0 \n"
"muleu_s.ph.qbl $t8, $t3, $t1 \n"
"muleu_s.ph.qbr $t9, $t3, $t1 \n"
"muleu_s.ph.qbl $t2, $t4, $t0 \n"
"muleu_s.ph.qbr $t3, $t4, $t0 \n"
"muleu_s.ph.qbl $t4, $t5, $t1 \n"
"muleu_s.ph.qbr $t5, $t5, $t1 \n"
"addq.ph $t6, $t6, $t8 \n"
"addq.ph $t7, $t7, $t9 \n"
"addq.ph $t2, $t2, $t4 \n"
"addq.ph $t3, $t3, $t5 \n"
"shra.ph $t6, $t6, 8 \n"
"shra.ph $t7, $t7, 8 \n"
"shra.ph $t2, $t2, 8 \n"
"shra.ph $t3, $t3, 8 \n"
"precr.qb.ph $t6, $t6, $t7 \n"
"precr.qb.ph $t2, $t2, $t3 \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[src_ptr1], %[src_ptr1], 8 \n"
"addiu %[dst_width], %[dst_width], -8 \n"
"sw $t6, 0(%[dst_ptr]) \n"
"sw $t2, 4(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[dst_ptr], %[dst_ptr], 8 \n"
".set pop \n"
: [dst_ptr] "+r" (dst_ptr),
[src_ptr1] "+r" (src_ptr1),
[src_ptr] "+r" (src_ptr),
[dst_width] "+r" (dst_width)
: [source_y_fraction] "r" (source_y_fraction),
[y0_fraction] "r" (y0_fraction),
[src_stride] "r" (src_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
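// Scalar sketch of the per-byte blend computed above: an 8.8 fixed-point weighted
// average of the two source rows, truncated (the DSPR2 path adds no rounding term).
// interpolate_row_sketch is a hypothetical name; the portable kernel is
// InterpolateRow_C in row_common.cc.
static void interpolate_row_sketch(uint8* dst, const uint8* src0, const uint8* src1,
                                   int width, int source_y_fraction) {
  int y0_fraction = 256 - source_y_fraction;
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = (uint8)((src0[i] * y0_fraction + src1[i] * source_y_fraction) >> 8);
  }
}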
#endif // __mips_dsp_rev >= 2
#endif // defined(__mips__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

third_party/yuv/source/row_msa.cc (new vendored file, 2977 lines) and several other large files: diffs suppressed because they are too large.

@ -19,16 +19,16 @@ extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
int dst_width, int x, int dx) { \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, \
dst_width & MASK, x + n * dx, dx); \
}
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, \
int dx) { \
int r = dst_width & MASK; \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
}
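// For reference, CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C,
// 1, 7) expands to roughly the following (hand-expanded sketch, kept in comments so it
// does not duplicate the generated function): the SIMD kernel covers the largest multiple
// of 8 columns and the C kernel finishes the remainder, restarted at x + n * dx so the
// fixed-point sampling position stays continuous.
//
//   void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
//                                 int dst_width, int x, int dx) {
//     int r = dst_width & 7;
//     int n = dst_width & ~7;
//     if (n > 0) {
//       ScaleFilterCols_NEON(dst_ptr, src_ptr, n, x, dx);
//     }
//     ScaleFilterCols_C(dst_ptr + n * 1, src_ptr, r, x + n * dx, dx);  // BPP == 1
//   }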
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
@ -37,167 +37,379 @@ CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C, 4, 3)
CANY(ScaleARGBFilterCols_Any_NEON,
ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C,
4,
3)
#endif
#undef CANY
// Fixed scale down.
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
// Mask may be non-power of 2, so use MOD
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
// Fixed scale down for odd source width. Used by I420Blend subsampling.
// Since dst_width is (width + 1) / 2, this function scales one less pixel
// and copies the last pixel.
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
int dst_width) { \
int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
int n = (dst_width - 1) - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r + 1); \
}
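// Worked example of the SDANY/SDODD remainder split (illustrative numbers, not from the
// source): ScaleRowDown2_Any_SSSE3 below has FACTOR = 2, BPP = 1, MASK = 15, so for
// dst_width = 100, r = 100 % 16 = 4 and n = 96; the SSSE3 kernel produces 96 output
// pixels from src_ptr[0..191] and ScaleRowDown2_C finishes 4 pixels starting at
// src_ptr + 96 * 2. SDODD instead derives r from dst_width - 1 and hands the C kernel
// r + 1 pixels, so the final copied pixel of an odd-width source always comes from the
// C path.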
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
2, 1, 15)
SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
ScaleRowDown2Box_Odd_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3,
ScaleRowDown2Linear_SSSE3,
ScaleRowDown2Linear_C,
2,
1,
15)
SDANY(ScaleRowDown2Box_Any_SSSE3,
ScaleRowDown2Box_SSSE3,
ScaleRowDown2Box_C,
2,
1,
15)
SDODD(ScaleRowDown2Box_Odd_SSSE3,
ScaleRowDown2Box_SSSE3,
ScaleRowDown2Box_Odd_C,
2,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31)
SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C,
2,
1,
31)
SDANY(ScaleRowDown2Box_Any_AVX2,
ScaleRowDown2Box_AVX2,
ScaleRowDown2Box_C,
2,
1,
31)
SDODD(ScaleRowDown2Box_Odd_AVX2,
ScaleRowDown2Box_AVX2,
ScaleRowDown2Box_Odd_C,
2,
1,
31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C, 2, 1, 15)
SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_Odd_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_NEON,
ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C,
2,
1,
15)
SDANY(ScaleRowDown2Box_Any_NEON,
ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C,
2,
1,
15)
SDODD(ScaleRowDown2Box_Odd_NEON,
ScaleRowDown2Box_NEON,
ScaleRowDown2Box_Odd_C,
2,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
ScaleRowDown2Linear_MSA,
ScaleRowDown2Linear_C,
2,
1,
31)
SDANY(ScaleRowDown2Box_Any_MSA,
ScaleRowDown2Box_MSA,
ScaleRowDown2Box_C,
2,
1,
31)
#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3,
ScaleRowDown4Box_SSSE3,
ScaleRowDown4Box_C,
4,
1,
7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
4, 1, 15)
SDANY(ScaleRowDown4Box_Any_AVX2,
ScaleRowDown4Box_AVX2,
ScaleRowDown4Box_C,
4,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
4, 1, 7)
SDANY(ScaleRowDown4Box_Any_NEON,
ScaleRowDown4Box_NEON,
ScaleRowDown4Box_C,
4,
1,
7)
#endif
#ifdef HAS_SCALEROWDOWN4_MSA
SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_MSA,
ScaleRowDown4Box_MSA,
ScaleRowDown4Box_C,
4,
1,
15)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_Any_SSSE3,
ScaleRowDown34_SSSE3,
ScaleRowDown34_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
ScaleRowDown34_0_Box_SSSE3,
ScaleRowDown34_0_Box_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
ScaleRowDown34_1_Box_SSSE3,
ScaleRowDown34_1_Box_C,
4 / 3,
1,
23)
#endif
#ifdef HAS_SCALEROWDOWN34_NEON
SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_Any_NEON,
ScaleRowDown34_NEON,
ScaleRowDown34_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_0_Box_Any_NEON,
ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_1_Box_Any_NEON,
ScaleRowDown34_1_Box_NEON,
ScaleRowDown34_1_Box_C,
4 / 3,
1,
23)
#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
SDANY(ScaleRowDown38_Any_SSSE3,
ScaleRowDown38_SSSE3,
ScaleRowDown38_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
ScaleRowDown38_3_Box_SSSE3,
ScaleRowDown38_3_Box_C,
8 / 3,
1,
5)
SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
ScaleRowDown38_2_Box_SSSE3,
ScaleRowDown38_2_Box_C,
8 / 3,
1,
5)
#endif
#ifdef HAS_SCALEROWDOWN38_NEON
SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_Any_NEON,
ScaleRowDown38_NEON,
ScaleRowDown38_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_3_Box_Any_NEON,
ScaleRowDown38_3_Box_NEON,
ScaleRowDown38_3_Box_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_2_Box_Any_NEON,
ScaleRowDown38_2_Box_NEON,
ScaleRowDown38_2_Box_C,
8 / 3,
1,
11)
#endif
#ifdef HAS_SCALEROWDOWN38_MSA
SDANY(ScaleRowDown38_Any_MSA,
ScaleRowDown38_MSA,
ScaleRowDown38_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_3_Box_Any_MSA,
ScaleRowDown38_3_Box_MSA,
ScaleRowDown38_3_Box_C,
8 / 3,
1,
11)
SDANY(ScaleRowDown38_2_Box_Any_MSA,
ScaleRowDown38_2_Box_MSA,
ScaleRowDown38_2_Box_C,
8 / 3,
1,
11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
ScaleARGBRowDown2_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
ScaleARGBRowDown2Linear_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
ScaleARGBRowDown2Box_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2_Any_SSE2,
ScaleARGBRowDown2_SSE2,
ScaleARGBRowDown2_C,
2,
4,
3)
SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
ScaleARGBRowDown2Linear_SSE2,
ScaleARGBRowDown2Linear_C,
2,
4,
3)
SDANY(ScaleARGBRowDown2Box_Any_SSE2,
ScaleARGBRowDown2Box_SSE2,
ScaleARGBRowDown2Box_C,
2,
4,
3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_NEON
SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
ScaleARGBRowDown2_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
ScaleARGBRowDown2Linear_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
ScaleARGBRowDown2Box_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2_Any_NEON,
ScaleARGBRowDown2_NEON,
ScaleARGBRowDown2_C,
2,
4,
7)
SDANY(ScaleARGBRowDown2Linear_Any_NEON,
ScaleARGBRowDown2Linear_NEON,
ScaleARGBRowDown2Linear_C,
2,
4,
7)
SDANY(ScaleARGBRowDown2Box_Any_NEON,
ScaleARGBRowDown2Box_NEON,
ScaleARGBRowDown2Box_C,
2,
4,
7)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_MSA
SDANY(ScaleARGBRowDown2_Any_MSA,
ScaleARGBRowDown2_MSA,
ScaleARGBRowDown2_C,
2,
4,
3)
SDANY(ScaleARGBRowDown2Linear_Any_MSA,
ScaleARGBRowDown2Linear_MSA,
ScaleARGBRowDown2Linear_C,
2,
4,
3)
SDANY(ScaleARGBRowDown2Box_Any_MSA,
ScaleARGBRowDown2Box_MSA,
ScaleARGBRowDown2Box_C,
2,
4,
3)
#endif
#undef SDANY
// Scale down by even scale factor.
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
src_stepx, dst_ptr + n * BPP, r); \
}
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
uint8* dst_ptr, int dst_width) { \
int r = dst_width & MASK; \
int n = dst_width & ~MASK; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
ScaleARGBRowDownEvenBox_C, 4, 3)
SDAANY(ScaleARGBRowDownEven_Any_SSE2,
ScaleARGBRowDownEven_SSE2,
ScaleARGBRowDownEven_C,
4,
3)
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
ScaleARGBRowDownEvenBox_SSE2,
ScaleARGBRowDownEvenBox_C,
4,
3)
#endif
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
ScaleARGBRowDownEvenBox_C, 4, 3)
SDAANY(ScaleARGBRowDownEven_Any_NEON,
ScaleARGBRowDownEven_NEON,
ScaleARGBRowDownEven_C,
4,
3)
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
ScaleARGBRowDownEvenBox_NEON,
ScaleARGBRowDownEvenBox_C,
4,
3)
#endif
#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
SDAANY(ScaleARGBRowDownEven_Any_MSA,
ScaleARGBRowDownEven_MSA,
ScaleARGBRowDownEven_C,
4,
3)
SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
ScaleARGBRowDownEvenBox_MSA,
ScaleARGBRowDownEvenBox_C,
4,
3)
#endif
// Add rows for box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
int n = src_width & ~MASK; \
if (n > 0) { \
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
} \
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
int n = src_width & ~MASK; \
if (n > 0) { \
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
} \
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
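// ScaleAddRow accumulates one row of source bytes into a uint16 row that the box filter
// later averages. Scalar sketch (scale_add_row_sketch is a hypothetical name and dst_ptr
// is assumed to start zeroed; the portable kernel is ScaleAddRow_C in scale_common.cc):
static void scale_add_row_sketch(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  int i;
  for (i = 0; i < src_width; ++i) {
    dst_ptr[i] += src_ptr[i];  // accumulate this row into the running column sums
  }
}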
#ifdef HAS_SCALEADDROW_SSE2
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
@ -208,14 +420,15 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROW_MSA
SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROW_DSPR2
SAANY(ScaleAddRow_Any_DSPR2, ScaleAddRow_DSPR2, ScaleAddRow_C, 15)
#endif
#undef SAANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@ -30,20 +30,31 @@ static __inline int Abs(int v) {
// ScaleARGB ARGB, 1/2
// This is an optimized version for scaling an ARGB image down to 1/2 of
// its original size.
static void ScaleARGBDown2(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy,
static void ScaleARGBDown2(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
filtering == kFilterNone ? ScaleARGBRowDown2_C :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
ScaleARGBRowDown2Box_C);
assert(dx == 65536 * 2); // Test scale factor of 2.
filtering == kFilterNone
? ScaleARGBRowDown2_C
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
: ScaleARGBRowDown2Box_C);
(void)src_width;
(void)src_height;
(void)dx;
assert(dx == 65536 * 2); // Test scale factor of 2.
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row, even column.
if (filtering == kFilterBilinear) {
@ -54,25 +65,49 @@ static void ScaleARGBDown2(int src_width, int src_height,
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
ScaleARGBRowDown2Box_Any_SSE2);
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_Any_SSE2
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
: ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_SSE2
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
: ScaleARGBRowDown2Box_SSE2);
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
ScaleARGBRowDown2Box_Any_NEON);
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_Any_NEON
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
: ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
ScaleARGBRowDown2Box_NEON);
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_NEON
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
: ScaleARGBRowDown2Box_NEON);
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_Any_MSA
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
: ScaleARGBRowDown2Box_Any_MSA);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
? ScaleARGBRowDown2_MSA
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
: ScaleARGBRowDown2Box_MSA);
}
}
#endif
@ -90,21 +125,32 @@ static void ScaleARGBDown2(int src_width, int src_height,
// ScaleARGB ARGB, 1/4
// This is an optimized version for scaling an ARGB image down to 1/4 of
// its original size.
static void ScaleARGBDown4Box(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy) {
static void ScaleARGBDown4Box(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy) {
int j;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
uint8* dst_argb, int dst_width) =
ScaleARGBRowDown2Box_C;
// Advance to odd row, even column.
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
assert(dx == 65536 * 4); // Test scale factor of 4.
(void)src_width;
(void)src_height;
(void)dx;
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@ -125,8 +171,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
row + kRowSize, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
dst_width * 2);
ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
src_argb += row_stride;
dst_argb += dst_stride;
@ -137,11 +183,18 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
// ScaleARGB ARGB Even
// This is an optimized version for scaling an ARGB image down to an even
// multiple of its original size.
static void ScaleARGBDownEven(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy,
static void ScaleARGBDownEven(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
int col_step = dx >> 16;
@ -149,26 +202,38 @@ static void ScaleARGBDownEven(int src_width, int src_height,
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
int src_step, uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
(void)src_width;
(void)src_height;
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
ScaleARGBRowDownEven_Any_SSE2;
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
: ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
ScaleARGBRowDownEven =
filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
ScaleARGBRowDownEven_Any_NEON;
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
: ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
ScaleARGBRowDownEven =
filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
: ScaleARGBRowDownEven_Any_MSA;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven =
filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
}
}
#endif
@ -184,25 +249,32 @@ static void ScaleARGBDownEven(int src_width, int src_height,
}
// Scale ARGB down with bilinear interpolation.
static void ScaleARGBBilinearDown(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy,
static void ScaleARGBBilinearDown(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
int64 xlast = x + (int64)(dst_width - 1) * dx;
int64 xl = (dx >= 0) ? x : xlast;
int64 xr = (dx >= 0) ? xlast : x;
int clip_src_width;
xl = (xl >> 16) & ~3; // Left edge aligned.
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
xl = (xl >> 16) & ~3; // Left edge aligned.
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
if (xr > src_width) {
xr = src_width;
@ -235,14 +307,22 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(clip_src_width, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@ -286,18 +366,25 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
// Scale ARGB up with bilinear interpolation.
static void ScaleARGBBilinearUp(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy,
static void ScaleARGBBilinearUp(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSSE3)
@ -325,14 +412,22 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
if (src_width >= 32768) {
ScaleARGBFilterCols = filtering ?
ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
ScaleARGBFilterCols =
filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
}
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@ -423,8 +518,10 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
#ifdef YUVSCALEUP
// Scale YUV to ARGB up with bilinear interpolation.
static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
int dst_width, int dst_height,
static void ScaleYUVToARGBBilinearUp(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride_y,
int src_stride_u,
int src_stride_v,
@ -433,14 +530,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int x, int dx, int y, int dy,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf, int width) =
I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@ -474,10 +572,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@ -503,18 +609,26 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
if (src_width >= 32768) {
ScaleARGBFilterCols = filtering ?
ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
ScaleARGBFilterCols =
filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
}
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@ -635,15 +749,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
// of x and dx is the integer part of the source position and
// the lower 16 bits are the fixed decimal part.
static void ScaleARGBSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy) {
static void ScaleARGBSimple(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int dx,
int y,
int dy) {
int j;
void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
void (*ScaleARGBCols)(uint8 * dst_argb, const uint8* src_argb, int dst_width,
int x, int dx) =
(src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
(void)src_height;
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBCols = ScaleARGBCols_SSE2;
@ -667,8 +789,8 @@ static void ScaleARGBSimple(int src_width, int src_height,
}
for (j = 0; j < dst_height; ++j) {
ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
dst_width, x, dx);
ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
dx);
dst_argb += dst_stride;
y += dy;
}
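// The 16.16 fixed-point walk used by ScaleARGBSimple: each destination pixel point-samples
// src_argb[x >> 16] and then advances x by dx, e.g. x = 0x18000 is source position 1.5 and
// dx = 0x20000 steps two source pixels per output pixel. A scalar sketch of the column
// kernel (scale_argb_cols_sketch is a hypothetical illustration of ScaleARGBCols_C):
static void scale_argb_cols_sketch(uint8* dst_argb, const uint8* src_argb,
                                   int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // copy one whole ARGB pixel
    x += dx;
  }
}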
@ -677,11 +799,18 @@ static void ScaleARGBSimple(int src_width, int src_height,
// ScaleARGB: scale an ARGB image.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
static void ScaleARGB(const uint8* src, int src_stride,
int src_width, int src_height,
uint8* dst, int dst_stride,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
static void ScaleARGB(const uint8* src,
int src_stride,
int src_width,
int src_height,
uint8* dst,
int dst_stride,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@ -690,8 +819,7 @@ static void ScaleARGB(const uint8* src, int src_stride,
int dy = 0;
// ARGB does not support box filter yet, but allow the user to pass it.
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height,
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
// Negative src_height means invert the image.
@ -700,17 +828,17 @@ static void ScaleARGB(const uint8* src, int src_stride,
src = src + (src_height - 1) * src_stride;
src_stride = -src_stride;
}
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
&x, &y, &dx, &dy);
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
&dx, &dy);
src_width = Abs(src_width);
if (clip_x) {
int64 clipf = (int64)(clip_x) * dx;
int64 clipf = (int64)(clip_x)*dx;
x += (clipf & 0xffff);
src += (clipf >> 16) * 4;
dst += clip_x * 4;
}
if (clip_y) {
int64 clipf = (int64)(clip_y) * dy;
int64 clipf = (int64)(clip_y)*dy;
y += (clipf & 0xffff);
src += (clipf >> 16) * src_stride;
dst += clip_y * dst_stride;
@ -725,24 +853,20 @@ static void ScaleARGB(const uint8* src, int src_stride,
if (!(dx & 0x10000) && !(dy & 0x10000)) {
if (dx == 0x20000) {
// Optimized 1/2 downsample.
ScaleARGBDown2(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
return;
}
if (dx == 0x40000 && filtering == kFilterBox) {
// Optimized 1/4 box downsample.
ScaleARGBDown4Box(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy);
ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy);
return;
}
ScaleARGBDownEven(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
return;
}
// Optimized odd scale down. ie 3, 5, 7, 9x.
@ -759,96 +883,105 @@ static void ScaleARGB(const uint8* src, int src_stride,
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, y, dy, 4, filtering);
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
if (filtering && dy < 65536) {
ScaleARGBBilinearUp(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
return;
}
if (filtering) {
ScaleARGBBilinearDown(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
return;
}
ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy);
ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, dx, y, dy);
}
LIBYUV_API
int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
int ARGBScaleClip(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 ||
!dst_argb || dst_width <= 0 || dst_height <= 0 ||
clip_x < 0 || clip_y < 0 ||
if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) {
return -1;
}
ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
dst_argb, dst_stride_argb, dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height, filtering);
ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
clip_height, filtering);
return 0;
}
// Scale an ARGB image.
LIBYUV_API
int ARGBScale(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
int ARGBScale(const uint8* src_argb,
int src_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_argb || dst_width <= 0 || dst_height <= 0) {
if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
dst_argb, dst_stride_argb, dst_width, dst_height,
0, 0, dst_width, dst_height, filtering);
ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
filtering);
return 0;
}
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
int YUVToARGBScaleClip(const uint8* src_y,
int src_stride_y,
const uint8* src_u,
int src_stride_u,
const uint8* src_v,
int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int src_width,
int src_height,
uint8* dst_argb,
int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
int dst_width,
int dst_height,
int clip_x,
int clip_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
(void)src_fourcc; // TODO(fbarchard): implement and/or assert.
(void)dst_fourcc;
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
argb_buffer, src_width * 4, src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
clip_width, clip_height, filtering);
free(argb_buffer);
return r;
}


@ -28,9 +28,12 @@ static __inline int Abs(int v) {
}
// CPU agnostic row functions
void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[1];
dst[1] = src_ptr[3];
@ -42,9 +45,12 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
void ScaleRowDown2_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[1];
dst[1] = src_ptr[3];
@ -56,10 +62,13 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Linear_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
const uint8* s = src_ptr;
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + 1) >> 1;
dst[1] = (s[2] + s[3] + 1) >> 1;
@ -71,10 +80,13 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
const uint16* s = src_ptr;
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + 1) >> 1;
dst[1] = (s[2] + s[3] + 1) >> 1;
@ -86,8 +98,10 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@ -103,8 +117,10 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@ -125,8 +141,10 @@ void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
dst[0] = (s[0] + t[0] + 1) >> 1;
}
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
void ScaleRowDown2Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@ -142,9 +160,12 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown4_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[2];
dst[1] = src_ptr[6];
@ -156,9 +177,12 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
void ScaleRowDown4_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[2];
dst[1] = src_ptr[6];
@ -170,81 +194,88 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown4Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[stride + 4] + src_ptr[stride + 5] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
8) >> 4;
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
src_ptr[stride * 3 + 7] + 8) >>
4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
}
}
void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
void ScaleRowDown4Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[stride + 4] + src_ptr[stride + 5] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
8) >> 4;
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
src_ptr[stride * 3 + 7] + 8) >>
4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
}
}
void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown34_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
(void)src_stride;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
@ -255,9 +286,12 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
void ScaleRowDown34_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
int x;
(void)src_stride;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
@ -269,8 +303,10 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Filter rows 0 and 1 together, 3 : 1
void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@ -291,8 +327,10 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width) {
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* d,
int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@ -314,8 +352,10 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Filter rows 1 and 2 together, 1 : 1
void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@ -336,8 +376,10 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width) {
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* d,
int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@ -359,8 +401,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleCols_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[0] = src_ptr[x >> 16];
@ -374,8 +419,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
void ScaleCols_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[0] = src_ptr[x >> 16];
@ -390,9 +438,14 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
// Scales a single row of pixels up by 2x using point sampling.
void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleColsUp2_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int j;
(void)x;
(void)dx;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[1] = dst_ptr[0] = src_ptr[0];
src_ptr += 1;
@ -403,9 +456,14 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
void ScaleColsUp2_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx) {
int j;
(void)x;
(void)dx;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[1] = dst_ptr[0] = src_ptr[0];
src_ptr += 1;
@ -417,11 +475,20 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
// (1-f)a + fb can be replaced with a + f(b-a)
#define BLENDER(a, b, f) (uint8)((int)(a) + \
((int)(f) * ((int)(b) - (int)(a)) >> 16))
#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) \
(uint8)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
// Intel uses 7 bit math with rounding.
#define BLENDER(a, b, f) \
(uint8)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
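
// Illustrative sketch (not libyuv source): both BLENDER variants compute
// a + f * (b - a) with f a 16.16 fraction; the x86 form reduces f to 7 bits
// so the C path matches the pmaddubsw-based SIMD paths.  Plain-function
// equivalents, with example values chosen only to show the rounding:
static uint8 BlendArmStyle(uint8 a, uint8 b, int f) {
  return (uint8)((int)a + ((f * ((int)b - (int)a) + 0x8000) >> 16));
}
static uint8 BlendIntelStyle(uint8 a, uint8 b, int f) {
  return (uint8)((int)a + (((f >> 9) * ((int)b - (int)a) + 0x40) >> 7));
}
// BlendArmStyle(0, 255, 0x8000) == BlendIntelStyle(0, 255, 0x8000) == 128,
// but the x86 form ignores the low 9 bits of f, so the two can differ by a
// small amount for some fractions.
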
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleFilterCols_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
@ -444,8 +511,11 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x32, int dx) {
void ScaleFilterCols64_C(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x32,
int dx) {
int64 x = (int64)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
@ -470,11 +540,15 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
}
#undef BLENDER
#define BLENDER(a, b, f) (uint16)((int)(a) + \
((int)(f) * ((int)(b) - (int)(a)) >> 16))
// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) \
(uint16)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
void ScaleFilterCols_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x,
int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
@ -497,8 +571,11 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
}
void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x32, int dx) {
void ScaleFilterCols64_16_C(uint16* dst_ptr,
const uint16* src_ptr,
int dst_width,
int x32,
int dx) {
int64 x = (int64)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
@ -523,9 +600,12 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
#undef BLENDER
void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown38_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
int x;
(void)src_stride;
assert(dst_width % 3 == 0);
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
@ -536,9 +616,12 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
void ScaleRowDown38_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst,
int dst_width) {
int x;
(void)src_stride;
assert(dst_width % 3 == 0);
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
@ -552,25 +635,29 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
// 8x3 -> 3x1
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -578,66 +665,80 @@ void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) {
uint16* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
}
// 8x2 -> 3x1
void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
}
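
// Illustrative sketch (not libyuv source): the (65536 / 9), (65536 / 6) and
// (65536 / 4) factors above average a pixel sum without a divide, by
// multiplying with a 16.16 reciprocal (truncated for /9 and /6) and shifting
// right by 16:
static uint8 AverageOfNine(int sum_of_nine_pixels) {
  return (uint8)((sum_of_nine_pixels * (65536 / 9)) >> 16);  // ~ sum / 9
}
// e.g. AverageOfNine(9 * 200) == 199; the truncated reciprocal can land one
// below the exact mean.
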
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) {
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -673,11 +774,12 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src[1];
dst[1] = src[3];
@ -691,8 +793,10 @@ void ScaleARGBRowDown2_C(const uint8* src_argb,
void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
int x;
(void)src_stride;
for (x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
@ -703,29 +807,37 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
}
}
void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
void ScaleARGBRowDown2Box_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] +
src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
dst_argb[1] = (src_argb[1] + src_argb[5] +
src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
dst_argb[2] = (src_argb[2] + src_argb[6] +
src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
dst_argb[3] = (src_argb[3] + src_argb[7] +
src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
src_argb[src_stride + 4] + 2) >>
2;
dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
src_argb[src_stride + 5] + 2) >>
2;
dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
src_argb[src_stride + 6] + 2) >>
2;
dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
src_argb[src_stride + 7] + 2) >>
2;
src_argb += 8;
dst_argb += 4;
}
}
void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEven_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
(void)src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src[0];
@ -741,25 +853,33 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] +
src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
dst_argb[1] = (src_argb[1] + src_argb[5] +
src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
dst_argb[2] = (src_argb[2] + src_argb[6] +
src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
dst_argb[3] = (src_argb[3] + src_argb[7] +
src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
src_argb[src_stride + 4] + 2) >>
2;
dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
src_argb[src_stride + 5] + 2) >>
2;
dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
src_argb[src_stride + 6] + 2) >>
2;
dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
src_argb[src_stride + 7] + 2) >>
2;
src_argb += src_stepx * 4;
dst_argb += 4;
}
}
// Scales a single row of pixels using point sampling.
void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBCols_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
@ -775,8 +895,11 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
}
}
void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x32, int dx) {
void ScaleARGBCols64_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x32,
int dx) {
int64 x = (int64)(x32);
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
@ -794,11 +917,16 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
}
// Scales a single row of pixels up by 2x using point sampling.
void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBColsUp2_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
(void)x;
(void)dx;
for (j = 0; j < dst_width - 1; j += 2) {
dst[1] = dst[0] = src[0];
src += 1;
@ -809,16 +937,20 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
}
}
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \
BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
#define BLENDER(a, b, f) \
BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
#define BLENDERC(a, b, f, s) \
(uint32)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
#define BLENDER(a, b, f) \
BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
BLENDERC(a, b, f, 0)
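
// Illustrative sketch (not libyuv source): BLENDER here operates on a packed
// ARGB pixel held in a uint32.  BLENDER1 blends one 8-bit channel with a
// 7-bit fraction f (0..127, where 0x7f ^ f == 127 - f), BLENDERC shifts that
// channel back into place, and BLENDER ORs the four channels together.  An
// equivalent loop form:
static uint32 BlendArgbPixels(uint32 a, uint32 b, int f) {
  uint32 out = 0;
  int s;
  for (s = 0; s < 32; s += 8) {
    int ca = (int)((a >> s) & 255);
    int cb = (int)((b >> s) & 255);
    out |= ((uint32)((ca * (0x7f ^ f) + cb * f) >> 7)) << s;
  }
  return out;
}
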
void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBFilterCols_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
@ -846,8 +978,11 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
}
}
void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x32, int dx) {
void ScaleARGBFilterCols64_C(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x32,
int dx) {
int64 x = (int64)(x32);
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
@ -881,16 +1016,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int y, int dy,
int bpp, enum FilterMode filtering) {
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8* src_argb,
uint8* dst_argb,
int x,
int y,
int dy,
int bpp,
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher bpp.
int dst_width_bytes = dst_width * bpp;
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
int j;
assert(bpp >= 1 && bpp <= 4);
@ -923,14 +1064,22 @@ void ScalePlaneVertical(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
int yi;
@ -940,23 +1089,29 @@ void ScalePlaneVertical(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * src_stride,
src_stride, dst_width_bytes, yf);
InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
dst_width_bytes, yf);
dst_argb += dst_stride;
y += dy;
}
}
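
// Illustrative sketch (not libyuv source): ScalePlaneVertical selects one row
// function up front and calls it per output row.  The "_Any_" variants accept
// arbitrary widths (SIMD body plus a C tail loop); the plain SIMD variants are
// only chosen when the width passes the IS_ALIGNED test.  Hypothetical
// stand-in for the selection pattern:
typedef void (*RowInterpFn)(uint8* dst, const uint8* src, ptrdiff_t src_stride,
                            int width, int source_y_fraction);
static RowInterpFn ChooseRowInterp(int has_simd, int width_is_aligned,
                                   RowInterpFn c_fn, RowInterpFn any_fn,
                                   RowInterpFn aligned_fn) {
  if (!has_simd) {
    return c_fn;  // portable C fallback
  }
  return width_is_aligned ? aligned_fn : any_fn;
}
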
void ScalePlaneVertical_16(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_argb, uint16* dst_argb,
int x, int y, int dy,
int wpp, enum FilterMode filtering) {
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16* src_argb,
uint16* dst_argb,
int x,
int y,
int dy,
int wpp,
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_16_C;
void (*InterpolateRow)(uint16 * dst_argb, const uint16* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
int j;
assert(wpp >= 1 && wpp <= 2);
@ -997,9 +1152,9 @@ void ScalePlaneVertical_16(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_16_DSPR2;
@ -1014,16 +1169,18 @@ void ScalePlaneVertical_16(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * src_stride,
src_stride, dst_width_words, yf);
InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
}
}
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
int dst_width, int dst_height,
enum FilterMode ScaleFilterReduce(int src_width,
int src_height,
int dst_width,
int dst_height,
enum FilterMode filtering) {
if (src_width < 0) {
src_width = -src_width;
@ -1070,17 +1227,21 @@ int FixedDiv_C(int num, int div) {
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div) {
return (int)((((int64)(num) << 16) - 0x00010001) /
(div - 1));
return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}
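
// Illustrative sketch (not libyuv source): the scalers step through the source
// in 16.16 fixed point, so a step is just (src << 16) / dst.  For example,
// scaling 640 source pixels to 480 destination pixels gives a step of 0x15555
// (about 1.333 source pixels per output pixel); x >> 16 is the source index
// and the low 16 bits feed the filter fraction.
static int ExampleFixedStep(int src_length, int dst_length) {
  return (int)(((int64)src_length << 16) / dst_length);
}
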
#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
// Compute slope values for stepping.
void ScaleSlope(int src_width, int src_height,
int dst_width, int dst_height,
void ScaleSlope(int src_width,
int src_height,
int dst_width,
int dst_height,
enum FilterMode filtering,
int* x, int* y, int* dx, int* dy) {
int* x,
int* y,
int* dx,
int* dy) {
assert(x != NULL);
assert(y != NULL);
assert(dx != NULL);
@ -1112,7 +1273,7 @@ void ScaleSlope(int src_width, int src_height,
*x = 0;
}
if (dst_height <= src_height) {
*dy = FixedDiv(src_height, dst_height);
*dy = FixedDiv(src_height, dst_height);
*y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
} else if (dst_height > 1) {
*dy = FixedDiv1(src_height, dst_height);

View File

@ -17,168 +17,167 @@ extern "C" {
#endif
// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 4 \n" // iterations -> by 16
"beqz $t9, 2f \n"
" nop \n"
"srl $t9, %[dst_width], 4 \n" // iterations -> by 16
"beqz $t9, 2f \n"
" nop \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
"precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t8, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t1, 8(%[dst]) \n"
"sw $t2, 12(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1|
"precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9|
"precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17|
"precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t8, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t1, 8(%[dst]) \n"
"sw $t2, 12(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"2: \n"
"andi $t9, %[dst_width], 0xf \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"2: \n"
"andi $t9, %[dst_width], 0xf \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lbu $t0, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 2 \n"
"addiu $t9, $t9, -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"21: \n"
"lbu $t0, 1(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 2 \n"
"addiu $t9, $t9, -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
"3: \n"
".set pop \n"
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
: [dst_width] "r"(dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
const uint8* t = src_ptr + src_stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 3 \n" // iterations -> step 8
"bltz $t9, 2f \n"
" nop \n"
"srl $t9, %[dst_width], 3 \n" // iterations -> step 8
"bltz $t9, 2f \n"
" nop \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 0(%[t]) \n" // |19|18|17|16|
"lw $t5, 4(%[t]) \n" // |23|22|21|20|
"lw $t6, 8(%[t]) \n" // |27|26|25|24|
"lw $t7, 12(%[t]) \n" // |31|30|29|28|
"addiu $t9, $t9, -1 \n"
"srl $t8, $t0, 16 \n" // |X|X|3|2|
"ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
"ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
"raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
"raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
"shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
"shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
"srl $t8, $t1, 16 \n" // |X|X|7|6|
"ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
"ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
"raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
"raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
"shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
"shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
"srl $t8, $t2, 16 \n" // |X|X|11|10|
"ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
"ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
"raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
"raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
"shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
"shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
"srl $t8, $t3, 16 \n" // |X|X|15|14|
"ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
"ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
"raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
"raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
"shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
"shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
"addiu %[src_ptr], %[src_ptr], 16 \n"
"addiu %[t], %[t], 16 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"sb $t1, 2(%[dst]) \n"
"sb $t5, 3(%[dst]) \n"
"sb $t2, 4(%[dst]) \n"
"sb $t6, 5(%[dst]) \n"
"sb $t3, 6(%[dst]) \n"
"sb $t7, 7(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 0(%[t]) \n" // |19|18|17|16|
"lw $t5, 4(%[t]) \n" // |23|22|21|20|
"lw $t6, 8(%[t]) \n" // |27|26|25|24|
"lw $t7, 12(%[t]) \n" // |31|30|29|28|
"addiu $t9, $t9, -1 \n"
"srl $t8, $t0, 16 \n" // |X|X|3|2|
"ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
"ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
"raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
"raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
"shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
"shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
"srl $t8, $t1, 16 \n" // |X|X|7|6|
"ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
"ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
"raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
"raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
"shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
"shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
"srl $t8, $t2, 16 \n" // |X|X|11|10|
"ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
"ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
"raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
"raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
"shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
"shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
"srl $t8, $t3, 16 \n" // |X|X|15|14|
"ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
"ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
"raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
"raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
"shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
"shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
"addiu %[src_ptr], %[src_ptr], 16 \n"
"addiu %[t], %[t], 16 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"sb $t1, 2(%[dst]) \n"
"sb $t5, 3(%[dst]) \n"
"sb $t2, 4(%[dst]) \n"
"sb $t6, 5(%[dst]) \n"
"sb $t3, 6(%[dst]) \n"
"sb $t7, 7(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"andi $t9, %[dst_width], 0x7 \n" // x = residue
"beqz $t9, 3f \n"
" nop \n"
"2: \n"
"andi $t9, %[dst_width], 0x7 \n" // x = residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lwr $t1, 0(%[src_ptr]) \n"
"lwl $t1, 3(%[src_ptr]) \n"
"lwr $t2, 0(%[t]) \n"
"lwl $t2, 3(%[t]) \n"
"srl $t8, $t1, 16 \n"
"ins $t1, $t2, 16, 16 \n"
"ins $t2, $t8, 0, 16 \n"
"raddu.w.qb $t1, $t1 \n"
"raddu.w.qb $t2, $t2 \n"
"shra_r.w $t1, $t1, 2 \n"
"shra_r.w $t2, $t2, 2 \n"
"sb $t1, 0(%[dst]) \n"
"sb $t2, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -2 \n"
"addiu %[t], %[t], 4 \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 2 \n"
"21: \n"
"lwr $t1, 0(%[src_ptr]) \n"
"lwl $t1, 3(%[src_ptr]) \n"
"lwr $t2, 0(%[t]) \n"
"lwl $t2, 3(%[t]) \n"
"srl $t8, $t1, 16 \n"
"ins $t1, $t2, 16, 16 \n"
"ins $t2, $t8, 0, 16 \n"
"raddu.w.qb $t1, $t1 \n"
"raddu.w.qb $t2, $t2 \n"
"shra_r.w $t1, $t1, 2 \n"
"shra_r.w $t2, $t2, 2 \n"
"sb $t1, 0(%[dst]) \n"
"sb $t2, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -2 \n"
"addiu %[t], %[t], 4 \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 2 \n"
"3: \n"
".set pop \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst), [t] "+r" (t)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
: [dst_width] "r"(dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown4_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
@ -186,7 +185,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
"1: \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
@ -199,8 +198,8 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
"precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
"precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
"precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
"precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
"precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2|
"precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t1, 0(%[dst]) \n"
@ -208,44 +207,43 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"2: \n"
"andi $t9, %[dst_width], 7 \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lbu $t1, 0(%[src_ptr]) \n"
"21: \n"
"lbu $t1, 2(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -1 \n"
"sb $t1, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
: [dst_width] "r"(dst_width)
: "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
const uint8* s3 = s2 + stride;
__asm__ __volatile__ (
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
@ -299,23 +297,20 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"2: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[s1] "+r" (s1),
[s2] "+r" (s2),
[s3] "+r" (s3)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
[s3] "+r"(s3)
: [dst_width] "r"(dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown34_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"1: \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
@ -347,23 +342,21 @@ void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bnez %[dst_width], 1b \n"
" addiu %[dst], %[dst], 24 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
@ -400,26 +393,24 @@ void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
: [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
[dst_width] "+r"(dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
: "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* d,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
@ -452,25 +443,23 @@ void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
: [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
[dst_width] "+r"(dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
: "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
void ScaleRowDown38_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
@ -501,26 +490,24 @@ void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgez $t8, 1b \n"
" addiu %[dst], %[dst], 12 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
: [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
:
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
__asm__ __volatile__ (
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
@ -554,18 +541,16 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[t] "+r" (t),
[dst_width] "+r" (dst_width)
: [c] "r" (c)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6"
);
: [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
[dst_width] "+r"(dst_width)
: [c] "r"(c)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
@ -573,11 +558,11 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
const int c1 = 0x1C71;
const int c2 = 0x2AAA;
__asm__ __volatile__ (
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"1: \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
@ -624,15 +609,55 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[s1] "+r" (s1),
[s2] "+r" (s2),
[dst_width] "+r" (dst_width)
: [c1] "r" (c1), [c2] "r" (c2)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
: [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
[s2] "+r"(s2), [dst_width] "+r"(dst_width)
: [c1] "r"(c1), [c2] "r"(c2)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
int x;
for (x = 0; x < ((src_width - 1)); x += 8) {
uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"lw %[tmp_t5], 0(%[src_ptr]) \n"
"lw %[tmp_t6], 4(%[src_ptr]) \n"
"lw %[tmp_t1], 0(%[dst_ptr]) \n"
"lw %[tmp_t2], 4(%[dst_ptr]) \n"
"lw %[tmp_t3], 8(%[dst_ptr]) \n"
"lw %[tmp_t4], 12(%[dst_ptr]) \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"
"preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"
"addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
"addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
"preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
"preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
"addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
"addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
"sw %[tmp_t1], 0(%[dst_ptr]) \n"
"sw %[tmp_t2], 4(%[dst_ptr]) \n"
"sw %[tmp_t3], 8(%[dst_ptr]) \n"
"sw %[tmp_t4], 12(%[dst_ptr]) \n"
".set pop \n"
:
[tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
[tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
[tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
: [dst_ptr] "r"(dst_ptr));
src_ptr += 8;
dst_ptr += 8;
}
if ((src_width)&7) {
for (x = 0; x < ((src_width - 1) & 7); x += 1) {
dst_ptr[0] += src_ptr[0];
src_ptr += 1;
dst_ptr += 1;
}
}
}
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@ -641,4 +666,3 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
} // extern "C"
} // namespace libyuv
#endif

View File

@ -21,85 +21,82 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
{ 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
{ 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
{ 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
{ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
{ 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
{ 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
10, 11, 12, 13, 13, 14, 14, 15};
// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
{ 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
{ 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
{ 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
// Coefficients for source bytes 21 to 31
static vec16 kRound34 =
{ 2, 2, 2, 2, 2, 2, 2, 2 };
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
static uvec8 kShuf38a =
{ 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128};
static uvec8 kShuf38b =
{ 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
6, 8, 11, 14, 128, 128, 128, 128};
// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
{ 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128};
// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
{ 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
6, 7, 12, 13, 128, 128, 128, 128};
// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
{ 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
65536 / 9, 65536 / 6, 0, 0};
// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
{ 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
11, 128, 14, 128, 128, 128, 128, 128};
// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
{ 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
12, 128, 15, 128, 128, 128, 128, 128};
// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
{ 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
13, 128, 128, 128, 128, 128, 128, 128};
// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
65536 / 3, 65536 / 2, 0, 0};
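
// Illustrative note added for clarity (not libyuv source): in the shuffle
// tables above, an index byte of 128 has its high bit set, which makes pshufb
// write zero into that output lane; 128 therefore means "clear this byte",
// not a real source offset.
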
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
LABELALIGN
"1: \n"
@ -120,8 +117,11 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
@ -149,8 +149,10 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
@ -189,8 +191,11 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
LABELALIGN
"1: \n"
@ -213,8 +218,11 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@ -244,8 +252,10 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@ -286,8 +296,11 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
#endif // HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
@ -314,8 +327,10 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
@ -368,10 +383,12 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
@ -400,8 +417,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@ -455,17 +474,20 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
#endif // HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
void ScaleRowDown34_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
asm volatile (
LABELALIGN
"1: \n"
@ -492,25 +514,26 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile (
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile (
LABELALIGN
"1: \n"
@ -557,25 +580,26 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile (
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile (
LABELALIGN
@ -624,8 +648,11 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
);
}
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
@ -655,18 +682,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
asm volatile (
LABELALIGN
"1: \n"
@ -700,17 +728,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
uint8* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
asm volatile (
LABELALIGN
"1: \n"
@ -790,7 +819,6 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
);
}
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
@ -821,9 +849,21 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
}
#endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
0x4040, 0x4040, 0x4040, 0x4040};
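
// Illustrative note added for clarity (not libyuv source): pmaddubsw
// multiplies unsigned bytes (the filter weights here) by signed bytes (the
// pixels), so the pixels are first biased with kFsub80: p' = p - 128.  With
// weights f and 128 - f the product sum is f*a + (128 - f)*b - 128*128, and
// kFadd40 = 0x4040 = 128*128 + 64 restores the bias and adds the 0.5 rounding
// term before the final ">> 7".
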
// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleFilterCols_SSSE3(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile (
"movd %6,%%xmm2 \n"
@ -831,7 +871,10 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n" // 0x007f007f
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n" // 0x00010001
"pextrw $0x1,%%xmm2,%k3 \n"
"subl $0x2,%5 \n"
"jl 29f \n"
@ -853,16 +896,19 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm0 \n"
"pxor %%xmm6,%%xmm1 \n"
"pmaddubsw %%xmm1,%%xmm0 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + 1
"paddusb %%xmm7,%%xmm1 \n"
"pmaddubsw %%xmm0,%%xmm1 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"pextrw $0x3,%%xmm2,%k4 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%k2 \n"
"paddw %9,%%xmm1 \n" // make pixels unsigned.
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movd %%xmm1,%k2 \n"
"mov %w2," MEMACCESS(0) " \n"
"lea " MEMLEA(0x2,0) ",%0 \n"
"sub $0x2,%5 \n"
"subl $0x2,%5 \n"
"jge 2b \n"
LABELALIGN
@ -873,11 +919,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm2 \n"
"pshufb %%xmm5,%%xmm2 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm2 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%k2 \n"
"paddusb %%xmm7,%%xmm2 \n"
"pmaddubsw %%xmm0,%%xmm2 \n"
"paddw %9,%%xmm2 \n" // make pixels unsigned.
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,%k2 \n"
"mov %b2," MEMACCESS(0) " \n"
"99: \n"
: "+r"(dst_ptr), // %0
@ -885,18 +934,34 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"=&a"(temp_pixel), // %2
"=&r"(x0), // %3
"=&r"(x1), // %4
#if defined(__x86_64__)
"+rm"(dst_width) // %5
#else
"+m"(dst_width) // %5
#endif
: "rm"(x), // %6
"rm"(dx) // %7
"rm"(dx), // %7
#if defined(__x86_64__)
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
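// Sketch only, not part of the vendored file: the scalar arithmetic that the
// kFsub80 / kFadd40 rewrite above implements. pmaddubsw multiplies one unsigned
// and one signed operand, so the pixels are biased to signed range first and the
// 128*128 bias is added back (together with the 0x40 rounding term) before the
// final shift. BlendPixelSketch is an invented name for illustration.
static inline uint8 BlendPixelSketch(uint8 a, uint8 b, int f /* 0..128 */) {
  int sum = (a - 128) * (128 - f) + (b - 128) * f;  // what pmaddubsw computes
  return (uint8)((sum + 0x4040) >> 7);  // == (a * (128 - f) + b * f + 64) >> 7
}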
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
void ScaleColsUp2_SSE2(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
(void)x;
(void)dx;
asm volatile (
LABELALIGN
"1: \n"
@ -920,7 +985,9 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (
LABELALIGN
"1: \n"
@ -941,7 +1008,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (
LABELALIGN
"1: \n"
@ -965,7 +1034,8 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@ -995,10 +1065,14 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb,
int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n"
"lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
@ -1029,8 +1103,10 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, int src_stepx,
uint8* dst_argb, int dst_width) {
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb,
int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
@ -1072,8 +1148,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
);
}
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBCols_SSE2(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
intptr_t x0, x1;
asm volatile (
"movd %5,%%xmm2 \n"
@ -1141,8 +1220,13 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
(void)x;
(void)dx;
asm volatile (
LABELALIGN
"1: \n"
@ -1167,26 +1251,29 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};
// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
intptr_t x0, x1;
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile(
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile (
"movd %5,%%xmm2 \n"
@ -1253,34 +1340,32 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
asm volatile (
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx"
);
asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
asm volatile (
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
"sbb $0x0,%%edx \n"
"sub $0x1,%1 \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx"
);
asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
"sbb $0x0,%%edx \n"
"sub $0x1,%1 \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
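// Sketch only, not part of the vendored file: the portable equivalent of the
// two 16.16 fixed-point divides above, for readers who do not want to trace
// the inline assembly. The *Sketch names are invented for illustration.
static inline int FixedDivSketch(int num, int div) {
  // (num << 16) / div, widened to 64 bits so the shift cannot overflow.
  return (int)(((int64)(num) << 16) / div);
}
static inline int FixedDiv1Sketch(int num, int div) {
  // Divide (num - 1) by (div - 1): (num << 16) - 0x10001 == ((num - 1) << 16) - 1.
  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}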

third_party/yuv/source/scale_msa.cc (new vendored file, 553 lines added)
View File

@ -0,0 +1,553 @@
/*
* Copyright 2016 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "libyuv/scale_row.h"
// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
v16u8 src0, src1, dst0;
(void)src_stride;
for (x = 0; x < dst_width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
ST_UB(dst0, dst_argb);
src_argb += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0;
(void)src_stride;
for (x = 0; x < dst_width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
ST_UB(dst0, dst_argb);
src_argb += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
int x;
const uint8_t* s = src_argb;
const uint8_t* t = src_argb + src_stride;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
v8u16 reg0, reg1, reg2, reg3;
v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
for (x = 0; x < dst_width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
reg0 = __msa_hadd_u_h(vec0, vec0);
reg1 = __msa_hadd_u_h(vec1, vec1);
reg2 = __msa_hadd_u_h(vec2, vec2);
reg3 = __msa_hadd_u_h(vec3, vec3);
reg0 += reg2;
reg1 += reg3;
reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
ST_UB(dst0, dst_argb);
s += 32;
t += 32;
dst_argb += 16;
}
}
void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_argb,
int dst_width) {
int x;
int32_t stepx = src_stepx * 4;
int32_t data0, data1, data2, data3;
(void)src_stride;
for (x = 0; x < dst_width; x += 4) {
data0 = LW(src_argb);
data1 = LW(src_argb + stepx);
data2 = LW(src_argb + stepx * 2);
data3 = LW(src_argb + stepx * 3);
SW(data0, dst_argb);
SW(data1, dst_argb + 4);
SW(data2, dst_argb + 8);
SW(data3, dst_argb + 12);
src_argb += stepx * 4;
dst_argb += 16;
}
}
void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb,
int dst_width) {
int x;
const uint8* nxt_argb = src_argb + src_stride;
int32_t stepx = src_stepx * 4;
int64_t data0, data1, data2, data3;
v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
v16u8 vec0, vec1, vec2, vec3;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 dst0;
for (x = 0; x < dst_width; x += 4) {
data0 = LD(src_argb);
data1 = LD(src_argb + stepx);
data2 = LD(src_argb + stepx * 2);
data3 = LD(src_argb + stepx * 3);
src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
data0 = LD(nxt_argb);
data1 = LD(nxt_argb + stepx);
data2 = LD(nxt_argb + stepx * 2);
data3 = LD(nxt_argb + stepx * 3);
src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
reg0 = __msa_hadd_u_h(vec0, vec0);
reg1 = __msa_hadd_u_h(vec1, vec1);
reg2 = __msa_hadd_u_h(vec2, vec2);
reg3 = __msa_hadd_u_h(vec3, vec3);
reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
reg4 += reg6;
reg5 += reg7;
reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
ST_UB(dst0, dst_argb);
src_argb += stepx * 4;
nxt_argb += stepx * 4;
dst_argb += 16;
}
}
void ScaleRowDown2_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
(void)src_stride;
for (x = 0; x < dst_width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
ST_UB2(dst0, dst1, dst, 16);
src_ptr += 64;
dst += 32;
}
}
void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
(void)src_stride;
for (x = 0; x < dst_width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
dst0 = __msa_aver_u_b(vec1, vec0);
dst1 = __msa_aver_u_b(vec3, vec2);
ST_UB2(dst0, dst1, dst, 16);
src_ptr += 64;
dst += 32;
}
}
void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
v8u16 vec0, vec1, vec2, vec3;
for (x = 0; x < dst_width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
vec0 = __msa_hadd_u_h(src0, src0);
vec1 = __msa_hadd_u_h(src1, src1);
vec2 = __msa_hadd_u_h(src2, src2);
vec3 = __msa_hadd_u_h(src3, src3);
vec0 += __msa_hadd_u_h(src4, src4);
vec1 += __msa_hadd_u_h(src5, src5);
vec2 += __msa_hadd_u_h(src6, src6);
vec3 += __msa_hadd_u_h(src7, src7);
vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
ST_UB2(dst0, dst1, dst, 16);
s += 64;
t += 64;
dst += 32;
}
}
void ScaleRowDown4_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
(void)src_stride;
for (x = 0; x < dst_width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst);
src_ptr += 64;
dst += 16;
}
}
void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x;
const uint8_t* s = src_ptr;
const uint8_t* t0 = s + src_stride;
const uint8_t* t1 = s + src_stride * 2;
const uint8_t* t2 = s + src_stride * 3;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
v8u16 vec0, vec1, vec2, vec3;
v4u32 reg0, reg1, reg2, reg3;
for (x = 0; x < dst_width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
vec0 = __msa_hadd_u_h(src0, src0);
vec1 = __msa_hadd_u_h(src1, src1);
vec2 = __msa_hadd_u_h(src2, src2);
vec3 = __msa_hadd_u_h(src3, src3);
vec0 += __msa_hadd_u_h(src4, src4);
vec1 += __msa_hadd_u_h(src5, src5);
vec2 += __msa_hadd_u_h(src6, src6);
vec3 += __msa_hadd_u_h(src7, src7);
src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
vec0 += __msa_hadd_u_h(src0, src0);
vec1 += __msa_hadd_u_h(src1, src1);
vec2 += __msa_hadd_u_h(src2, src2);
vec3 += __msa_hadd_u_h(src3, src3);
vec0 += __msa_hadd_u_h(src4, src4);
vec1 += __msa_hadd_u_h(src5, src5);
vec2 += __msa_hadd_u_h(src6, src6);
vec3 += __msa_hadd_u_h(src7, src7);
reg0 = __msa_hadd_u_w(vec0, vec0);
reg1 = __msa_hadd_u_w(vec1, vec1);
reg2 = __msa_hadd_u_w(vec2, vec2);
reg3 = __msa_hadd_u_w(vec3, vec3);
reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst);
s += 64;
t0 += 64;
t1 += 64;
t2 += 64;
dst += 16;
}
}
void ScaleRowDown38_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
int x, width;
uint64_t dst0;
uint32_t dst1;
v16u8 src0, src1, vec0;
v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
(void)src_stride;
assert(dst_width % 3 == 0);
width = dst_width / 3;
for (x = 0; x < width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
dst0 = __msa_copy_u_d((v2i64)vec0, 0);
dst1 = __msa_copy_u_w((v4i32)vec0, 2);
SD(dst0, dst);
SW(dst1, dst + 8);
src_ptr += 32;
dst += 12;
}
}
void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
int x, width;
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
uint64_t dst0;
uint32_t dst1;
v16u8 src0, src1, src2, src3, out;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
v8i16 zero = {0};
v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);
assert((dst_width % 3 == 0) && (dst_width > 0));
width = dst_width / 3;
for (x = 0; x < width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
tmp0 = __msa_hadd_u_w(vec4, vec4);
tmp1 = __msa_hadd_u_w(vec5, vec5);
tmp2 = __msa_hadd_u_w(vec6, vec6);
tmp3 = __msa_hadd_u_w(vec7, vec7);
tmp4 = __msa_hadd_u_w(vec0, vec0);
vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
tmp0 = __msa_hadd_u_w(vec0, vec0);
tmp1 = __msa_hadd_u_w(vec1, vec1);
tmp0 *= const_0x2AAA;
tmp1 *= const_0x2AAA;
tmp4 *= const_0x4000;
tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
dst0 = __msa_copy_u_d((v2i64)out, 0);
dst1 = __msa_copy_u_w((v4i32)out, 2);
SD(dst0, dst_ptr);
SW(dst1, dst_ptr + 8);
s += 32;
t += 32;
dst_ptr += 12;
}
}
void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
int x, width;
const uint8_t* s = src_ptr;
const uint8_t* t0 = s + src_stride;
const uint8_t* t1 = s + src_stride * 2;
uint64_t dst0;
uint32_t dst1;
v16u8 src0, src1, src2, src3, src4, src5, out;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
v8u16 zero = {0};
v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
assert((dst_width % 3 == 0) && (dst_width > 0));
width = dst_width / 3;
for (x = 0; x < width; x += 4) {
src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
tmp0 = __msa_hadd_u_w(vec4, vec4);
tmp1 = __msa_hadd_u_w(vec5, vec5);
tmp2 = __msa_hadd_u_w(vec6, vec6);
tmp3 = __msa_hadd_u_w(vec7, vec7);
tmp4 = __msa_hadd_u_w(vec0, vec0);
vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
tmp0 = __msa_hadd_u_w(vec0, vec0);
tmp1 = __msa_hadd_u_w(vec1, vec1);
tmp0 *= const_0x1C71;
tmp1 *= const_0x1C71;
tmp4 *= const_0x2AAA;
tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
dst0 = __msa_copy_u_d((v2i64)out, 0);
dst1 = __msa_copy_u_w((v4i32)out, 2);
SD(dst0, dst_ptr);
SW(dst1, dst_ptr + 8);
s += 32;
t0 += 32;
t1 += 32;
dst_ptr += 12;
}
}
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
int x;
v16u8 src0;
v8u16 dst0, dst1;
v16i8 zero = {0};
assert(src_width > 0);
for (x = 0; x < src_width; x += 16) {
src0 = LD_UB(src_ptr);
dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
ST_UH2(dst0, dst1, dst_ptr, 8);
src_ptr += 16;
dst_ptr += 16;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
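// Sketch only, not part of the vendored file: the scalar 2x2 box filter that
// ScaleRowDown2Box_MSA above vectorizes 32 output pixels at a time. The name
// is invented for illustration; rounding matches __msa_srari_h(..., 2).
static void ScaleRowDown2BoxSketch(const uint8_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint8_t* dst,
                                   int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // round to nearest
    s += 2;
    t += 2;
  }
}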

File diff suppressed because it is too large.

View File

@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/scale.h"
#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus
@ -21,15 +21,16 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 32x1 throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
(void)src_stride;
asm volatile (
"1: \n"
// load even pixels into v0, odd into v1
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -41,18 +42,19 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
(void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
"subs %w2, %w2, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // add adjacent
"uaddlp v1.8h, v1.16b \n"
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #1 \n"
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -64,15 +66,15 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
"ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
@ -81,7 +83,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #2 \n"
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -93,14 +94,15 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -111,20 +113,18 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown4Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr,
int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
"ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
"ld1 {v3.16b}, [%4], #16 \n"
"subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n"
@ -133,7 +133,6 @@ asm volatile (
"uadalp v0.8h, v3.16b \n"
"addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
MEMACCESS(1)
"st1 {v0.s}[0], [%1], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -152,15 +151,15 @@ asm volatile (
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -172,15 +171,14 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1
@ -216,8 +214,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"umlal v16.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v16.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -232,15 +229,14 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
asm volatile (
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n"
// average src line 0 with src line 1
"urhadd v0.8b, v0.8b, v4.8b \n"
@ -261,8 +257,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"umlal v4.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v4.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -273,32 +268,27 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
);
}
static uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
{ 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
static uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
34, 6, 22, 35, 0, 0, 0, 0};
static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
MEMACCESS(3)
"ld1 {v3.16b}, [%3] \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v2.s}[2], [%1], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -312,16 +302,14 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(5)
"ld1 {v29.8h}, [%5] \n"
MEMACCESS(6)
"ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
"ld1 {v31.8h}, [%7] \n"
"add %2, %2, %0 \n"
"1: \n"
@ -330,12 +318,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %w4, %w4, #12 \n"
// Shuffle the input data around to align the data
@ -419,9 +404,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
MEMACCESS(1)
"st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -441,13 +424,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
uint8* dst_ptr,
int dst_width) {
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(4)
"ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
"ld1 {v31.16b}, [%5] \n"
"add %2, %2, %0 \n"
"1: \n"
@ -456,10 +438,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"subs %w3, %w3, #12 \n"
// Shuffle the input data around to align the data
@ -529,9 +509,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
MEMACCESS(1)
"st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -545,8 +523,11 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
void ScaleAddRows_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr,
int src_width,
int src_height) {
const uint8* src_tmp;
asm volatile (
"1: \n"
@ -556,13 +537,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"eor v3.16b, v3.16b, v3.16b \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n"
"b.gt 2b \n"
MEMACCESS(2)
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
@ -578,23 +557,30 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}["#n"], [%6] \n"
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n"
// clang-format on
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
void ScaleFilterCols_NEON(uint8* dst_ptr,
const uint8* src_ptr,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
@ -626,12 +612,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
"ushll2 v6.4s, v6.8h, #0 \n"
"mul v16.4s, v16.4s, v7.4s \n"
"mul v17.4s, v17.4s, v6.4s \n"
"shrn v6.4h, v16.4s, #16 \n"
"shrn2 v6.8h, v17.4s, #16 \n"
"rshrn v6.4h, v16.4s, #16 \n"
"rshrn2 v6.8h, v17.4s, #16 \n"
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
MEMACCESS(0)
"st1 {v4.8b}, [%0], #8 \n" // store pixels
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
@ -639,7 +624,7 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
"b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width64), // %2
"+r"(dst_width), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
@ -654,9 +639,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
const uint8* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile (
"cmp %w4, #0 \n"
"b.eq 100f \n"
@ -672,9 +659,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n"
@ -683,63 +668,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"umlal2 v7.8h, v1.16b, v5.16b \n"
"rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n"
"99: \n"
MEMACCESS(0)
"st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@ -752,19 +724,18 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
);
}
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
(void)src_stride;
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS (0)
"ld2 {v0.4s, v1.4s}, [%0], #32 \n"
MEMACCESS (0)
"ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS (1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
MEMACCESS (1)
"st1 {v3.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r" (src_ptr), // %0
@ -775,11 +746,13 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (
"1: \n"
MEMACCESS (0)
// load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
@ -791,7 +764,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"rshrn v1.8b, v1.8h, #1 \n"
"rshrn v2.8b, v2.8h, #1 \n"
"rshrn v3.8b, v3.8h, #1 \n"
MEMACCESS (1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
@ -802,20 +774,20 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst,
int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS (0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
MEMACCESS (1)
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
@ -825,7 +797,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
MEMACCESS (2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r" (src_ptr), // %0
@ -839,20 +810,19 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
@ -867,27 +837,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
uint8* dst_argb,
int dst_width) {
asm volatile (
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
"ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v7.8b}, [%1], %4 \n"
"uaddl v0.8h, v0.8b, v1.8b \n"
"uaddl v2.8h, v2.8b, v3.8b \n"
@ -904,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
@ -916,21 +879,24 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {"#vn".s}["#n"], [%6] \n"
#define LOAD1_DATA32_LANE(vn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"ld1 {" #vn ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBCols_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
int64 tmp64;
asm volatile (
"1: \n"
@ -943,13 +909,12 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
MEMACCESS(0)
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width64), // %2
"+r"(dst_width), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"=&r"(tmp64), // %5
@ -961,23 +926,26 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
#undef LOAD1_DATA32_LANE
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
// clang-format on
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
void ScaleARGBFilterCols_NEON(uint8* dst_argb,
const uint8* src_argb,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 x64 = (int64)x;
int64 dx64 = (int64)dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
@ -1014,14 +982,13 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
MEMACCESS(0)
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width64), // %2
"+r"(dst_width), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5

File diff suppressed because it is too large.

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/video_common.h"
#ifdef __cplusplus
@ -24,24 +23,24 @@ struct FourCCAliasEntry {
};
static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
{FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
{FOURCC_HDYC, FOURCC_UYVY},
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
{FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB
{FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
{FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
{FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
{FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
{FOURCC_HDYC, FOURCC_UYVY},
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_RGB3, FOURCC_RAW},
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
{FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB
{FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
{FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
{FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
};
// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
@ -62,4 +61,3 @@ uint32 CanonicalFourCC(uint32 fourcc) {
} // extern "C"
} // namespace libyuv
#endif
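// Sketch only, not part of the vendored file: how the alias table above is
// consumed. The FourCCAliasEntry field names (alias, canonical) are assumed
// here for illustration; the real CanonicalFourCC() in the hunk above may
// differ in detail.
uint32 CanonicalFourCCSketch(uint32 fourcc) {
  int i;
  for (i = 0; i < (int)(sizeof(kFourCCAliases) / sizeof(kFourCCAliases[0])); ++i) {
    if (kFourCCAliases[i].alias == fourcc)
      return kFourCCAliases[i].canonical;  // map alias to canonical FOURCC
  }
  return fourcc;  // not an alias: already canonical
}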