Backed out changeset 65ee637b7e20 (bug 1284803)

Iris Hsiao 2016-07-26 10:22:51 +08:00
parent 601f1a35d5
commit 2841a7655a
79 changed files with 11873 additions and 16060 deletions

View File

@@ -8,8 +8,7 @@ LOCAL_CPP_EXTENSION := .cc
LOCAL_SRC_FILES := \
source/compare.cc \
source/compare_common.cc \
source/compare_neon64.cc \
source/compare_gcc.cc \
source/compare_posix.cc \
source/convert.cc \
source/convert_argb.cc \
source/convert_from.cc \
@@ -17,26 +16,20 @@ LOCAL_SRC_FILES := \
source/convert_to_argb.cc \
source/convert_to_i420.cc \
source/cpu_id.cc \
source/format_conversion.cc \
source/planar_functions.cc \
source/rotate.cc \
source/rotate_any.cc \
source/rotate_argb.cc \
source/rotate_common.cc \
source/rotate_mips.cc \
source/rotate_neon64.cc \
source/rotate_gcc.cc \
source/row_any.cc \
source/row_common.cc \
source/row_mips.cc \
source/row_neon64.cc \
source/row_gcc.cc \
source/row_posix.cc \
source/scale.cc \
source/scale_any.cc \
source/scale_argb.cc \
source/scale_common.cc \
source/scale_mips.cc \
source/scale_neon64.cc \
source/scale_gcc.cc \
source/scale_posix.cc \
source/video_common.cc
# TODO(fbarchard): Enable mjpeg encoder.

View File

@@ -1,42 +1,130 @@
use_relative_paths = True
vars = {
"libyuv_trunk" : "https://libyuv.googlecode.com/svn/trunk",
# Override root_dir in your .gclient's custom_vars to specify a custom root
# folder name.
'root_dir': 'libyuv',
'extra_gyp_flag': '-Dextra_gyp_flag=0',
'chromium_git': 'https://chromium.googlesource.com',
"root_dir": "trunk",
"extra_gyp_flag": "-Dextra_gyp_flag=0",
# Roll the Chromium Git hash to pick up newer versions of all the
# dependencies and tools linked to in setup_links.py.
'chromium_revision': '2a818f54130d8c93f81490adce5a1e87307bf5f0',
# Use this googlecode_url variable only if there is an internal mirror for it.
# If you do not know, use the full path while defining your new deps entry.
"googlecode_url": "http://%s.googlecode.com/svn",
"chromium_trunk" : "http://src.chromium.org/svn/trunk",
# chrome://version/ for revision of canary Chrome.
"chromium_revision": "232627",
}
# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
# https; the latter can cause problems for users behind proxies.
deps = {
Var('root_dir') + '/third_party/gflags/src':
Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca',
"../chromium_deps":
File(Var("chromium_trunk") + "/src/DEPS@" + Var("chromium_revision")),
"build":
Var("chromium_trunk") + "/src/build@" + Var("chromium_revision"),
# Needed by common.gypi.
"google_apis/build":
Var("chromium_trunk") + "/src/google_apis/build@" + Var("chromium_revision"),
"testing":
Var("chromium_trunk") + "/src/testing@" + Var("chromium_revision"),
"testing/gtest":
From("chromium_deps", "src/testing/gtest"),
"tools/clang":
Var("chromium_trunk") + "/src/tools/clang@" + Var("chromium_revision"),
"tools/gyp":
From("chromium_deps", "src/tools/gyp"),
"tools/python":
Var("chromium_trunk") + "/src/tools/python@" + Var("chromium_revision"),
"tools/valgrind":
Var("chromium_trunk") + "/src/tools/valgrind@" + Var("chromium_revision"),
# Needed by build/common.gypi.
"tools/win/supalink":
Var("chromium_trunk") + "/src/tools/win/supalink@" + Var("chromium_revision"),
"third_party/libjpeg_turbo":
From("chromium_deps", "src/third_party/libjpeg_turbo"),
# Yasm assembler required for libjpeg_turbo
"third_party/yasm":
Var("chromium_trunk") + "/src/third_party/yasm@" + Var("chromium_revision"),
"third_party/yasm/source/patched-yasm":
Var("chromium_trunk") + "/deps/third_party/yasm/patched-yasm@" + Var("chromium_revision"),
}
# Define rules for which include paths are allowed in our source.
include_rules = [ '+gflags' ]
deps_os = {
"win": {
# Use WebRTC's stripped-down version of Cygwin (required by GYP).
"third_party/cygwin":
(Var("googlecode_url") % "webrtc") + "/deps/third_party/cygwin@2672",
# Used by libjpeg-turbo.
# TODO(fbarchard): Remove binaries and run yasm from build folder.
"third_party/yasm/binaries":
Var("chromium_trunk") + "/deps/third_party/yasm/binaries@" + Var("chromium_revision"),
"third_party/yasm": None,
},
"unix": {
"third_party/gold":
From("chromium_deps", "src/third_party/gold"),
},
"android": {
"third_party/android_tools":
From("chromium_deps", "src/third_party/android_tools"),
"third_party/libjpeg":
From("chromium_deps", "src/third_party/libjpeg"),
},
"ios": {
# NSS, for SSLClientSocketNSS.
"third_party/nss":
From("chromium_deps", "src/third_party/nss"),
"net/third_party/nss":
Var("chromium_trunk") + "/src/net/third_party/nss@" + Var("chromium_revision"),
# class-dump utility to generate header files for undocumented SDKs.
"testing/iossim/third_party/class-dump":
From("chromium_deps", "src/testing/iossim/third_party/class-dump"),
# Helper for running under the simulator.
"testing/iossim":
Var("chromium_trunk") + "/src/testing/iossim@" + Var("chromium_revision"),
},
}
hooks = [
{
# Clone chromium and its deps.
'name': 'sync chromium',
'pattern': '.',
'action': ['python', '-u', Var('root_dir') + '/sync_chromium.py',
'--target-revision', Var('chromium_revision')],
},
{
# Create links to shared dependencies in Chromium.
'name': 'setup_links',
'pattern': '.',
'action': ['python', Var('root_dir') + '/setup_links.py'],
# Pull clang on mac. If nothing changed, or on non-mac platforms, this takes
# zero seconds to run. If something changed, it downloads a prebuilt clang.
"pattern": ".",
"action": ["python", Var("root_dir") + "/tools/clang/scripts/update.py",
"--mac-only"],
},
{
# A change to a .gyp, .gypi, or to GYP itself should run the generator.
'pattern': '.',
'action': ['python', Var('root_dir') + '/gyp_libyuv'],
"pattern": ".",
"action": ["python", Var("root_dir") + "/build/gyp_chromium",
"--depth=" + Var("root_dir"), Var("root_dir") + "/all.gyp",
Var("extra_gyp_flag")],
},
{
# Update the cygwin mount on Windows.
# This is necessary to get the correct mapping between e.g. /bin and the
# cygwin path on Windows. Without it we can't run bash scripts in actions.
# Ideally this should be solved in "pylib/gyp/msvs_emulation.py".
"pattern": ".",
"action": ["python", Var("root_dir") + "/build/win/setup_cygwin_mount.py",
"--win-only"],
},
]

View File

@@ -1,13 +1,2 @@
fbarchard@chromium.org
magjed@chromium.org
torbjorng@chromium.org
per-file *.gyp=kjellander@chromium.org
per-file *.gn=kjellander@chromium.org
per-file .gitignore=*
per-file AUTHORS=*
per-file DEPS=*
per-file PRESUBMIT.py=kjellander@chromium.org
per-file gyp_libyuv.py=kjellander@chromium.org
per-file setup_links.py=*
per-file sync_chromium.py=kjellander@chromium.org
fbarchard@chromium.org
mflodman@chromium.org

View File

@@ -1,8 +1,9 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1602
Version: 971
License: BSD
License File: LICENSE
Description:
libyuv is an open source project that includes YUV conversion and scaling functionality.
libyuv is an open source project that includes
YUV conversion and scaling functionality.

View File

@@ -1,10 +1,9 @@
# This file is used by gcl to get repository specific information.
CODE_REVIEW_SERVER: codereview.chromium.org
# The LibYuv code review is via WebRtc's code review
CODE_REVIEW_SERVER: webrtc-codereview.appspot.com
#CC_LIST:
VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
VIEW_VC: https://code.google.com/p/libyuv/source/detail?r=
#STATUS:
FORCE_HTTPS_COMMIT_URL: True
PROJECT: libyuv
TRY_ON_UPLOAD: False
TRYSERVER_ROOT: src
TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-libyuv

View File

@@ -18,6 +18,7 @@
#include "libyuv/convert_from.h"
#include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#include "libyuv/mjpeg_decoder.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"

View File

@@ -13,12 +13,26 @@
#include <stddef.h> // for NULL, size_t
#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
#include <sys/types.h> // for uintptr_t on x86
#else
#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
#include <stdint.h> // for uintptr_t
#endif
typedef uint64_t uint64;
typedef int64_t int64;
#if defined(_MSC_VER)
// nsprpub/pr/include/obsolete/protypes.h defines these weirdly
typedef long int32;
typedef unsigned long uint32;
#else
typedef uint32_t uint32;
typedef int32_t int32;
#endif
typedef uint16_t uint16;
typedef int16_t int16;
typedef uint8_t uint8;
typedef int8_t int8;
#define INT_TYPES_DEFINED 1
#ifndef GG_LONGLONG
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
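
A note on the hunk above: the backout restores libyuv's own fixed-width aliases, with an MSVC branch that maps int32/uint32 to long for compatibility with nsprpub's protypes.h. A minimal compile-time sketch (not part of the patch; the checks are illustrative) of the invariant both branches must satisfy:

// Sketch only: the long-based MSVC branch and the stdint branch must
// produce aliases with identical widths.
#include "libyuv/basic_types.h"
static_assert(sizeof(int32) == 4, "int32 must be 4 bytes");
static_assert(sizeof(uint32) == 4, "uint32 must be 4 bytes");
static_assert(sizeof(int64) == 8, "int64 must be 8 bytes");
static_assert(sizeof(uint8) == 1, "uint8 must be 1 byte");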

View File

@@ -22,11 +22,6 @@ extern "C" {
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
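
For orientation, the two entry points kept here are typically used as below (a hedged sketch: SketchCompare is a hypothetical helper, and the remaining ComputeSumSquareError parameters follow the upstream signature that this hunk truncates):

// Sketch: fingerprint one buffer, then measure raw error between two.
#include "libyuv/compare.h"

uint64 SketchCompare(const uint8* frame_a, const uint8* frame_b, int size) {
  uint32 hash = libyuv::HashDjb2(frame_a, size, 5381);  // 5381: classic djb2 seed
  (void)hash;  // a real caller would key a frame cache on this
  // Mean square error is sse / size; PSNR can be derived from that.
  return libyuv::ComputeSumSquareError(frame_a, frame_b, size);
}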

View File

@@ -12,8 +12,10 @@
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
#include "libyuv/rotate.h" // For enum RotationMode.
// TODO(fbarchard): Remove the following header includes.
#include "libyuv/convert_from.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#ifdef __cplusplus
namespace libyuv {
@@ -69,8 +71,6 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
@@ -113,6 +113,15 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert Q420 to I420.
LIBYUV_API
int Q420ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
@@ -202,6 +211,8 @@ int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height);
#endif
// Note Bayer formats (BGGR) To I420 are in format_conversion.h
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_y" number of bytes in a row of the dst_y plane.
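
As a usage sketch for the converters above (SketchNV12ToI420 is a hypothetical helper; strides assume tightly packed planes):

// Sketch: NV12 (Y plane + interleaved UV) to planar I420.
#include "libyuv/convert.h"

int SketchNV12ToI420(const uint8* src_y, const uint8* src_uv,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v,
                     int width, int height) {
  return libyuv::NV12ToI420(src_y, width,       // Y stride == width when packed
                            src_uv, width,      // interleaved UV rows are width bytes
                            dst_y, width,
                            dst_u, width / 2,   // chroma planes are half width
                            dst_v, width / 2,
                            width, height);
}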

View File

@@ -12,10 +12,13 @@
#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
#include "libyuv/basic_types.h"
#include "libyuv/rotate.h" // For enum RotationMode.
// TODO(fbarchard): Remove the following header includes
#include "libyuv/convert_from.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
// TODO(fbarchard): This set of functions should exactly match convert.h
// Add missing Q420.
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and or to I420 and compare.
// TODO(fbarchard): Some of these functions lack parameter setting.
@@ -58,22 +61,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I444 to ABGR.
LIBYUV_API
int I444ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert I411 to ARGB.
LIBYUV_API
int I411ToARGB(const uint8* src_y, int src_stride_y,
@@ -82,38 +69,20 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I420 with Alpha to preattenuated ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int attenuate);
// Convert I420 with Alpha to preattenuated ABGR.
LIBYUV_API
int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height, int attenuate);
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
// Convert I400 (grey) to ARGB.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J400 (jpeg grey) to ARGB.
LIBYUV_API
int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alias.
#define YToARGB I400ToARGB
#define YToARGB I400ToARGB_Reference
// Convert I400 to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert NV12 to ARGB.
LIBYUV_API
@@ -135,6 +104,13 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// TODO(fbarchard): Convert Q420 to ARGB.
// LIBYUV_API
// int Q420ToARGB(const uint8* src_y, int src_stride_y,
// const uint8* src_yuy2, int src_stride_yuy2,
// uint8* dst_argb, int dst_stride_argb,
// int width, int height);
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
@@ -147,70 +123,6 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert H420 to ARGB.
LIBYUV_API
int H420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert H422 to ARGB.
LIBYUV_API
int H422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert H420 to ABGR.
LIBYUV_API
int H420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert H422 to ABGR.
LIBYUV_API
int H422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
@@ -272,6 +184,8 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int dst_width, int dst_height);
#endif
// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
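
A usage sketch for the packed-to-ARGB converters that survive the backout (the helper name and sizes are illustrative):

// Sketch: unpack packed 4:2:2 YUY2 into 32-bit ARGB.
#include "libyuv/convert_argb.h"

int SketchYUY2ToARGB(const uint8* src_yuy2, uint8* dst_argb,
                     int width, int height) {
  return libyuv::YUY2ToARGB(src_yuy2, width * 2,  // YUY2: 2 bytes per pixel
                            dst_argb, width * 4,  // ARGB: 4 bytes per pixel
                            width, height);
}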

View File

@@ -56,6 +56,9 @@ int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// TODO(fbarchard): I420ToM420
// TODO(fbarchard): I420ToQ420
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -135,17 +138,6 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
const uint8* dither4x4, int width, int height);
LIBYUV_API
int I420ToARGB1555(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -160,6 +152,8 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
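
For the I420ToNV12 declaration kept above, a hedged sketch (buffers are assumed preallocated; the helper is hypothetical):

// Sketch: repack planar I420 into NV12 (Y plane plus interleaved UV).
#include "libyuv/convert_from.h"

int SketchI420ToNV12(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                     uint8* dst_y, uint8* dst_uv, int width, int height) {
  return libyuv::I420ToNV12(src_y, width,
                            src_u, width / 2,
                            src_v, width / 2,
                            dst_y, width,
                            dst_uv, width,  // interleaved UV rows are width bytes
                            width, height);
}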

View File

@@ -25,22 +25,24 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert ARGB To BGRA.
// Convert ARGB To BGRA. (alias)
#define ARGBToBGRA BGRAToARGB
LIBYUV_API
int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
uint8* dst_bgra, int dst_stride_bgra,
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert ARGB To ABGR.
// Convert ARGB To ABGR. (alias)
#define ARGBToABGR ABGRToARGB
LIBYUV_API
int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
uint8* dst_abgr, int dst_stride_abgr,
int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert ARGB To RGBA.
LIBYUV_API
int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgba, int dst_stride_rgba,
int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert ARGB To RGB24.
@@ -61,16 +63,6 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
// const uint8(*dither)[4][4];
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither4x4, int width, int height);
// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
@@ -115,14 +107,6 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert ARGB to J422.
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert ARGB To I411.
LIBYUV_API
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
@@ -143,12 +127,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
LIBYUV_API
int ARGBToG(const uint8* src_argb, int src_stride_argb,
uint8* dst_g, int dst_stride_g,
int width, int height);
// Convert ARGB To NV12.
LIBYUV_API
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,

View File

@@ -18,8 +18,9 @@ namespace libyuv {
extern "C" {
#endif
// TODO(fbarchard): Consider overlapping bits for different architectures.
// Internal flag to indicate cpuid requires initialization.
static const int kCpuInitialized = 0x1;
#define kCpuInit 0x1
// These flags are only valid on ARM processors.
static const int kCpuHasARM = 0x2;
@@ -36,12 +37,12 @@ static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasAVX3 = 0x2000;
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasDSPR2 = 0x20000;
static const int kCpuHasMIPS_DSP = 0x20000;
static const int kCpuHasMIPS_DSPR2 = 0x40000;
// Internal function used to auto-init.
LIBYUV_API
@@ -56,13 +57,13 @@ int ArmCpuCaps(const char* cpuinfo_name);
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API extern int cpu_info_;
return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
}
// For testing, allow CPU flags to be disabled.
// e.g. MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// MaskCpuFlags(-1) to enable all cpu specific optimizations.
// MaskCpuFlags(1) to disable all cpu specific optimizations.
// MaskCpuFlags(0) to disable all cpu specific optimizations.
LIBYUV_API
void MaskCpuFlags(int enable_flags);
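
The inline TestCpuFlag above lazily calls InitCpuFlags on first use, and MaskCpuFlags exists so tests can constrain which code paths get picked. A small sketch (hypothetical helper):

// Sketch: query a SIMD capability, then restrict flags for testing.
#include "libyuv/cpu_id.h"

void SketchCpuFlags() {
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    // SSSE3-optimized row functions will be selected internally.
  }
  libyuv::MaskCpuFlags(~libyuv::kCpuHasSSSE3);  // disable SSSE3 only
  libyuv::MaskCpuFlags(-1);                     // restore full detection
}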

View File

@@ -43,17 +43,6 @@ enum JpegSubsamplingType {
kJpegUnknown
};
struct Buffer {
const uint8* data;
int len;
};
struct BufferVector {
Buffer* buffers;
int len;
int pos;
};
struct SetJmpErrorMgr;
// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
@@ -153,6 +142,27 @@ class LIBYUV_API MJpegDecoder {
int* subsample_x, int* subsample_y, int number_of_components);
private:
struct Buffer {
const uint8* data;
int len;
};
struct BufferVector {
Buffer* buffers;
int len;
int pos;
};
// Methods that are passed to jpeglib.
static int fill_input_buffer(jpeg_decompress_struct* cinfo);
static void init_source(jpeg_decompress_struct* cinfo);
static void skip_input_data(jpeg_decompress_struct* cinfo,
long num_bytes); // NOLINT
static void term_source(jpeg_decompress_struct* cinfo);
static void ErrorHandler(jpeg_common_struct* cinfo);
static void OutputHandler(jpeg_common_struct* cinfo);
void AllocOutputBuffers(int num_outbufs);
void DestroyOutputBuffers();

View File

@@ -28,11 +28,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
LIBYUV_API
void CopyPlane_16(const uint16* src_y, int src_stride_y,
uint16* dst_y, int dst_stride_y,
int width, int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
@@ -45,7 +40,6 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
#define J400ToJ400 I400ToI400
// Copy I422 to I422.
#define I422ToI422 I422Copy
@@ -85,18 +79,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_v, int dst_stride_v,
int width, int height);
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
LIBYUV_API
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
@@ -106,7 +88,6 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
int width, int height);
// Alias
#define J420ToJ400 I420ToI400
#define I420ToI420Mirror I420Mirror
// I420 mirror.
@@ -145,6 +126,13 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert NV21 to RGB565.
LIBYUV_API
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
@@ -170,14 +158,6 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
// Alias
#define RGB24ToRAW RAWToRGB24
LIBYUV_API
int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height);
// Draw a rectangle into I420.
LIBYUV_API
int I420Rect(uint8* dst_y, int dst_stride_y,
@@ -282,19 +262,13 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Copy Alpha channel of ARGB to alpha of ARGB.
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Extract the alpha channel from ARGB.
LIBYUV_API
int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_a, int dst_stride_a,
int width, int height);
// Copy Y channel to Alpha of ARGB.
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
@@ -308,7 +282,6 @@ LIBYUV_API
ARGBBlendRow GetARGBBlend();
// Alpha Blend ARGB images and store to destination.
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@@ -316,31 +289,6 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alpha Blend plane and store to destination.
// Source is not pre-multiplied by alpha.
LIBYUV_API
int BlendPlane(const uint8* src_y0, int src_stride_y0,
const uint8* src_y1, int src_stride_y1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Alpha Blend YUV images and store to destination.
// Source is not pre-multiplied by alpha.
// Alpha is full width x height and subsampled to half size to apply to UV.
LIBYUV_API
int I420Blend(const uint8* src_y0, int src_stride_y0,
const uint8* src_u0, int src_stride_u0,
const uint8* src_v0, int src_stride_v0,
const uint8* src_y1, int src_stride_y1,
const uint8* src_u1, int src_stride_u1,
const uint8* src_v1, int src_stride_v1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
@@ -390,6 +338,12 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert MJPG to ARGB.
LIBYUV_API
int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* argb, int argb_stride,
int w, int h, int dw, int dh);
// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
@@ -416,63 +370,36 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
// Interpolate between two images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
// and 255 means 1% src0 and 99% src1.
LIBYUV_API
int InterpolatePlane(const uint8* src0, int src_stride0,
const uint8* src1, int src_stride1,
uint8* dst, int dst_stride,
int width, int height, int interpolation);
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
// and 255 means 1% src_argb0 and 99% src_argb1.
// Internally uses ARGBScale bilinear filtering.
// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation);
// Interpolate between two YUV images using specified amount of interpolation
// Internally calls InterpolatePlane on each plane where the U and V planes
// are half width and half height.
LIBYUV_API
int I420Interpolate(const uint8* src0_y, int src0_stride_y,
const uint8* src0_u, int src0_stride_u,
const uint8* src0_v, int src0_stride_v,
const uint8* src1_y, int src1_stride_y,
const uint8* src1_u, int src1_stride_u,
const uint8* src1_v, int src1_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height, int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR)
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBAFFINEROW_SSE2
#endif
// Row function for copying pixels from a source with a slope to a row
// Row functions for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
#define HAS_ARGBAFFINEROW_SSE2
#endif // LIBYUV_DISABLE_X86
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
// shuffler is 16 bytes and must be aligned.
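
The interpolation comments above read easiest with a concrete value: 'interpolation' is an 8-bit fraction, so 0 yields src_argb0, 255 (nearly) src_argb1, and 128 an even blend. A sketch with a hypothetical helper:

// Sketch: 50/50 cross-fade between two equally sized ARGB frames.
#include "libyuv/planar_functions.h"

int SketchCrossFade(const uint8* argb0, const uint8* argb1,
                    uint8* dst, int width, int height) {
  return libyuv::ARGBInterpolate(argb0, width * 4,
                                 argb1, width * 4,
                                 dst, width * 4,
                                 width, height, 128);  // 128/256: halfway blend
}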

File diff suppressed because it is too large

View File

@@ -34,13 +34,6 @@ void ScalePlane(const uint8* src, int src_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
LIBYUV_API
void ScalePlane_16(const uint16* src, int src_stride,
int src_width, int src_height,
uint16* dst, int dst_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
@@ -62,17 +55,6 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_width, int dst_height,
enum FilterMode filtering);
LIBYUV_API
int I420Scale_16(const uint16* src_y, int src_stride_y,
const uint16* src_u, int src_stride_u,
const uint16* src_v, int src_stride_v,
int src_width, int src_height,
uint16* dst_y, int dst_stride_y,
uint16* dst_u, int dst_stride_u,
uint16* dst_v, int dst_stride_v,
int dst_width, int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
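
A usage sketch for the I420Scale entry point retained above (dimensions, strides and the helper are illustrative; kFilterBilinear is one of the FilterMode values this header keeps):

// Sketch: downscale an I420 frame to half size with bilinear filtering.
#include "libyuv/scale.h"

int SketchHalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                    int sw, int sh,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dw = sw / 2;
  const int dh = sh / 2;
  return libyuv::I420Scale(src_y, sw, src_u, sw / 2, src_v, sw / 2, sw, sh,
                           dst_y, dw, dst_u, dw / 2, dst_v, dw / 2, dw, dh,
                           libyuv::kFilterBilinear);
}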

View File

@@ -35,6 +35,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering);
// TODO(fbarchard): Implement this.
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,

View File

@@ -12,94 +12,54 @@
#define INCLUDE_LIBYUV_SCALE_ROW_H_
#include "libyuv/basic_types.h"
#include "libyuv/scale.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR)
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
#define GCC_HAS_AVX2 1
#endif // GNUC >= 4.7
#endif // __GNUC__
// clang >= 3.4.0 required for AVX2.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
#define CLANG_HAS_AVX2 1
#endif // clang >= 3.4
#endif // __clang__
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
#define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN4_SSE2
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3
#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#define HAS_SCALEADDROWS_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_FIXEDDIV_X86
#define HAS_FIXEDDIV1_X86
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SCALEROWDOWN2_DSPR2
#define HAS_SCALEROWDOWN4_DSPR2
#define HAS_SCALEROWDOWN34_DSPR2
#define HAS_SCALEROWDOWN38_DSPR2
#define HAS_SCALEROWDOWN2_MIPS_DSPR2
#define HAS_SCALEROWDOWN4_MIPS_DSPR2
#define HAS_SCALEROWDOWN34_MIPS_DSPR2
#define HAS_SCALEROWDOWN38_MIPS_DSPR2
#endif
// Scale ARGB vertically with bilinear interpolation.
@@ -110,13 +70,6 @@ void ScalePlaneVertical(int src_height,
int x, int y, int dy,
int bpp, enum FilterMode filtering);
void ScalePlaneVertical_16(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_argb, uint16* dst_argb,
int x, int y, int dy,
int wpp, enum FilterMode filtering);
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
int dst_width, int dst_height,
@@ -144,70 +97,37 @@ void ScaleSlope(int src_width, int src_height,
void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width);
void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width);
void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int, int);
void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int, int);
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
@@ -234,28 +154,25 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
@@ -272,128 +189,46 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height);
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
// ARGB Column functions
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
// Row functions.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
@@ -401,8 +236,7 @@ void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
@@ -437,63 +271,27 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32 -> 12
void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#ifdef __cplusplus
} // extern "C"
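
The HAS_* blocks above only advertise compile-time availability; selection still happens at runtime. A simplified sketch of the dispatch pattern (alignment and width guards omitted; the helper is hypothetical):

// Sketch: pick a row scaler from the HAS_ macro plus the runtime CPU flag,
// falling back to the portable C version.
#include "libyuv/cpu_id.h"
#include "libyuv/scale_row.h"

void SketchScaleRow(const uint8* src, ptrdiff_t stride,
                    uint8* dst, int dst_width) {
  void (*row)(const uint8*, ptrdiff_t, uint8*, int) = libyuv::ScaleRowDown2_C;
#if defined(HAS_SCALEROWDOWN2_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    row = libyuv::ScaleRowDown2_NEON;
  }
#endif
  row(src, stride, dst, dst_width);
}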

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1602
#define LIBYUV_VERSION 971
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@@ -62,7 +62,7 @@ enum FourCC {
// 2 Secondary YUV formats: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -75,7 +75,7 @@ enum FourCC {
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
// 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
// 4 Secondary RGB formats: 4 Bayer Patterns.
FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
@@ -90,8 +90,7 @@ enum FourCC {
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
FOURCC_J400 = FOURCC('J', '4', '0', '0'),
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -151,7 +150,6 @@ enum FourCCBpp {
FOURCC_BPP_YU12 = 12,
FOURCC_BPP_J420 = 12,
FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,
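
The alias comment above notes that CanonicalFourCC() maps auxiliary fourccs onto canonical ones; the usual pattern is to normalize an externally supplied fourcc before switching on it. A sketch (hypothetical helper):

// Sketch: IYUV is an alias that CanonicalFourCC folds onto FOURCC_I420.
#include "libyuv/video_common.h"

uint32 SketchNormalizeFourCC() {
  uint32 raw = FOURCC('I', 'Y', 'U', 'V');  // packs four chars little-endian
  return libyuv::CanonicalFourCC(raw);      // yields libyuv::FOURCC_I420
}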

View File

@@ -10,83 +10,92 @@
'includes': [
'libyuv.gypi',
],
# Make sure that if we are being compiled to an xcodeproj, nothing tries to
# include a .pch.
'xcode_settings': {
'GCC_PREFIX_HEADER': '',
'GCC_PRECOMPILE_PREFIX_HEADER': 'NO',
},
'variables': {
'use_system_libjpeg%': 0,
'libyuv_disable_jpeg%': 0,
# 'chromium_code' treats libyuv as internal and increases warning level.
'chromium_code': 1,
# clang compiler default variable usable by other apps that include libyuv.
'clang%': 0,
# Link-Time Optimizations.
'use_lto%': 0,
'yuv_disable_asm%': 0,
'yuv_disable_avx2%': 0,
'build_neon': 0,
'conditions': [
['(target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
and (arm_neon == 1 or arm_neon_optional == 1)',
{
['target_arch == "arm" and arm_version >= 7 and (arm_neon == 1 or arm_neon_optional == 1)', {
'build_neon': 1,
}],
],
},
'conditions': [
[ 'build_neon != 0', {
'targets': [
# The NEON-specific components.
{
'target_name': 'libyuv_neon',
'type': 'static_library',
'standalone_static_library': 1,
'defines': [
'LIBYUV_NEON',
],
# TODO(noahric): This should remove whatever mfpu is set, not
# just vfpv3-d16.
'cflags!': [
'-mfpu=vfp',
'-mfpu=vfpv3',
'-mfpu=vfpv3-d16',
],
# XXX Doesn't work currently
'cflags_mozilla!': [
'-mfpu=vfp',
'-mfpu=vfpv3',
'-mfpu=vfpv3-d16',
],
'cflags': [
'-mfpu=neon',
],
'cflags_mozilla': [
'-mfpu=neon',
],
'include_dirs': [
'include',
'.',
],
'direct_dependent_settings': {
'include_dirs': [
'include',
'.',
],
},
'sources': [
# sources.
'source/compare_neon.cc',
'source/rotate_neon.cc',
'source/row_neon.cc',
'source/scale_neon.cc',
],
},
],
}],
],
'targets': [
{
'target_name': 'libyuv',
# Change type to 'shared_library' to build .so or .dll files.
'type': 'static_library',
'variables': {
'optimize': 'max', # enable O2 and ltcg.
},
# Allows libyuv.a redistributable library without external dependencies.
'standalone_static_library': 1,
# 'standalone_static_library': 1,
'conditions': [
# Disable -Wunused-parameter
['clang == 1', {
'cflags': [
'-Wno-unused-parameter',
],
}],
['build_neon != 0', {
# TODO(fbarchard): Use gyp define to enable jpeg.
[ 'build_with_mozilla==1', {
'defines': [
'LIBYUV_NEON',
'HAVE_JPEG'
],
'cflags!': [
'-mfpu=vfp',
'-mfpu=vfpv3',
'-mfpu=vfpv3-d16',
# '-mthumb', # arm32 not thumb
],
'conditions': [
# Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
['clang == 0 and use_lto == 1', {
'cflags!': [
'-flto',
'-ffat-lto-objects',
],
}],
# arm64 does not need -mfpu=neon option as neon is not optional
['target_arch != "arm64"', {
'cflags': [
'-mfpu=neon',
# '-marm', # arm32 not thumb
],
}],
'cflags_mozilla': [
'$(MOZ_JPEG_CFLAGS)',
],
}],
['OS != "ios" and libyuv_disable_jpeg != 1', {
[ 'OS != "ios" and build_with_mozilla!=1', {
'defines': [
'HAVE_JPEG'
],
'conditions': [
# Caveat: system jpeg support may not include motion jpeg.
[ 'use_system_libjpeg == 1', {
# Android uses libjpeg for system jpeg support.
[ 'OS == "android" and use_system_libjpeg == 1', {
'dependencies': [
'<(DEPTH)/third_party/libjpeg/libjpeg.gyp:libjpeg',
],
@ -104,15 +113,37 @@
}],
],
}],
], #conditions
[ 'build_neon != 0', {
'dependencies': [
'libyuv_neon',
],
'defines': [
'LIBYUV_NEON',
]
}],
[ 'yuv_disable_asm!=0', {
'defines': [
# Enable the following 3 macros to turn off assembly for specified CPU.
'LIBYUV_DISABLE_X86',
'LIBYUV_DISABLE_NEON',
'LIBYUV_DISABLE_MIPS',
],
}],
[ 'yuv_disable_avx2==1', {
'defines': [
'LIBYUV_DISABLE_AVX2',
]
}],
],
'defines': [
# Enable the following 3 macros to turn off assembly for specified CPU.
# 'LIBYUV_DISABLE_X86',
# 'LIBYUV_DISABLE_NEON',
# 'LIBYUV_DISABLE_MIPS',
# This disables AVX2 (Haswell) support, overriding compiler checks
# 'LIBYUV_DISABLE_AVX2',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
# TODO(fbarchard): Make these into gyp defines.
],
'include_dirs': [
'include',
@ -123,18 +154,6 @@
'include',
'.',
],
'conditions': [
['OS == "android" and target_arch == "arm64"', {
'ldflags': [
'-Wl,--dynamic-linker,/system/bin/linker64',
],
}],
['OS == "android" and target_arch != "arm64"', {
'ldflags': [
'-Wl,--dynamic-linker,/system/bin/linker',
],
}],
], #conditions
},
'sources': [
'<@(libyuv_sources)',

View File

@ -18,11 +18,11 @@
'include/libyuv/convert_from.h',
'include/libyuv/convert_from_argb.h',
'include/libyuv/cpu_id.h',
'include/libyuv/format_conversion.h',
'include/libyuv/mjpeg_decoder.h',
'include/libyuv/planar_functions.h',
'include/libyuv/rotate.h',
'include/libyuv/rotate_argb.h',
'include/libyuv/rotate_row.h',
'include/libyuv/row.h',
'include/libyuv/scale.h',
'include/libyuv/scale_argb.h',
@ -33,9 +33,7 @@
# sources.
'source/compare.cc',
'source/compare_common.cc',
'source/compare_gcc.cc',
'source/compare_neon.cc',
'source/compare_neon64.cc',
'source/compare_posix.cc',
'source/compare_win.cc',
'source/convert.cc',
'source/convert_argb.cc',
@ -45,33 +43,23 @@
'source/convert_to_argb.cc',
'source/convert_to_i420.cc',
'source/cpu_id.cc',
'source/format_conversion.cc',
'source/mjpeg_decoder.cc',
'source/mjpeg_validate.cc',
'source/planar_functions.cc',
'source/rotate.cc',
'source/rotate_any.cc',
'source/rotate_argb.cc',
'source/rotate_common.cc',
'source/rotate_gcc.cc',
'source/rotate_mips.cc',
'source/rotate_neon.cc',
'source/rotate_neon64.cc',
'source/rotate_win.cc',
'source/row_any.cc',
'source/row_common.cc',
'source/row_gcc.cc',
'source/row_mips.cc',
'source/row_neon.cc',
'source/row_neon64.cc',
'source/row_posix.cc',
'source/row_win.cc',
'source/scale.cc',
'source/scale_any.cc',
'source/scale_argb.cc',
'source/scale_common.cc',
'source/scale_gcc.cc',
'source/scale_mips.cc',
'source/scale_neon.cc',
'source/scale_neon64.cc',
'source/scale_posix.cc',
'source/scale_win.cc',
'source/video_common.cc',
],

View File

@ -21,6 +21,9 @@
'build_newlib': 0,
'build_pnacl_newlib': 1,
},
'dependencies': [
'../../native_client/tools.gyp:prep_toolchain',
],
'include_dirs': [
'include',
],

View File

@ -7,25 +7,24 @@
# be found in the AUTHORS file in the root of the source tree.
{
'variables': {
'libyuv_disable_jpeg%': 0,
},
'targets': [
{
'target_name': 'libyuv_unittest',
'type': '<(gtest_target_type)',
'type': 'executable',
'dependencies': [
'libyuv.gyp:libyuv',
# The tests are based on gtest
'testing/gtest.gyp:gtest',
'third_party/gflags/gflags.gyp:gflags',
'testing/gtest.gyp:gtest_main',
],
'direct_dependent_settings': {
'defines': [
'GTEST_RELATIVE_PATH',
],
},
'export_dependent_settings': [
'<(DEPTH)/testing/gtest.gyp:gtest',
'defines': [
'LIBYUV_SVNREVISION="<!(svnversion -n)"',
# Enable the following 3 macros to turn off assembly for specified CPU.
# 'LIBYUV_DISABLE_X86',
# 'LIBYUV_DISABLE_NEON',
# 'LIBYUV_DISABLE_MIPS',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
],
'sources': [
# headers
@ -34,7 +33,6 @@
# sources
'unit_test/basictypes_test.cc',
'unit_test/compare_test.cc',
'unit_test/color_test.cc',
'unit_test/convert_test.cc',
'unit_test/cpu_test.cc',
'unit_test/math_test.cc',
@ -45,6 +43,7 @@
'unit_test/scale_test.cc',
'unit_test/unit_test.cc',
'unit_test/video_common_test.cc',
'unit_test/version_test.cc',
],
'conditions': [
['OS=="linux"', {
@ -52,55 +51,14 @@
'-fexceptions',
],
}],
[ 'OS == "ios" and target_subarch == 64', {
'defines': [
'LIBYUV_DISABLE_NEON'
],
}],
[ 'OS == "ios"', {
'xcode_settings': {
'DEBUGGING_SYMBOLS': 'YES',
'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
# Work around compile issue with isosim.mm, see
# https://code.google.com/p/libyuv/issues/detail?id=548 for details.
'WARNING_CFLAGS': [
'-Wno-sometimes-uninitialized',
],
},
'cflags': [
'-Wno-sometimes-uninitialized',
],
}],
[ 'OS != "ios" and libyuv_disable_jpeg != 1', {
[ 'OS != "ios"', {
'defines': [
'HAVE_JPEG',
],
}],
['OS=="android"', {
'dependencies': [
'<(DEPTH)/testing/android/native_test.gyp:native_test_native_code',
],
}],
# TODO(YangZhang): These lines can be removed when high accuracy
# YUV to RGB conversion is ported to Neon.
[ '(target_arch == "armv7" or target_arch == "armv7s" \
or (target_arch == "arm" and arm_version >= 7) \
or target_arch == "arm64") \
and (arm_neon == 1 or arm_neon_optional == 1)', {
'defines': [
'LIBYUV_NEON'
],
}],
], # conditions
'defines': [
# Enable the following 3 macros to turn off assembly for specified CPU.
# 'LIBYUV_DISABLE_X86',
# 'LIBYUV_DISABLE_NEON',
# 'LIBYUV_DISABLE_MIPS',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
],
},
{
'target_name': 'compare',
'type': 'executable',
@ -147,24 +105,7 @@
'util/psnr.cc',
'util/ssim.cc',
],
'dependencies': [
'libyuv.gyp:libyuv',
],
'conditions': [
[ 'OS == "ios" and target_subarch == 64', {
'defines': [
'LIBYUV_DISABLE_NEON'
],
}],
[ 'OS != "ios" and libyuv_disable_jpeg != 1', {
'defines': [
'HAVE_JPEG',
],
}],
], # conditions
},
{
'target_name': 'cpuid',
'type': 'executable',
@ -177,50 +118,6 @@
],
},
], # targets
'conditions': [
['OS=="android"', {
'targets': [
{
# TODO(kjellander): Figure out what to change in build/apk_test.gypi
# so it can be used instead of the copied code below. Using it in its
# current version was not possible, since the target starts with 'lib',
# which somewhere confuses the variables.
'target_name': 'libyuv_unittest_apk',
'type': 'none',
'variables': {
# These are used to configure java_apk.gypi included below.
'test_type': 'gtest',
'apk_name': 'libyuv_unittest',
'test_suite_name': 'libyuv_unittest',
'intermediate_dir': '<(PRODUCT_DIR)/libyuv_unittest_apk',
'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
'final_apk_path': '<(intermediate_dir)/libyuv_unittest-debug.apk',
'java_in_dir': '<(DEPTH)/testing/android/native_test/java',
'test_runner_path': '<(DEPTH)/util/android/test_runner.py',
'native_lib_target': 'libyuv_unittest',
'gyp_managed_install': 0,
},
'includes': [
'build/android/test_runner.gypi',
'build/java_apk.gypi',
],
'dependencies': [
'<(DEPTH)/base/base.gyp:base_java',
# TODO(kjellander): Figure out why base_build_config_gen is needed
# here. It really shouldn't be, since it's a dependency of base_java
# above, but zero tests run if it's missing.
'<(DEPTH)/base/base.gyp:base_build_config_gen',
'<(DEPTH)/build/android/pylib/device/commands/commands.gyp:chromium_commands',
'<(DEPTH)/build/android/pylib/remote/device/dummy/dummy.gyp:remote_device_dummy_apk',
'<(DEPTH)/testing/android/appurify_support.gyp:appurify_support_java',
'<(DEPTH)/testing/android/on_device_instrumentation.gyp:reporter_java',
'<(DEPTH)/tools/android/android_tools.gyp:android_tools',
'libyuv_unittest',
],
},
],
}],
],
}
# Local Variables:

View File

@ -1,81 +1,48 @@
# This is a generic makefile for libyuv for gcc.
# make -f linux.mk CXX=clang++
# make -f linux.mk CC=clang++
CC?=gcc
CFLAGS?=-O2 -fomit-frame-pointer
CFLAGS+=-Iinclude/
CXX?=g++
CXXFLAGS?=-O2 -fomit-frame-pointer
CXXFLAGS+=-Iinclude/
CC=g++
CCFLAGS=-O2 -fomit-frame-pointer -Iinclude/
LOCAL_OBJ_FILES := \
source/compare.o \
source/compare_common.o \
source/compare_gcc.o \
source/compare_neon64.o \
source/compare_neon.o \
source/compare_win.o \
source/convert_argb.o \
source/convert.o \
source/convert_from_argb.o \
source/convert_from.o \
source/convert_jpeg.o \
source/convert_to_argb.o \
source/convert_to_i420.o \
source/cpu_id.o \
source/mjpeg_decoder.o \
source/mjpeg_validate.o \
source/planar_functions.o \
source/rotate_any.o \
source/rotate_argb.o \
source/rotate.o \
source/rotate_common.o \
source/rotate_gcc.o \
source/rotate_mips.o \
source/rotate_neon64.o \
source/rotate_neon.o \
source/rotate_win.o \
source/row_any.o \
source/row_common.o \
source/row_gcc.o \
source/row_mips.o \
source/row_neon64.o \
source/row_neon.o \
source/row_win.o \
source/scale_any.o \
source/scale_argb.o \
source/scale.o \
source/scale_common.o \
source/scale_gcc.o \
source/scale_mips.o \
source/scale_neon64.o \
source/scale_neon.o \
source/scale_win.o \
source/video_common.o
source/compare.o \
source/compare_common.o \
source/compare_posix.o \
source/convert.o \
source/convert_argb.o \
source/convert_from.o \
source/convert_from_argb.o \
source/convert_to_argb.o \
source/convert_to_i420.o \
source/cpu_id.o \
source/format_conversion.o \
source/planar_functions.o \
source/rotate.o \
source/rotate_argb.o \
source/rotate_mips.o \
source/row_any.o \
source/row_common.o \
source/row_mips.o \
source/row_posix.o \
source/scale.o \
source/scale_argb.o \
source/scale_common.o \
source/scale_mips.o \
source/scale_posix.o \
source/video_common.o
.cc.o:
$(CXX) -c $(CXXFLAGS) $*.cc -o $*.o
$(CC) -c $(CCFLAGS) $*.cc -o $*.o
.c.o:
$(CC) -c $(CFLAGS) $*.c -o $*.o
all: libyuv.a convert linux.mk
all: libyuv.a convert cpuid psnr
libyuv.a: $(LOCAL_OBJ_FILES) linux.mk
$(AR) $(ARFLAGS) -o $@ $(LOCAL_OBJ_FILES)
libyuv.a: $(LOCAL_OBJ_FILES)
$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
# A C++ test utility that uses libyuv conversion.
convert: util/convert.cc libyuv.a
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
# A standalone test utility
psnr: util/psnr.cc
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
# A C test utility that exercises libyuv's CPU detection from C.
cpuid: util/cpuid.c libyuv.a
$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
# A test utility that uses libyuv conversion.
convert: util/convert.cc linux.mk
$(CC) $(CCFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
clean:
/bin/rm -f source/*.o *.ii *.s libyuv.a convert cpuid psnr
/bin/rm -f source/*.o *.ii *.s libyuv.a convert

View File

@ -17,23 +17,38 @@
#endif
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
// This module is for Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#if _MSC_VER >= 1700
#define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif
#endif // HAS_HASHDJB2_SSE41
// hash seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
HashDjb2_C;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
@ -63,53 +78,22 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
return seed;
}
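For readers of this hunk: HashDjb2_C referenced above is the classic djb2 hash (hash = hash * 33 + byte), and because the seed is chained the loop above can feed it 32 KB blocks. A scalar sketch equivalent in behavior (not the file's actual code):

// Scalar djb2; seed chaining is what lets HashDjb2 above hash block by block.
static uint32 HashDjb2_Sketch(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];
  }
  return hash;
}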
static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
}
if (width & 1) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
}
return 0;
}
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
uint32 fourcc = 0;
int h;
// Coalesce rows.
if (stride_argb == width * 4) {
width *= height;
height = 1;
stride_argb = 0;
}
for (h = 0; h < height && fourcc == 0; ++h) {
fourcc = ARGBDetectRow_C(argb, width);
argb += stride_argb;
}
return fourcc;
}
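A hypothetical usage sketch for ARGBDetect (values invented for illustration; per the row function above, 255 in byte 0 with byte 3 != 255 reports FOURCC_ARGB):

// 8x1 opaque image, 4 bytes per pixel, alpha stored in byte 0.
uint8 pixels[8 * 4];
int i;
for (i = 0; i < 8 * 4; i += 4) {
  pixels[i + 0] = 255;  // alpha
  pixels[i + 1] = 10;   // color channels, anything but 255
  pixels[i + 2] = 20;
  pixels[i + 3] = 30;
}
uint32 fourcc = ARGBDetect(pixels, 8 * 4, 8, 1);  // FOURCC_ARGB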
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
// Visual C 2012 required for AVX2.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
@ -130,7 +114,8 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
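For reference, the contract every SumSquareError variant dispatched above must satisfy is the plain sum of squared byte differences; a scalar sketch (the real C fallback lives in compare_common.cc):

static uint32 SumSquareError_Sketch(const uint8* src_a, const uint8* src_b,
                                    int count) {
  uint32 sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int diff = (int)src_a[i] - (int)src_b[i];
    sse += (uint32)(diff * diff);  // accumulate (a - b)^2 per byte.
  }
  return sse;
}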

View File

@ -10,8 +10,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {

View File

@ -10,16 +10,12 @@
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
@ -29,10 +25,9 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
@ -58,7 +53,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"

View File

@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@ -18,8 +16,7 @@ namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
@ -30,11 +27,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0
pxor xmm5, xmm5
align 4
wloop:
movdqu xmm1, [eax]
movdqa xmm1, [eax]
lea eax, [eax + 16]
movdqu xmm2, [edx]
movdqa xmm2, [edx]
lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
@ -46,7 +45,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
@ -72,10 +70,12 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
align 4
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
@ -85,7 +85,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
@ -101,32 +100,41 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
}
#endif // _MSC_VER >= 1700
uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
uvec32 kHashMul0 = {
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
static uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
static uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
static uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
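These tables come from unrolling djb2 sixteen bytes at a time: hash' = hash * 33^16 + src[0]*33^15 + ... + src[15]*33^0, all modulo 2^32, which is what the pmulld/paddd sequences below evaluate. A standalone sketch checking the identity (illustrative names, not from the source):

#include <stdint.h>

// Sixteen scalar djb2 steps...
static uint32_t Djb2Scalar16(uint32_t hash, const uint8_t* s) {
  int i;
  for (i = 0; i < 16; ++i) hash = hash * 33u + s[i];
  return hash;
}

// ...equal one blocked step: hash * 33^16 plus a dot product with
// descending powers of 33 (the kHash16x33/kHashMul* constants above).
static uint32_t Djb2Block16(uint32_t hash, const uint8_t* s) {
  uint32_t pow = 1u, result;
  int i;
  for (i = 0; i < 16; ++i) pow *= 33u;  // 33^16 mod 2^32 == 0x92d9e201.
  result = hash * pow;
  pow = 1u;
  for (i = 15; i >= 0; --i) {
    result += s[i] * pow;  // s[15]*33^0 up through s[0]*33^15.
    pow *= 33u;
  }
  return result;
}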
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
@ -135,32 +143,34 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, xmmword ptr kHash16x33
movdqa xmm6, kHash16x33
align 4
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, xmmword ptr kHashMul0
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
movdqa xmm5, kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, xmmword ptr kHashMul1
pmulld(0xdd) // pmulld xmm3, xmm5
movdqa xmm5, kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, xmmword ptr kHashMul2
pmulld(0xe5) // pmulld xmm4, xmm5
movdqa xmm5, kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, xmmword ptr kHashMul3
pmulld(0xd5) // pmulld xmm2, xmm5
movdqa xmm5, kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
@ -168,7 +178,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
@ -183,38 +192,39 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
align 4
wloop:
vpmovzxbd xmm3, [eax] // src[0-3]
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, [eax + 4] // src[4-7]
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, [eax + 8] // src[8-11]
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, [eax + 12] // src[12-15]
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
pmulld xmm3, kHashMul0
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
pmulld xmm4, kHashMul1
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
pmulld xmm2, kHashMul2
lea eax, [eax + 16]
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
vpaddd xmm1, xmm1,xmm2
vpshufd xmm2, xmm1, 0x01
vpaddd xmm1, xmm1, xmm2
vpaddd xmm0, xmm0, xmm1
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
jg wloop
vmovd eax, xmm0 // return hash
vzeroupper
movd eax, xmm0 // return hash
ret
}
}
#endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -9,7 +9,6 @@
*/
#include "libyuv/convert.h"
#include "libyuv/convert_argb.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
@ -219,7 +218,7 @@ int MJPGToI420(const uint8* sample,
return 1;
}
}
return ret ? 0 : 1;
return ret ? 0 : -1;
}
#ifdef HAVE_JPEG
@ -381,7 +380,7 @@ int MJPGToARGB(const uint8* sample,
return 1;
}
}
return ret ? 0 : 1;
return ret ? 0 : -1;
}
#endif

View File

@ -11,6 +11,7 @@
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
@ -23,7 +24,7 @@ namespace libyuv {
extern "C" {
#endif
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// src_width is used for source stride computation
// src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame.
@ -51,8 +52,8 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// also enable temporary buffer.
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
crop_argb == sample;
uint8* dest_argb = crop_argb;
int dest_argb_stride = argb_stride;
uint8* tmp_argb = crop_argb;
int tmp_argb_stride = argb_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
@ -66,13 +67,13 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
}
if (need_buf) {
int argb_size = crop_width * 4 * abs_crop_height;
int argb_size = crop_width * abs_crop_height * 4;
rotate_buffer = (uint8*)malloc(argb_size);
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
crop_argb = rotate_buffer;
argb_stride = crop_width * 4;
argb_stride = crop_width;
}
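Context for the sizing above, with made-up dimensions: ARGB is 4 bytes per pixel, so a packed row stride is measured in bytes, not pixels:

// e.g. crop_width = 640, abs_crop_height = 360:
int argb_size = 640 * 4 * 360;    // 921600 bytes for the whole buffer.
int argb_stride_bytes = 640 * 4;  // 2560 bytes per packed ARGB row.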
switch (format) {
@ -143,6 +144,36 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
// TODO(fbarchard): Support cropping Bayer by odd numbers
// by adjusting fourcc.
case FOURCC_BGGR:
src = sample + (src_width * crop_y + crop_x);
r = BayerBGGRToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GBRG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGBRGToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GRBG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGRBGToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGGB:
src = sample + (src_width * crop_y + crop_x);
r = BayerRGGBToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToARGB(src, src_width,
@ -174,8 +205,18 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
// case FOURCC_Q420:
// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
// src_width + crop_x * 2;
// r = Q420ToARGB(src, src_width * 3,
// src_uv, src_width * 3,
// crop_argb, argb_stride,
// crop_width, inv_crop_height);
// break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
@ -200,25 +241,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_width, inv_crop_height);
break;
}
case FOURCC_J420: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I422:
case FOURCC_YV16: {
const uint8* src_y = sample + src_width * crop_y + crop_x;
@ -290,7 +312,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
if (!r) {
r = ARGBRotate(crop_argb, argb_stride,
dest_argb, dest_argb_stride,
tmp_argb, tmp_argb_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);

View File

@ -12,6 +12,7 @@
#include "libyuv/convert.h"
#include "libyuv/format_conversion.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
@ -39,13 +40,12 @@ int ConvertToI420(const uint8* sample,
int aligned_src_width = (src_width + 1) & ~1;
const uint8* src;
const uint8* src_uv;
const int abs_src_height = (src_height < 0) ? -src_height : src_height;
// TODO(nisse): Why allow crop_height < 0?
const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int abs_src_height = (src_height < 0) ? -src_height : src_height;
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
format != FOURCC_NV12 && format != FOURCC_NV21 &&
format != FOURCC_YV12) || y == sample;
format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
uint8* tmp_y = y;
uint8* tmp_u = u;
uint8* tmp_v = v;
@ -53,14 +53,16 @@ int ConvertToI420(const uint8* sample,
int tmp_u_stride = u_stride;
int tmp_v_stride = v_stride;
uint8* rotate_buffer = NULL;
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (!y || !u || !v || !sample ||
src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
inv_crop_height = -inv_crop_height;
}
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
@ -171,6 +173,40 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
// TODO(fbarchard): Support cropping Bayer by odd numbers
// by adjusting fourcc.
case FOURCC_BGGR:
src = sample + (src_width * crop_y + crop_x);
r = BayerBGGRToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GBRG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGBRGToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GRBG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGRBGToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGGB:
src = sample + (src_width * crop_y + crop_x);
r = BayerRGGBToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToI420(src, src_width,
@ -182,8 +218,7 @@ int ConvertToI420(const uint8* sample,
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
@ -193,8 +228,7 @@ int ConvertToI420(const uint8* sample,
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
// Call NV12 but with u and v parameters swapped.
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
@ -211,8 +245,20 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_Q420:
src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
src_width + crop_x * 2;
r = Q420ToI420(src, src_width * 3,
src_uv, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;

View File

@ -10,12 +10,12 @@
#include "libyuv/cpu_id.h"
#if defined(_MSC_VER)
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
!defined(__native_client__) && defined(_M_X64) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
@ -36,22 +36,20 @@ extern "C" {
// For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid the additional check.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
!defined(__clang__)
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
#endif
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
(defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER)
// Visual C version uses intrinsic or inline x86 assembly.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#if defined(_MSC_VER) && !defined(__clang__)
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86)
__asm {
@ -64,17 +62,16 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
mov [edi + 8], ecx
mov [edi + 12], edx
}
#else // Visual C but not x86
#else
if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax);
} else {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
}
#endif
// GCC version uses inline x86 assembly.
#else // defined(_MSC_VER)
uint32 info_ebx, info_edx;
asm volatile (
asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
@ -92,78 +89,76 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[3] = info_edx;
#endif // defined(_MSC_VER)
}
#else // (defined(_M_IX86) || defined(_M_X64) ...
#if !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect whether the OS saves the high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
xcr0 = (uint32)(_xgetbv(_XCR_XFEATURE_ENABLED_MASK));
#elif defined(_MSC_VER) && defined(_M_IX86)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // !defined(__native_client__)
#else
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif
// For VS2010 and earlier, _asm _emit can be used:
// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
// __asm {
// xor ecx, ecx // xcr 0
// xgetbv
// mov xcr0, eax
// }
// For VS2013 and earlier 32 bit, the optimizer produces bad code for _xgetbv(0).
// https://code.google.com/p/libyuv/issues/detail?id=529
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect whether the OS saves the high parts of ymm registers.
int GetXCR0() {
uint32 xcr0 = 0u;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return xcr0;
}
#endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", on)
#endif
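For reference on the masks tested against XCR0 in this file: bit 1 is SSE (XMM) state and bit 2 is AVX (YMM) state, so (xcr0 & 6) == 6 means the OS context-switches both; the AVX-512 state bits together form 0xe0. A small sketch of the bit names (an aside, not from the source):

// XCR0 feature-state bits (Intel SDM naming):
enum {
  kXcr0Sse = 1 << 1,       // XMM state.
  kXcr0Avx = 1 << 2,       // YMM state; (xcr0 & 6) == 6 enables AVX.
  kXcr0Opmask = 1 << 5,    // AVX-512 k-registers.
  kXcr0ZmmHi256 = 1 << 6,  // AVX-512 upper halves of zmm0-15.
  kXcr0Hi16Zmm = 1 << 7    // AVX-512 zmm16-31; 0xe0 covers all three.
};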
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS
int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
// Assume Neon if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasNEON;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
fclose(f);
return kCpuHasNEON;
}
// aarch64 uses asimd for Neon.
p = strstr(cpuinfo_line, " asimd");
if (p && (p[6] == ' ' || p[6] == '\n')) {
fclose(f);
return kCpuHasNEON;
if (f) {
char cpuinfo_line[512];
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
fclose(f);
return kCpuHasNEON;
}
}
}
fclose(f);
}
fclose(f);
return 0;
}
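To make the parsing above concrete, a hypothetical Features line and call (real /proc/cpuinfo contents vary by kernel and device):

// A cpuinfo line such as:
//   Features : half thumb fastmult vfp edsp neon vfpv3 tls
// matches the " neon" search above. Usage sketch:
int caps = ArmCpuCaps("/proc/cpuinfo");
if (caps & kCpuHasNEON) {
  // Neon code paths may be selected.
}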
#if defined(__mips__) && defined(__linux__)
static int MipsCpuCaps(const char* search_string) {
const char* file_name = "/proc/cpuinfo";
char cpuinfo_line[256];
FILE* f = NULL;
if ((f = fopen(file_name, "r")) != NULL) {
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
if (strstr(cpuinfo_line, search_string) != NULL) {
fclose(f);
return kCpuHasMIPS_DSP;
}
}
fclose(f);
}
/* Did not find string in the proc file, or not Linux ELF. */
return 0;
}
#endif
// CPU detect function for SIMD instruction sets.
LIBYUV_API
int cpu_info_ = 0; // cpu_info is not initialized yet.
int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero
// value disables; zero is ignored, to make the variable easy to toggle.
@ -186,109 +181,93 @@ static LIBYUV_BOOL TestEnv(const char*) {
LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
// TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
}
cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
CpuId(7, 0, cpu_info7);
cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
#ifdef HAS_XGETBV
// AVX requires that the CPU has AVX, XSAVE and OSXSave for xgetbv.
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
((GetXCR0() & 6) == 6)) { // Test that the OS saves YMM registers.
cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
}
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSXSave
TestOsSaveYmm()) { // Saves YMM.
cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
kCpuHasAVX;
}
#endif
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info &= ~kCpuHasX86;
cpu_info_ &= ~kCpuHasX86;
}
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info &= ~kCpuHasSSE2;
cpu_info_ &= ~kCpuHasSSE2;
}
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info &= ~kCpuHasSSSE3;
cpu_info_ &= ~kCpuHasSSSE3;
}
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info &= ~kCpuHasSSE41;
cpu_info_ &= ~kCpuHasSSE41;
}
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info &= ~kCpuHasSSE42;
cpu_info_ &= ~kCpuHasSSE42;
}
if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info &= ~kCpuHasAVX;
cpu_info_ &= ~kCpuHasAVX;
}
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info &= ~kCpuHasAVX2;
cpu_info_ &= ~kCpuHasAVX2;
}
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
cpu_info &= ~kCpuHasERMS;
cpu_info_ &= ~kCpuHasERMS;
}
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info &= ~kCpuHasFMA3;
cpu_info_ &= ~kCpuHasFMA3;
}
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
cpu_info &= ~kCpuHasAVX3;
}
#endif
#if defined(__mips__) && defined(__linux__)
#elif defined(__mips__) && defined(__linux__)
// On Linux mips, parse the text file for dsp detection.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
cpu_info |= kCpuHasDSPR2;
cpu_info_ |= kCpuHasMIPS_DSPR2;
#endif
cpu_info |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_DSPR2")) {
cpu_info &= ~kCpuHasDSPR2;
cpu_info_ |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_MIPS")) {
cpu_info_ &= ~kCpuHasMIPS;
}
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested, but without it assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info = kCpuHasNEON;
// For aarch64 (arm64), the feature list in /proc/cpuinfo is incomplete,
// e.g. it has no neon flag.
// So for aarch64, Neon is hard coded on here.
#endif
#if defined(__aarch64__)
cpu_info = kCpuHasNEON;
#else
if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
cpu_info_ &= ~kCpuHasMIPS_DSP;
}
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
#elif defined(__arm__)
#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \
!defined(__native_client__)
// On Linux arm, parse the text file for neon detection.
cpu_info = ArmCpuCaps("/proc/cpuinfo");
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__) || defined(__native_client__)
// gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it.
cpu_info_ = kCpuHasNEON;
#endif
cpu_info |= kCpuHasARM;
cpu_info_ |= kCpuHasARM;
if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info &= ~kCpuHasNEON;
cpu_info_ &= ~kCpuHasNEON;
}
#endif // __arm__
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
cpu_info_ = 0;
}
cpu_info |= kCpuInitialized;
cpu_info_ = cpu_info;
return cpu_info;
return cpu_info_;
}
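A hypothetical usage sketch of the override hooks above; the environment is read once, when the flags are first initialized:

#include <stdlib.h>  // setenv (POSIX).

// Disable AVX2 for an A/B timing run; any non-zero value disables.
// Must run before the first TestCpuFlag()/InitCpuFlags() call.
setenv("LIBYUV_DISABLE_AVX2", "1", 1);
int flags = InitCpuFlags();
// Or mask features programmatically (not thread safe, per the note below):
MaskCpuFlags(flags & ~kCpuHasAVX2);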
// Note that use of this function is not thread safe.
LIBYUV_API
void MaskCpuFlags(int enable_flags) {
cpu_info_ = InitCpuFlags() & enable_flags;

View File

@ -13,20 +13,13 @@
#ifdef HAVE_JPEG
#include <assert.h>
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
!defined(TARGET_IPHONE_SIMULATOR)
// Must be included before jpeglib.
#include <setjmp.h>
#define HAVE_SETJMP
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#endif
#endif
struct FILE; // For jpeglib.h.
// C++ build requires extern C for jpeg internals.
#ifdef __cplusplus
extern "C" {
@ -56,13 +49,6 @@ const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
// Methods that are passed to jpeglib.
boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
void init_source(jpeg_decompress_struct* cinfo);
void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
void term_source(jpeg_decompress_struct* cinfo);
void ErrorHandler(jpeg_common_struct* cinfo);
MJpegDecoder::MJpegDecoder()
: has_scanline_padding_(LIBYUV_FALSE),
num_outbufs_(0),
@ -77,6 +63,9 @@ MJpegDecoder::MJpegDecoder()
decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
// Override standard exit()-based error handler.
error_mgr_->base.error_exit = &ErrorHandler;
#ifndef DEBUG_MJPEG
error_mgr_->base.output_message = &OutputHandler;
#endif
#endif
decompress_struct_->client_data = NULL;
source_mgr_->init_source = &init_source;
@ -106,7 +95,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
}
buf_.data = src;
buf_.len = static_cast<int>(src_len);
buf_.len = (int)(src_len);
buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
@ -411,12 +400,12 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
return FinishDecode();
}
void init_source(j_decompress_ptr cinfo) {
void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
fill_input_buffer(cinfo);
}
boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
assert(0 && "No more data");
// ERROR: No more data
@ -428,16 +417,17 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
return TRUE;
}
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
long num_bytes) { // NOLINT
cinfo->src->next_input_byte += num_bytes;
}
void term_source(j_decompress_ptr cinfo) {
void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
// Nothing to do.
}
#ifdef HAVE_SETJMP
void ErrorHandler(j_common_ptr cinfo) {
void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
// This is called when a jpeglib command experiences an error. Unfortunately
// jpeglib's error handling model is not very flexible, because it expects the
// error handler to not return--i.e., it wants the program to terminate. To
@ -451,12 +441,18 @@ void ErrorHandler(j_common_ptr cinfo) {
// ERROR: Error in jpeglib: buf
#endif
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
// This rewinds the call stack to the point of the corresponding setjmp()
// and causes it to return (for a second time) with value 1.
longjmp(mgr->setjmp_buffer, 1);
}
#ifndef DEBUG_MJPEG
void MJpegDecoder::OutputHandler(j_common_ptr cinfo) {
// silently eat messages
}
#endif
#endif // HAVE_SETJMP
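The handler above relies on jpeglib's standard recovery pattern: the caller arms setjmp() before each jpeg_* call, and ErrorHandler longjmp()s back rather than letting jpeglib call exit(). A generic sketch of that pattern (illustrative names, not the decoder's actual fields):

#include <setjmp.h>

// The jmp_buf lives in the error manager so the handler can find it via
// cinfo->err, as SetJmpErrorMgr does above.
// Caller side, before any jpeg_read_header()/jpeg_start_decompress():
//   if (setjmp(error_mgr->setjmp_buffer)) {
//     return LIBYUV_FALSE;  // ErrorHandler longjmp'd back: decode failed.
//   }
//   ... proceed with jpeglib calls ...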
void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
if (num_outbufs != num_outbufs_) {
@ -503,11 +499,11 @@ LIBYUV_BOOL MJpegDecoder::StartDecode() {
decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
decompress_struct_->dither_mode = JDITHER_NONE;
// Not applicable to 'raw':
decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
decompress_struct_->do_fancy_upsampling = LIBYUV_FALSE;
// Only for buffered mode:
decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
decompress_struct_->enable_2pass_quant = LIBYUV_FALSE;
// Blocky but fast:
decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
decompress_struct_->do_block_smoothing = LIBYUV_FALSE;
if (!jpeg_start_decompress(decompress_struct_)) {
// ERROR: Couldn't start JPEG decompressor";

View File

@ -10,58 +10,34 @@
#include "libyuv/mjpeg_decoder.h"
#include <string.h> // For memchr.
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Helper function to scan for EOI marker (0xff 0xd9).
static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
if (sample_size >= 2) {
const uint8* end = sample + sample_size - 1;
const uint8* it = sample;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
if (it == NULL) {
break;
}
if (it[1] == 0xd9) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
++it; // Skip over current 0xff.
}
}
// ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE;
}
// Helper function to validate the jpeg appears intact.
// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// Maximum size that ValidateJpeg will consider valid.
const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024;
if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
size_t i;
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE;
}
if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
// Look for the End Of Image (EOI) marker near the end of the buffer.
if (sample_size > kBackSearchSize) {
if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
for (i = sample_size - 2; i > 1;) {
if (sample[i] != 0xd9) {
if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image
return LIBYUV_TRUE; // Success: Valid jpeg.
}
--i;
}
// Reduce search size for forward search.
sample_size = sample_size - kBackSearchSize + 1;
--i;
}
// Step over SOI marker and scan for EOI.
return ScanEOI(sample + 2, sample_size - 2);
// ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE;
}
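A small usage sketch for ValidateJpeg: only the SOI/EOI markers and a minimum size are checked, not the full stream. Buffer contents invented for illustration:

// 64 bytes is the minimum size; SOI (0xff 0xd8) must open the buffer and
// an EOI (0xff 0xd9) must appear before the end.
uint8 jpg[64] = { 0xff, 0xd8 };  // rest zero-initialized.
jpg[62] = 0xff;
jpg[63] = 0xd9;
LIBYUV_BOOL ok = ValidateJpeg(jpg, sizeof(jpg));  // LIBYUV_TRUE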
#ifdef __cplusplus

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -27,31 +27,36 @@ extern "C" {
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
int src_stepx,
uint8* dst_ptr, int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
int src_stepx,
uint8* dst_ptr, int dst_width);
#endif
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
int src_stepx, uint8* dst_ptr, int dst_width);
int src_stepx,
uint8* dst_ptr, int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
uint8* dst, int dst_stride,
int width, int height) {
int i;
int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
IS_ALIGNED(src, 4)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
}
#endif
@ -64,7 +69,8 @@ static void ARGBTranspose(const uint8* src, int src_stride,
}
void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@ -74,7 +80,8 @@ void ARGBRotate90(const uint8* src, int src_stride,
}
void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@ -84,7 +91,8 @@ void ARGBRotate270(const uint8* src, int src_stride,
}
void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
uint8* dst, int dst_stride,
int width, int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1);
@ -94,38 +102,38 @@ void ARGBRotate180(const uint8* src, int src_stride,
void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
ARGBMirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSE2;
}
#if defined(HAS_ARGBMIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ARGBMirrorRow = ARGBMirrorRow_SSSE3;
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
#endif
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
CopyRow = CopyRow_SSE2;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@ -133,11 +141,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
@ -159,7 +162,8 @@ void ARGBRotate180(const uint8* src, int src_stride,
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, int width, int height,
uint8* dst_argb, int dst_stride_argb,
int width, int height,
enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1;

View File

@ -9,7 +9,6 @@
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
@ -19,11 +18,11 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -106,8 +105,9 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
);
}
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set noat \n"
".set push \n"
@ -303,15 +303,17 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4",
"s5", "s6", "s7"
);
}
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

View File

@ -9,7 +9,6 @@
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
@ -18,42 +17,32 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
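Before the NEON body, the contract TransposeWx8 implements, as a scalar sketch: an 8-row band is written out transposed, dst[x * dst_stride + y] = src[y * src_stride + x].

// Scalar reference for the 8-row transpose the asm below vectorizes.
static void TransposeWx8_Sketch(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
  int x, y;
  for (x = 0; x < width; ++x) {
    for (y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}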
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %5, #8 \n"
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r9, %0 \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d7}, [%0] \n"
"vld1.8 {d0}, [r9], %1 \n"
"vld1.8 {d1}, [r9], %1 \n"
"vld1.8 {d2}, [r9], %1 \n"
"vld1.8 {d3}, [r9], %1 \n"
"vld1.8 {d4}, [r9], %1 \n"
"vld1.8 {d5}, [r9], %1 \n"
"vld1.8 {d6}, [r9], %1 \n"
"vld1.8 {d7}, [r9] \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
@ -75,65 +64,48 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0] \n"
"vst1.8 {d1}, [r9], %3 \n"
"vst1.8 {d0}, [r9], %3 \n"
"vst1.8 {d3}, [r9], %3 \n"
"vst1.8 {d2}, [r9], %3 \n"
"vst1.8 {d5}, [r9], %3 \n"
"vst1.8 {d4}, [r9], %3 \n"
"vst1.8 {d7}, [r9], %3 \n"
"vst1.8 {d6}, [r9] \n"
"add %1, #8 \n" // src += 8
"add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
"subs %5, #8 \n" // w -= 8
"add %0, #8 \n" // src += 8
"add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
"subs %4, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %5, #8 \n"
"adds %4, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %5, #2 \n"
"cmp %4, #2 \n"
"blt 3f \n"
"cmp %5, #4 \n"
"cmp %4, #4 \n"
"blt 2f \n"
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[1]}, [%0] \n"
"mov r9, %0 \n"
"vld1.32 {d0[0]}, [r9], %1 \n"
"vld1.32 {d0[1]}, [r9], %1 \n"
"vld1.32 {d1[0]}, [r9], %1 \n"
"vld1.32 {d1[1]}, [r9], %1 \n"
"vld1.32 {d2[0]}, [r9], %1 \n"
"vld1.32 {d2[1]}, [r9], %1 \n"
"vld1.32 {d3[0]}, [r9], %1 \n"
"vld1.32 {d3[1]}, [r9] \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(6)
"vld1.8 {q3}, [%6] \n"
"vld1.8 {q3}, [%5] \n"
"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
@ -142,101 +114,73 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
MEMACCESS(0)
"vst1.32 {d4[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d4[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[1]}, [%0] \n"
"vst1.32 {d4[0]}, [r9], %3 \n"
"vst1.32 {d4[1]}, [r9], %3 \n"
"vst1.32 {d5[0]}, [r9], %3 \n"
"vst1.32 {d5[1]}, [r9] \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d0[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d0[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[1]}, [%0] \n"
"add r9, %2, #4 \n"
"vst1.32 {d0[0]}, [r9], %3 \n"
"vst1.32 {d0[1]}, [r9], %3 \n"
"vst1.32 {d1[0]}, [r9], %3 \n"
"vst1.32 {d1[1]}, [r9] \n"
"add %1, #4 \n" // src += 4
"add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
"subs %5, #4 \n" // w -= 4
"add %0, #4 \n" // src += 4
"add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
"subs %4, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %5, #2 \n"
"cmp %4, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.16 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[3]}, [%0] \n"
"mov r9, %0 \n"
"vld1.16 {d0[0]}, [r9], %1 \n"
"vld1.16 {d1[0]}, [r9], %1 \n"
"vld1.16 {d0[1]}, [r9], %1 \n"
"vld1.16 {d1[1]}, [r9], %1 \n"
"vld1.16 {d0[2]}, [r9], %1 \n"
"vld1.16 {d1[2]}, [r9], %1 \n"
"vld1.16 {d0[3]}, [r9], %1 \n"
"vld1.16 {d1[3]}, [r9] \n"
"vtrn.8 d0, d1 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0] \n"
"vst1.64 {d0}, [r9], %3 \n"
"vst1.64 {d1}, [r9] \n"
"add %1, #2 \n" // src += 2
"add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
"subs %5, #2 \n" // w -= 2
"add %0, #2 \n" // src += 2
"add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
"subs %4, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld1.8 {d0[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[7]}, [%1] \n"
"vld1.8 {d0[0]}, [%0], %1 \n"
"vld1.8 {d0[1]}, [%0], %1 \n"
"vld1.8 {d0[2]}, [%0], %1 \n"
"vld1.8 {d0[3]}, [%0], %1 \n"
"vld1.8 {d0[4]}, [%0], %1 \n"
"vld1.8 {d0[5]}, [%0], %1 \n"
"vld1.8 {d0[6]}, [%0], %1 \n"
"vld1.8 {d0[7]}, [%0] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
"vst1.64 {d0}, [%2] \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst), // %3
"+r"(dst_stride), // %4
"+r"(width) // %5
: "r"(&kVTbl4x4Transpose) // %6
: "memory", "cc", "q0", "q1", "q2", "q3"
: "+r"(src), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_stride), // %3
"+r"(width) // %4
: "r"(&kVTbl4x4Transpose) // %5
: "memory", "cc", "r9", "q0", "q1", "q2", "q3"
);
}
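
Both transpose kernels here share the counter idiom the comments describe: start the counter at w - 8, run full 8-wide blocks while it stays non-negative, then add 8 back so the 0-7 leftover columns fall through to the 4x8, 2x8 and 1x8 tails. A minimal C sketch of that counting scheme (block bodies elided; callers guarantee w >= 8 on entry):

void ProcessInBlocksOf8(int w) {
  w -= 8;                        // "sub %4, #8"
  do {
    /* full 8-wide block */
    w -= 8;                      // "subs %4, #8"
  } while (w >= 0);              // "bge 1b"
  w += 8;                        // "adds %4, #8": 0 means no residuals
  if (w >= 4) { /* 4-wide tail */ w -= 4; }
  if (w >= 2) { /* 2-wide tail */ w -= 2; }
  if (w >= 1) { /* 1-wide tail */ }
}
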
@ -247,33 +191,25 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %7, #8 \n"
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r9, %0 \n"
MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d2, d3}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d4, d5}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d6, d7}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d16, d17}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d18, d19}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d20, d21}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d22, d23}, [%0] \n"
"vld2.8 {d0, d1}, [r9], %1 \n"
"vld2.8 {d2, d3}, [r9], %1 \n"
"vld2.8 {d4, d5}, [r9], %1 \n"
"vld2.8 {d6, d7}, [r9], %1 \n"
"vld2.8 {d16, d17}, [r9], %1 \n"
"vld2.8 {d18, d19}, [r9], %1 \n"
"vld2.8 {d20, d21}, [r9], %1 \n"
"vld2.8 {d22, d23}, [r9] \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
@ -299,84 +235,59 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d18}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d16}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d22}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d20}, [%0] \n"
"vst1.8 {d2}, [r9], %3 \n"
"vst1.8 {d0}, [r9], %3 \n"
"vst1.8 {d6}, [r9], %3 \n"
"vst1.8 {d4}, [r9], %3 \n"
"vst1.8 {d18}, [r9], %3 \n"
"vst1.8 {d16}, [r9], %3 \n"
"vst1.8 {d22}, [r9], %3 \n"
"vst1.8 {d20}, [r9] \n"
"mov %0, %5 \n"
"mov r9, %4 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d19}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d17}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d23}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d21}, [%0] \n"
"vst1.8 {d3}, [r9], %5 \n"
"vst1.8 {d1}, [r9], %5 \n"
"vst1.8 {d7}, [r9], %5 \n"
"vst1.8 {d5}, [r9], %5 \n"
"vst1.8 {d19}, [r9], %5 \n"
"vst1.8 {d17}, [r9], %5 \n"
"vst1.8 {d23}, [r9], %5 \n"
"vst1.8 {d21}, [r9] \n"
"add %1, #8*2 \n" // src += 8*2
"add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %7, #8 \n" // w -= 8
"add %0, #8*2 \n" // src += 8*2
"add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %6, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %7, #8 \n"
"adds %6, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %7, #2 \n"
"cmp %6, #2 \n"
"blt 3f \n"
"cmp %7, #4 \n"
"cmp %6, #4 \n"
"blt 2f \n"
// TODO(frkoenig): Clean this up
//TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.64 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d7}, [%0] \n"
"mov r9, %0 \n"
"vld1.64 {d0}, [r9], %1 \n"
"vld1.64 {d1}, [r9], %1 \n"
"vld1.64 {d2}, [r9], %1 \n"
"vld1.64 {d3}, [r9], %1 \n"
"vld1.64 {d4}, [r9], %1 \n"
"vld1.64 {d5}, [r9], %1 \n"
"vld1.64 {d6}, [r9], %1 \n"
"vld1.64 {d7}, [r9] \n"
MEMACCESS(8)
"vld1.8 {q15}, [%8] \n"
"vld1.8 {q15}, [%7] \n"
"vtrn.8 q0, q1 \n"
"vtrn.8 q2, q3 \n"
@ -390,142 +301,103 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"vtbl.8 d22, {d6, d7}, d30 \n"
"vtbl.8 d23, {d6, d7}, d31 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.32 {d16[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d16[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[1]}, [%0], %4 \n"
"vst1.32 {d16[0]}, [r9], %3 \n"
"vst1.32 {d16[1]}, [r9], %3 \n"
"vst1.32 {d17[0]}, [r9], %3 \n"
"vst1.32 {d17[1]}, [r9], %3 \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d20[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d20[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[1]}, [%0] \n"
"add r9, %2, #4 \n"
"vst1.32 {d20[0]}, [r9], %3 \n"
"vst1.32 {d20[1]}, [r9], %3 \n"
"vst1.32 {d21[0]}, [r9], %3 \n"
"vst1.32 {d21[1]}, [r9] \n"
"mov %0, %5 \n"
"mov r9, %4 \n"
MEMACCESS(0)
"vst1.32 {d18[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d18[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[1]}, [%0], %6 \n"
"vst1.32 {d18[0]}, [r9], %5 \n"
"vst1.32 {d18[1]}, [r9], %5 \n"
"vst1.32 {d19[0]}, [r9], %5 \n"
"vst1.32 {d19[1]}, [r9], %5 \n"
"add %0, %5, #4 \n"
MEMACCESS(0)
"vst1.32 {d22[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d22[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[1]}, [%0] \n"
"add r9, %4, #4 \n"
"vst1.32 {d22[0]}, [r9], %5 \n"
"vst1.32 {d22[1]}, [r9], %5 \n"
"vst1.32 {d23[0]}, [r9], %5 \n"
"vst1.32 {d23[1]}, [r9] \n"
"add %1, #4*2 \n" // src += 4 * 2
"add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %7, #4 \n" // w -= 4
"add %0, #4*2 \n" // src += 4 * 2
"add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %6, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %7, #2 \n"
"cmp %6, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[3], d3[3]}, [%0] \n"
"mov r9, %0 \n"
"vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
"vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
"vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
"vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
"vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
"vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
"vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
"vld2.16 {d1[3], d3[3]}, [r9] \n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d2, d3 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d2}, [%0] \n"
"vst1.64 {d0}, [r9], %3 \n"
"vst1.64 {d2}, [r9] \n"
"mov %0, %5 \n"
"mov r9, %4 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.64 {d3}, [%0] \n"
"vst1.64 {d1}, [r9], %5 \n"
"vst1.64 {d3}, [r9] \n"
"add %1, #2*2 \n" // src += 2 * 2
"add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %7, #2 \n" // w -= 2
"add %0, #2*2 \n" // src += 2 * 2
"add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %6, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[7], d1[7]}, [%1] \n"
"vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
"vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
"vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
"vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
"vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
"vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
"vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
"vld2.8 {d0[7], d1[7]}, [%0] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
MEMACCESS(5)
"vst1.64 {d1}, [%5] \n"
"vst1.64 {d0}, [%2] \n"
"vst1.64 {d1}, [%4] \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst_a), // %3
"+r"(dst_stride_a), // %4
"+r"(dst_b), // %5
"+r"(dst_stride_b), // %6
"+r"(width) // %7
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
: "+r"(src), // %0
"+r"(src_stride), // %1
"+r"(dst_a), // %2
"+r"(dst_stride_a), // %3
"+r"(dst_b), // %4
"+r"(dst_stride_b), // %5
"+r"(width) // %6
: "r"(&kVTbl4x4TransposeDi) // %7
: "memory", "cc", "r9",
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#endif
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -16,8 +16,13 @@ extern "C" {
#endif
// The following are available on MIPS platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
#include <sgidefs.h>
#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5)
#define HAS_MIPS_PREFETCH 1
#endif
#ifdef HAS_COPYROW_MIPS
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
@ -61,23 +66,31 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// Alternatively, for x=64 the last "safe" a1 address is "t0-96"
// we will use "pref 30,128(a1)", so "t0-160" is the limit
"subu $t9, $t0, 160 \n"
#ifdef HAS_MIPS_PREFETCH
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line of src
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
#endif
// If a1 > t9, don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $loop16w \n"
"nop \n"
// otherwise, start with using pref30
#ifdef HAS_MIPS_PREFETCH
"pref 30, 64(%[dst]) \n"
#endif
"$loop16w: \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 96(%[src]) \n"
#endif
"lw $t0, 0(%[src]) \n"
"bgtz $v1, $skip_pref30_96 \n" // skip
"lw $t1, 4(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 30, 96(%[dst]) \n" // continue
#endif
"$skip_pref30_96: \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
@ -85,7 +98,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 128(%[src]) \n"
#endif
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
@ -98,7 +113,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lw $t0, 32(%[src]) \n"
"bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
"lw $t1, 36(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 30, 128(%[dst]) \n" // set dest, addr 128
#endif
"$skip_pref30_128: \n"
"lw $t2, 40(%[src]) \n"
"lw $t3, 44(%[src]) \n"
@ -106,7 +123,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lw $t5, 52(%[src]) \n"
"lw $t6, 56(%[src]) \n"
"lw $t7, 60(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 160(%[src]) \n"
#endif
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
@ -126,7 +145,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// Here we have src and dest word-aligned but less than 64 bytes to go
"chk8w: \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 0x0(%[src]) \n"
#endif
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// t8 is the remainder count past 32 bytes
"beq %[count], $t8, chk1w \n"
@ -214,10 +235,12 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"addu $t0, %[dst], %[count] \n" // t0 "past the end"
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
#ifdef HAS_MIPS_PREFETCH
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line addr 32
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
#endif
// safe, as we have at least 64 bytes ahead
// If a1 > t9, don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
@ -225,15 +248,21 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// skip "pref 30,64(a1)" for too short arrays
" nop \n"
// otherwise, start with using pref30
#ifdef HAS_MIPS_PREFETCH
"pref 30, 64(%[dst]) \n"
#endif
"$ua_loop16w: \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 96(%[src]) \n"
#endif
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"bgtz $v1, $ua_skip_pref30_96 \n"
" lwl $t1, 7(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 30, 96(%[dst]) \n"
#endif
// continue setting up the dest, addr 96
"$ua_skip_pref30_96: \n"
"lwr $t2, 8(%[src]) \n"
@ -248,7 +277,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 128(%[src]) \n"
#endif
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
@ -263,7 +294,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lwr $t1, 36(%[src]) \n"
"bgtz $v1, ua_skip_pref30_128 \n"
" lwl $t1, 39(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 30, 128(%[dst]) \n"
#endif
// continue setting up the dest, addr 128
"ua_skip_pref30_128: \n"
@ -279,7 +312,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lwl $t6, 59(%[src]) \n"
"lwr $t7, 60(%[src]) \n"
"lwl $t7, 63(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 160(%[src]) \n"
#endif
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
@ -299,7 +334,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// Here we have src and dest word-aligned but less than 64 bytes to go
"ua_chk8w: \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 0x0(%[src]) \n"
#endif
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// t8 is the remainder count
"beq %[count], $t8, $ua_chk1w \n"
@ -375,13 +412,11 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_MIPS
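
The prefetch bookkeeping above computes a "last safe" destination for the pref 30 (prepare-for-store) hint: prefetching a 32-byte line 128 bytes ahead must stop 160 bytes before the end of the buffer. A portable sketch of the same guard (__builtin_prefetch is only a stand-in; pref 30 also zero-fills the cache line, which the builtin does not):

void CopyWithPrefetchGuard(unsigned char* dst, const unsigned char* src,
                           int count) {
  unsigned char* end = dst + count;
  unsigned char* last_safe = end - 160;      // "subu $t9, $t0, 160"
  int i;
  while (dst + 64 <= end) {
    if (dst <= last_safe) {
      __builtin_prefetch(dst + 128, 1, 0);   // stand-in for "pref 30, 128(a1)"
    }
    __builtin_prefetch(src + 96, 0, 0);      // stand-in for "pref 0, 96(a0)"
    for (i = 0; i < 64; ++i) dst[i] = src[i];
    dst += 64;
    src += 64;
  }
  while (dst < end) *dst++ = *src++;         // residual bytes
}
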
// DSPR2 functions
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
(__mips_dsp_rev >= 2)
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -389,6 +424,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
@ -446,7 +482,90 @@ void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
);
}
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
uint8* dst_v, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lwr $t0, 0(%[src_uv]) \n"
"lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
"lwr $t1, 4(%[src_uv]) \n"
"lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
"lwr $t2, 8(%[src_uv]) \n"
"lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
"lwr $t3, 12(%[src_uv]) \n"
"lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
"lwr $t5, 16(%[src_uv]) \n"
"lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
"lwr $t6, 20(%[src_uv]) \n"
"lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
"lwr $t7, 24(%[src_uv]) \n"
"lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
"lwr $t8, 28(%[src_uv]) \n"
"lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
"precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
"precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
"precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
"precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
"precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
"precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
"precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
"precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
"addiu %[src_uv], %[src_uv], 32 \n"
"swr $t9, 0(%[dst_v]) \n"
"swl $t9, 3(%[dst_v]) \n"
"swr $t0, 0(%[dst_u]) \n"
"swl $t0, 3(%[dst_u]) \n"
"swr $t1, 4(%[dst_v]) \n"
"swl $t1, 7(%[dst_v]) \n"
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n"
"swr $t3, 8(%[dst_v]) \n"
"swl $t3, 11(%[dst_v]) \n"
"swr $t5, 8(%[dst_u]) \n"
"swl $t5, 11(%[dst_u]) \n"
"swr $t6, 12(%[dst_v]) \n"
"swl $t6, 15(%[dst_v]) \n"
"swr $t7, 12(%[dst_u]) \n"
"swl $t7, 15(%[dst_u]) \n"
"addiu %[dst_u], %[dst_u], 16 \n"
"bgtz $t4, 1b \n"
" addiu %[dst_v], %[dst_v], 16 \n"
"beqz %[width], 3f \n"
" nop \n"
"2: \n"
"lbu $t0, 0(%[src_uv]) \n"
"lbu $t1, 1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], 2 \n"
"addiu %[width], %[width], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[width], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[width] "+r" (width),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6", "t7", "t8", "t9"
);
}
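
Stripped of the precrq.qb.ph/precr.qb.ph packing, both split functions above compute a plain UV deinterleave 16 pixels at a time; the scalar reference (uint8 is libyuv's byte typedef) is:

void SplitUVRow_Ref(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[0];   // even bytes -> U plane
    dst_v[x] = src_uv[1];   // odd bytes  -> V plane
    src_uv += 2;
  }
}
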
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -456,6 +575,7 @@ void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
".p2align 2 \n"
"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
@ -496,10 +616,10 @@ void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
);
}
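
For reference, the operation being vectorized here is a simple byte reversal of the row:

void MirrorRow_Ref(const uint8* src, uint8* dst, int width) {
  int x;
  src += width - 1;
  for (x = 0; x < width; ++x) {
    dst[x] = *src--;        // copy the row back to front
  }
}
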
void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x;
int y;
void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x = 0;
int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -510,6 +630,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
@ -579,7 +700,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "=&r" (y)
[y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9"
@ -593,7 +714,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define YUVTORGB \
#define I422ToTransientMipsRGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
@ -652,13 +773,11 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
// TODO(fbarchard): accept yuv conversion constants.
void I422ToARGBRow_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -672,8 +791,9 @@ void I422ToARGBRow_DSPR2(const uint8* y_buf,
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
".p2align 2 \n"
"1: \n"
YUVTORGB
I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
@ -715,10 +835,136 @@ void I422ToARGBRow_DSPR2(const uint8* y_buf,
);
}
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
"precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
"precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
"precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
"precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
"precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
"or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
"or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
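
The repl.ph constants loaded above (74, -25, -52, 102, with 16 and 128 as the Y/UV offsets) are BT.601 studio-range coefficients in 6-bit fixed point. A scalar sketch of the per-pixel math; the U-to-blue coefficient is not visible in these hunks, so the 127 below is an assumption taken from the C reference of the same era:

static int Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

static void YuvToRgbRef(int y, int u, int v,
                        unsigned char* r, unsigned char* g,
                        unsigned char* b) {
  int y1 = (y - 16) * 74;   // $s0: |YG|YG| = |74|74| (1.164 * 64)
  *b = (unsigned char)Clamp255((y1 + (u - 128) * 127) >> 6);  // UB: assumed
  *g = (unsigned char)Clamp255((y1 + (u - 128) * -25          // $s1: UG
                                   + (v - 128) * -52) >> 6);  // $s2: VG
  *r = (unsigned char)Clamp255((y1 + (v - 128) * 102) >> 6);  // $s3: VR
}
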
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
"precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
"precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
"precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
"precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
"sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
"or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
"or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
"precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
"precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
"sll $t1, $t1, 16 \n"
"sll $t2, $t2, 16 \n"
"packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
"packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
// Bilinear filter 8x2 -> 8x1
void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
@ -729,6 +975,7 @@ void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
"replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n"
".p2align 2 \n"
"1: \n"
"lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n"

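The viewer cuts this hunk short, but the setup is already visible: t0 and t1 hold the two per-row weights, with y0_fraction = 256 - source_y_fraction. Per byte the kernel computes a weighted average of the two source rows; a scalar sketch (the exact rounding before the >> 8 varies between libyuv versions, so treat it as approximate):

void InterpolateRow_Ref(uint8* dst, const uint8* src, const uint8* src1,
                        int width, int source_y_fraction) {
  int y1_fraction = source_y_fraction;   // weight of the lower row
  int y0_fraction = 256 - y1_fraction;   // weight of the upper row
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8)((src[x] * y0_fraction + src1[x] * y1_fraction) >> 8);
  }
}
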
File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -53,27 +53,18 @@ static void ScaleARGBDown2(int src_width, int src_height,
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
ScaleARGBRowDown2Box_NEON);
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON;
}
#endif
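
Both sides of this hunk follow the same dispatch idiom used throughout the scalers: start from the portable C row function, upgrade to the "Any" SIMD variant when the CPU flag is present (it handles arbitrary widths), and upgrade again to the full-speed variant when the width (and, on the old side, pointer/stride alignment) allows. Condensed, with the names from the hunk above and libyuv's headers assumed in scope:

void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
                          uint8* dst_argb, int dst_width) =
    ScaleARGBRowDown2Box_C;                             // portable fallback
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
  ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;    // any width
  if (IS_ALIGNED(dst_width, 4)) {
    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;      // aligned fast path
  }
}
#endif
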
@ -97,7 +88,7 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
int x, int dx, int y, int dy) {
int j;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
@ -107,22 +98,17 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
@ -153,23 +139,16 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
#endif
@ -191,35 +170,42 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
int x, int dx, int y, int dy,
enum FilterMode filtering) {
int j;
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
int64 xlast = x + (int64)(dst_width - 1) * dx;
int64 xl = (dx >= 0) ? x : xlast;
int64 xr = (dx >= 0) ? xlast : x;
int clip_src_width;
xl = (xl >> 16) & ~3; // Left edge aligned.
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
if (xr > src_width) {
xr = src_width;
}
clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
xr = (xr >> 16) + 1; // Right most pixel used.
clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4; // Width aligned to 4.
src_argb += xl * 4;
x -= (int)(xl << 16);
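
A worked instance of the clip window computed above (new-side arithmetic, dx >= 0 assumed): a 2:1 horizontal downscale producing 100 output pixels reads 200 aligned source pixels, i.e. 800 bytes of ARGB per row.

int ClipSrcWidthExample(void) {
  int x = 0, dx = 2 << 16, dst_width = 100, src_width = 400;
  long long xlast = x + (long long)(dst_width - 1) * dx;  // 99 * (2 << 16)
  long long xl = (x >> 16) & ~3;                          // 0, 4-pixel aligned
  long long xr = (xlast >> 16) + 1;                       // 198 + 1: bilinear
  xr = (xr + 1 + 3) & ~3;                                 // 200, 4-pixel aligned
  if (xr > src_width) xr = src_width;
  return (int)(xr - xl) * 4;                              // 800 ARGB bytes
}
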
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(clip_src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@ -227,62 +213,52 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
{
align_buffer_64(row, clip_src_width * 4);
align_buffer_64(row, clip_src_width * 4);
const int max_y = (src_height - 1) << 16;
const int max_y = (src_height - 1) << 16;
for (j = 0; j < dst_height; ++j) {
if (y > max_y) {
y = max_y;
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(row, src, src_stride, clip_src_width, yf);
ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
}
dst_argb += dst_stride;
y += dy;
if (y > max_y) {
y = max_y;
}
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(row, src, src_stride, clip_src_width, yf);
ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
}
free_aligned_buffer_64(row);
dst_argb += dst_stride;
y += dy;
}
free_aligned_buffer_64(row);
}
// Scale ARGB up with bilinear interpolation.
@ -299,17 +275,30 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSSE3;
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@ -317,17 +306,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
if (src_width >= 32768) {
@ -339,86 +328,70 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
}
const int max_y = (src_height - 1) << 16;
if (y > max_y) {
y = max_y;
}
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
{
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
uint8* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
if (src_height > 1) {
src += src_stride;
}
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
if (src_height > 1) {
src += src_stride;
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
if (yi != lasty) {
if (y > max_y) {
y = max_y;
yi = y >> 16;
src = src_argb + yi * src_stride;
}
if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
src += src_stride;
}
}
if (filtering == kFilterLinear) {
InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
}
dst_argb += dst_stride;
y += dy;
}
free_aligned_buffer_64(row);
}
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
src += src_stride;
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
if (yi != lasty) {
if (y > max_y) {
y = max_y;
yi = y >> 16;
src = src_argb + yi * src_stride;
}
if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
src += src_stride;
}
}
if (filtering == kFilterLinear) {
InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
}
dst_argb += dst_stride;
y += dy;
}
free_aligned_buffer_64(row);
}
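
The rowptr/rowstride dance in the loop above is a zero-copy ping-pong between the two halves of the row buffer: stepping by rowstride moves onto the other half, and negating rowstride arms the step back, so the previously scaled row stays live for the vertical blend. A standalone demo of just the pointer movement:

#include <stdio.h>

int main(void) {
  char row[2 * 16];
  char* rowptr = row;
  int rowstride = 16;
  int j;
  for (j = 0; j < 4; ++j) {
    printf("filling half %d\n", (int)((rowptr - row) / 16));
    rowptr += rowstride;     // move onto the other half...
    rowstride = -rowstride;  // ...and arm the move back
  }
  return 0;                  // prints 0, 1, 0, 1
}
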
#ifdef YUVSCALEUP
@ -442,15 +415,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(src_width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@ -458,36 +434,50 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSSE3;
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@ -495,17 +485,17 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
@ -521,31 +511,17 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@ -563,7 +539,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
const uint8* src_row_v = src_v + uv_yi * src_stride_v;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
const int kRowSize = (dst_width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
// Allocate 1 row of ARGB for source conversion.
@ -648,19 +624,13 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBCols_NEON;
}
}
#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
@ -794,7 +764,6 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
if (!src_argb || src_width == 0 || src_height == 0 ||
!dst_argb || dst_width <= 0 || dst_height <= 0 ||
clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) {
return -1;
@ -813,7 +782,6 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
int dst_width, int dst_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -823,36 +791,6 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
free(argb_buffer);
return r;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -42,20 +42,6 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[1];
dst[1] = src_ptr[3];
dst += 2;
src_ptr += 4;
}
if (dst_width & 1) {
dst[0] = src_ptr[1];
}
}
void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
@ -71,21 +57,6 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
const uint16* s = src_ptr;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + 1) >> 1;
dst[1] = (s[2] + s[3] + 1) >> 1;
dst += 2;
s += 4;
}
if (dst_width & 1) {
dst[0] = (s[0] + s[1] + 1) >> 1;
}
}
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
@ -103,45 +74,6 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
dst_width -= 1;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
dst += 2;
s += 4;
t += 4;
}
if (dst_width & 1) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst += 1;
s += 2;
t += 2;
}
dst[0] = (s[0] + t[0] + 1) >> 1;
}
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
dst += 2;
s += 4;
t += 4;
}
if (dst_width & 1) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
}
}
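
All of the Box variants above bias the sum by half the divisor before shifting, so the average rounds to nearest instead of truncating:

static unsigned char Box2x2(int a, int b, int c, int d) {
  return (unsigned char)((a + b + c + d + 2) >> 2);  // +2 is half of 4
}
/* Box2x2(1, 2, 2, 2) == 2 (true mean 1.75); plain truncation would give 1. */
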
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
int x;
@ -156,20 +88,6 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[2];
dst[1] = src_ptr[6];
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = src_ptr[2];
}
}
void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
@ -206,42 +124,6 @@ void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[stride + 4] + src_ptr[stride + 5] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
8) >> 4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
}
}
void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
int x;
@ -255,19 +137,6 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
dst[1] = src_ptr[1];
dst[2] = src_ptr[3];
dst += 3;
src_ptr += 4;
}
}
// Filter rows 0 and 1 together, 3 : 1
void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
@ -291,28 +160,6 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 * 3 + b0 + 2) >> 2;
d[1] = (a1 * 3 + b1 + 2) >> 2;
d[2] = (a2 * 3 + b2 + 2) >> 2;
d += 3;
s += 4;
t += 4;
}
}
// Filter rows 1 and 2 together, 1 : 1
void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
@ -336,28 +183,6 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 + b0 + 1) >> 1;
d[1] = (a1 + b1 + 1) >> 1;
d[2] = (a2 + b2 + 1) >> 1;
d += 3;
s += 4;
t += 4;
}
}
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
@ -374,21 +199,6 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[0] = src_ptr[x >> 16];
x += dx;
dst_ptr[1] = src_ptr[x >> 16];
x += dx;
dst_ptr += 2;
}
if (dst_width & 1) {
dst_ptr[0] = src_ptr[x >> 16];
}
}
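// Illustrative sketch (not part of the patch): the x/dx convention used by
// all of these column scalers is 16.16 fixed point, so src_ptr[x >> 16] is
// a nearest-pixel fetch. The dx computation below is a simplification of
// what libyuv's slope setup produces.
#include <stdio.h>
int main(void) {
  unsigned char src[8] = {0, 10, 20, 30, 40, 50, 60, 70};
  int src_width = 8, dst_width = 5;
  int dx = (src_width << 16) / dst_width;  // step of 1.6 in 16.16
  int x = 0;
  int j;
  for (j = 0; j < dst_width; ++j, x += dx) {
    printf("%d ", src[x >> 16]);  // nearest source pixel
  }
  printf("\n");  // prints: 0 10 30 40 60
  return 0;
}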
// Scales a single row of pixels up by 2x using point sampling.
void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
@ -403,28 +213,9 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[1] = dst_ptr[0] = src_ptr[0];
src_ptr += 1;
dst_ptr += 2;
}
if (dst_width & 1) {
dst_ptr[0] = src_ptr[0];
}
}
// (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) (uint8)((int)(a) + \
((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
// Intel uses 7 bit math with rounding.
#define BLENDER(a, b, f) (uint8)((int)(a) + \
(((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
((int)(f) * ((int)(b) - (int)(a)) >> 16))
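// Illustrative check (not part of the patch) of the identity noted above:
// (1-f)a + fb == a + f(b-a), which is what lets BLENDER use one multiply.
// Like the ARM macro itself, this relies on arithmetic right shift for
// negative b - a.
#include <assert.h>
#include <stdio.h>
int main(void) {
  int a, b, f;
  for (a = 0; a <= 255; a += 51) {
    for (b = 0; b <= 255; b += 51) {
      for (f = 0; f < 65536; f += 4096) {
        int fast = a + ((f * (b - a) + 0x8000) >> 16);
        // Reference: round(((65536 - f) * a + f * b) / 65536).
        long long ref = ((long long)(65536 - f) * a +
                         (long long)f * b + 0x8000) >> 16;
        assert(fast == (int)ref);
      }
    }
  }
  printf("blend identity holds\n");
  return 0;
}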
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
@ -476,60 +267,6 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
}
#undef BLENDER
// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) (uint16)((int)(a) + \
((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
x += dx;
xi = x >> 16;
a = src_ptr[xi];
b = src_ptr[xi + 1];
dst_ptr[1] = BLENDER(a, b, x & 0xffff);
x += dx;
dst_ptr += 2;
}
if (dst_width & 1) {
int xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
}
}
void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x32, int dx) {
int64 x = (int64)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int64 xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
x += dx;
xi = x >> 16;
a = src_ptr[xi];
b = src_ptr[xi + 1];
dst_ptr[1] = BLENDER(a, b, x & 0xffff);
x += dx;
dst_ptr += 2;
}
if (dst_width & 1) {
int64 xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
}
}
#undef BLENDER
void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
int x;
@ -543,19 +280,6 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
int x;
assert(dst_width % 3 == 0);
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
dst[1] = src_ptr[3];
dst[2] = src_ptr[6];
dst += 3;
src_ptr += 8;
}
}
// 8x3 -> 3x1
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
@ -583,32 +307,6 @@ void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
}
}
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
src_ptr += 8;
dst_ptr += 3;
}
}
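// Illustrative check (not part of the patch): the (65536 / 9) and
// (65536 / 6) factors above are truncated reciprocals, so
// sum * (65536 / 9) >> 16 approximates sum / 9 without a divide. The
// truncation makes exact multiples of 9 undershoot by 1, e.g. nine white
// pixels average to 254 rather than 255.
#include <stdio.h>
int main(void) {
  const int kDiv9 = 65536 / 9;  // 7281, a truncated reciprocal
  int sum, undershoots = 0;
  for (sum = 0; sum <= 9 * 255; ++sum) {
    undershoots += (sum * kDiv9 >> 16) != sum / 9;
  }
  printf("%d sums undershoot by 1\n", undershoots);  // 255: multiples of 9
  return 0;
}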
// 8x2 -> 3x1
void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -630,51 +328,21 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
src_ptr += 8;
dst_ptr += 3;
}
}
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
int x;
assert(src_width > 0);
for (x = 0; x < src_width - 1; x += 2) {
dst_ptr[0] += src_ptr[0];
dst_ptr[1] += src_ptr[1];
src_ptr += 2;
dst_ptr += 2;
}
if (src_width & 1) {
dst_ptr[0] += src_ptr[0];
}
}
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
int x;
assert(src_width > 0);
for (x = 0; x < src_width - 1; x += 2) {
dst_ptr[0] += src_ptr[0];
dst_ptr[1] += src_ptr[1];
src_ptr += 2;
dst_ptr += 2;
}
if (src_width & 1) {
dst_ptr[0] += src_ptr[0];
assert(src_height > 0);
for (x = 0; x < src_width; ++x) {
const uint8* s = src_ptr + x;
unsigned int sum = 0u;
int y;
for (y = 0; y < src_height; ++y) {
sum += s[0];
s += src_stride;
}
// TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
dst_ptr[x] = sum < 65535u ? sum : 65535u;
}
}
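// Illustrative arithmetic (not part of the patch) behind the overflow TODO
// above: a column of src_height pixels at 255 sums to src_height * 255,
// which exceeds uint16 once src_height > 257, so the saturating store is
// needed; capping height at 256 would make it unnecessary.
#include <stdio.h>
int main(void) {
  int h;
  for (h = 255; h <= 260; ++h) {
    unsigned int sum = h * 255u;
    unsigned int stored = sum < 65535u ? sum : 65535u;  // same clamp
    printf("height %d: sum %u -> stored %u\n", h, sum, stored);
  }
  return 0;
}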
@ -816,7 +484,6 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
}
}
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \
@ -906,16 +573,32 @@ void ScalePlaneVertical(int src_height,
assert(dst_width > 0);
assert(dst_height > 0);
src_argb += (x >> 16) * bpp;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@ -923,20 +606,20 @@ void ScalePlaneVertical(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
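// Schematic sketch (function names here are hypothetical stand-ins, not the
// real symbols): the hunks above restore libyuv's usual dispatch ladder - a
// C fallback, then an "Any" SIMD variant once the row clears the vector
// width, then the full SIMD variant for exact multiples, with the older
// code adding a pointer/stride alignment tier on top.
#include <stdio.h>
typedef void (*RowFn)(unsigned char* dst, const unsigned char* src, int w);
static void RowFn_C(unsigned char* d, const unsigned char* s, int w) {
  (void)d; (void)s; (void)w;
}
static void RowFn_Any(unsigned char* d, const unsigned char* s, int w) {
  (void)d; (void)s; (void)w;
}
static void RowFn_Full(unsigned char* d, const unsigned char* s, int w) {
  (void)d; (void)s; (void)w;
}
static RowFn PickRow(int cpu_has_simd, int width_bytes) {
  RowFn fn = RowFn_C;                  // always-correct fallback
  if (cpu_has_simd && width_bytes >= 16) {
    fn = RowFn_Any;                    // handles ragged tails
    if (width_bytes % 16 == 0) {
      fn = RowFn_Full;                 // whole vectors only
    }
  }
  return fn;
}
int main(void) {
  printf("%d\n", PickRow(1, 48) == RowFn_Full);  // prints 1
  return 0;
}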
@ -954,80 +637,6 @@ void ScalePlaneVertical(int src_height,
y += dy;
}
}
void ScalePlaneVertical_16(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_argb, uint16* dst_argb,
int x, int y, int dy,
int wpp, enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_16_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
int j;
assert(wpp >= 1 && wpp <= 2);
assert(src_height != 0);
assert(dst_width > 0);
assert(dst_height > 0);
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(dst_width_words, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_DSPR2;
if (IS_ALIGNED(dst_width_words, 4)) {
InterpolateRow = InterpolateRow_16_DSPR2;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;
if (y > max_y) {
y = max_y;
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * src_stride,
src_stride, dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
}
}
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
@ -1044,6 +653,10 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
// If scaling to larger, switch from Box to Bilinear.
if (dst_width >= src_width || dst_height >= src_height) {
filtering = kFilterBilinear;
}
}
if (filtering == kFilterBilinear) {
if (src_height == 1) {
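// Condensed sketch (not the full function; the diff cuts it off mid-body)
// of the decisions visible in this hunk. The enum mirrors libyuv's
// FilterMode values.
#include <stdio.h>
enum FilterModeSketch { kNone, kLinear, kBilinear, kBox };
static enum FilterModeSketch Reduce(int sw, int sh, int dw, int dh,
                                    enum FilterModeSketch f) {
  if (f == kBox) {
    // Box only pays off when shrinking by more than 2x in both axes...
    if (dw * 2 >= sw && dh * 2 >= sh) f = kBilinear;
    // ...and never when enlarging in either axis (the added lines above).
    if (dw >= sw || dh >= sh) f = kBilinear;
  }
  return f;  // further reductions (1-pixel sources etc.) omitted
}
int main(void) {
  // Upscaling with Box requested degrades to Bilinear:
  printf("%d\n", Reduce(640, 360, 1280, 720, kBox) == kBilinear);  // 1
  return 0;
}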


@ -18,11 +18,10 @@ extern "C" {
// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
@ -31,6 +30,7 @@ void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -77,8 +77,8 @@ void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride;
__asm__ __volatile__ (
@ -89,6 +89,7 @@ void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bltz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -176,8 +177,8 @@ void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -186,6 +187,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -231,8 +233,8 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
@ -245,6 +247,7 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
@ -310,11 +313,12 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -356,13 +360,14 @@ void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@ -412,13 +417,14 @@ void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@ -464,12 +470,13 @@ void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -510,8 +517,8 @@ void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
@ -520,6 +527,7 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
@ -563,9 +571,9 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
@ -577,6 +585,7 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|


@ -16,8 +16,7 @@ extern "C" {
#endif
// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
// NEON downscalers with interpolation.
// Provided by Fritz Koenig
@ -26,12 +25,11 @@ extern "C" {
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -42,39 +40,15 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
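// Scalar equivalent (illustrative, not part of the patch) of the NEON loop
// above: vld2.8 de-interleaves even pixels into q0 and odd into q1, and
// only the odd register is stored - the same pick-the-odd-pixel convention
// as ScaleRowDown2_C.
#include <stdio.h>
static void ScaleRowDown2_Scalar(const unsigned char* src,
                                 unsigned char* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * 2 + 1];  // odd pixel, matching "vst1.8 {q1}"
  }
}
int main(void) {
  unsigned char src[8] = {0, 1, 2, 3, 4, 5, 6, 7}, dst[4];
  ScaleRowDown2_Scalar(src, dst, 4);
  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // 1 3 5 7
  return 0;
}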
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
"subs %2, %2, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // add adjacent
"vpaddl.u8 q1, q1 \n"
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
MEMACCESS(1)
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
@ -83,7 +57,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -98,11 +71,10 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -115,19 +87,16 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4
MEMACCESS(3)
"vld1.8 {q1}, [%3]! \n"
MEMACCESS(4)
"vld1.8 {q2}, [%4]! \n"
MEMACCESS(5)
"vld1.8 {q3}, [%5]! \n"
"vld1.8 {q1}, [r4]! \n"
"vld1.8 {q2}, [r5]! \n"
"vld1.8 {q3}, [%3]! \n"
"subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
@ -136,17 +105,13 @@ asm volatile (
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
MEMACCESS(1)
"vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_ptr1), // %3
"+r"(src_ptr2), // %4
"+r"(src_ptr3) // %5
:
: "q0", "q1", "q2", "q3", "memory", "cc"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
@ -157,12 +122,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -179,10 +143,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
@ -219,7 +182,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
@ -238,10 +200,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
@ -261,7 +222,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -290,17 +250,14 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q3}, [%3] \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d5[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -315,28 +272,22 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
asm volatile (
MEMACCESS(5)
"vld1.16 {q13}, [%5] \n"
MEMACCESS(6)
"vld1.8 {q14}, [%6] \n"
MEMACCESS(7)
"vld1.8 {q15}, [%7] \n"
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"vld1.8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
MEMACCESS(4)
"vld4.8 {d16, d17, d18, d19}, [%4]! \n"
"vld4.8 {d16, d17, d18, d19}, [r4]! \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to align the data
@ -413,20 +364,18 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride), // %3
"+r"(src_ptr1) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
: "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
@ -435,20 +384,17 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
MEMACCESS(4)
"vld1.16 {q13}, [%4] \n"
MEMACCESS(5)
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, %2, #12 \n"
@ -515,9 +461,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -530,114 +474,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp;
asm volatile (
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
"veor q2, q2, q2 \n"
"veor q3, q3, q3 \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n"
"vaddw.u8 q2, q2, d0 \n"
"subs r12, r12, #1 \n"
"bgt 2b \n"
MEMACCESS(2)
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "=&r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
:
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
);
}
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// The NEON version mimics this formula:
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
asm volatile (
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q3, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"vadd.s32 q2, q1, q3 \n"
"vshl.i32 q0, q3, #1 \n" // 8 * dx
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
LOAD2_DATA8_LANE(3)
LOAD2_DATA8_LANE(4)
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
"vmov q10, q1 \n"
"vmov q11, q2 \n"
"vuzp.16 q10, q11 \n"
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vsubl.s16 q11, d18, d16 \n"
"vsubl.s16 q12, d19, d17 \n"
"vmovl.u16 q13, d20 \n"
"vmovl.u16 q10, d21 \n"
"vmul.s32 q11, q11, q13 \n"
"vmul.s32 q12, q12, q10 \n"
"vrshrn.s32 d18, q11, #16 \n"
"vrshrn.s32 d19, q12, #16 \n"
"vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0]! \n" // store pixels
"vadd.s32 q1, q1, q0 \n"
"vadd.s32 q2, q2, q0 \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13"
);
}
#undef LOAD2_DATA8_LANE
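// Illustrative scalar restatement (not part of the patch) of why
// LOAD2_DATA8_LANE exists: the x/dx stepping yields data-dependent gather
// addresses that NEON cannot load in one instruction, so each lane pays an
// address computation plus a two-byte lane load.
#include <stdio.h>
int main(void) {
  int x = 0, dx = 3 << 14;  // 0.75 step in 16.16 fixed point
  int lane;
  for (lane = 0; lane < 8; ++lane) {
    int xi = x >> 16;       // "lsr %5, %3, #16"
    printf("lane %d: src[%d], src[%d]\n", lane, xi, xi + 1);
    x += dx;                // "add %3, %3, %4"
  }
  return 0;
}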
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
@ -658,9 +494,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
@ -669,63 +503,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
MEMACCESS(0)
"vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@ -740,16 +561,13 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS(0)
"vld2.32 {q0, q1}, [%0]! \n"
MEMACCESS(0)
"vld2.32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels
MEMACCESS(1)
"vst1.8 {q3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -760,52 +578,21 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
"vrshrn.u16 d2, q2, #1 \n"
"vrshrn.u16 d3, q3, #1 \n"
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
MEMACCESS(1)
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
@ -815,7 +602,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n"
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -833,17 +619,13 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile (
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], r12 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
@ -862,22 +644,15 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
asm volatile (
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
"vld1.8 {d1}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d3}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d5}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d7}, [%1], r12 \n"
"vaddl.u8 q0, d0, d1 \n"
"vaddl.u8 q1, d2, d3 \n"
@ -890,7 +665,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
"vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
@ -902,118 +676,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {"#dn"["#n"]}, [%6] \n"
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int tmp;
const uint8* src_tmp = src_argb;
asm volatile (
"1: \n"
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
MEMACCESS(0)
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"=&r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1"
);
}
#undef LOAD1_DATA32_LANE
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
asm volatile (
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q9, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
"vmov.i8 q3, #0x7f \n" // 0x7F
"vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q8, q1, q0 \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
LOAD2_DATA32_LANE(d0, d2, 1)
LOAD2_DATA32_LANE(d1, d3, 0)
LOAD2_DATA32_LANE(d1, d3, 1)
"vshrn.i32 d22, q8, #9 \n"
"vand.16 d22, d22, d30 \n"
"vdup.8 d24, d22[0] \n"
"vdup.8 d25, d22[2] \n"
"vdup.8 d26, d22[4] \n"
"vdup.8 d27, d22[6] \n"
"vext.8 d4, d24, d25, #4 \n"
"vext.8 d5, d26, d27, #4 \n" // f
"veor.8 q10, q2, q3 \n" // 0x7f ^ f
"vmull.u8 q11, d0, d20 \n"
"vmull.u8 q12, d1, d21 \n"
"vmull.u8 q13, d2, d4 \n"
"vmull.u8 q14, d3, d5 \n"
"vadd.i16 q11, q11, q13 \n"
"vadd.i16 q12, q12, q14 \n"
"vshrn.i16 d0, q11, #7 \n"
"vshrn.i16 d1, q12, #7 \n"
MEMACCESS(0)
"vst1.32 {d0, d1}, [%0]! \n" // store pixels
"vadd.s32 q8, q8, q9 \n"
"subs %2, %2, #4 \n" // 4 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
#undef LOAD2_DATA32_LANE
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large.


@ -25,7 +25,6 @@ struct FourCCAliasEntry {
static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
@ -34,7 +33,7 @@ static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_BA81, FOURCC_BGGR},
{FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
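// Usage sketch for the alias table above, via libyuv's public
// CanonicalFourCC() from video_common.h (illustrative, not part of the
// patch):
#include <stdio.h>
#include "libyuv/video_common.h"  // CanonicalFourCC, FOURCC
int main(void) {
  // IYUV is an alias of I420 per the table.
  uint32 fourcc = CanonicalFourCC(FOURCC('I', 'Y', 'U', 'V'));
  printf("canonical: %c%c%c%c\n",
         (char)(fourcc & 0xff), (char)((fourcc >> 8) & 0xff),
         (char)((fourcc >> 16) & 0xff), (char)((fourcc >> 24) & 0xff));
  return 0;  // prints "canonical: I420"
}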


@ -84,14 +84,6 @@ def main(_):
help='Additional arguments to --gtest_filter')
parser.add_option('', '--gtest_repeat',
help='Argument for --gtest_repeat')
parser.add_option("--gtest_shuffle", action="store_true", default=False,
help="Randomize tests' orders on every iteration.")
parser.add_option("--gtest_break_on_failure", action="store_true",
default=False,
help="Drop in to debugger on assertion failure. Also "
"useful for forcing tests to exit with a stack dump "
"on the first assertion failure when running with "
"--gtest_repeat=-1")
parser.add_option('-v', '--verbose', action='store_true', default=False,
help='Verbose output - enable debug log messages')
parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck',
@ -103,12 +95,6 @@ def main(_):
'instead of /tmp.\nThis can be useful for tool '
'developers/maintainers.\nPlease note that the <tool>'
'.logs directory will be clobbered on tool startup.'))
parser.add_option("--test-launcher-bot-mode", action="store_true",
help="run the tests with --test-launcher-bot-mode")
parser.add_option("--test-launcher-total-shards", type=int,
help="run the tests with --test-launcher-total-shards")
parser.add_option("--test-launcher-shard-index", type=int,
help="run the tests with --test-launcher-shard-index")
options, args = parser.parse_args()
if options.verbose:


@ -13,7 +13,7 @@
namespace libyuv {
TEST_F(LibYUVBaseTest, Endian) {
TEST_F(libyuvTest, Endian) {
uint16 v16 = 0x1234u;
uint8 first_byte = *reinterpret_cast<uint8*>(&v16);
#if defined(LIBYUV_LITTLE_ENDIAN)
@ -23,7 +23,7 @@ TEST_F(LibYUVBaseTest, Endian) {
#endif
}
TEST_F(LibYUVBaseTest, SizeOfTypes) {
TEST_F(libyuvTest, SizeOfTypes) {
int8 i8 = -1;
uint8 u8 = 1u;
int16 i16 = -1;
@ -50,7 +50,7 @@ TEST_F(LibYUVBaseTest, SizeOfTypes) {
EXPECT_LT(0u, u64);
}
TEST_F(LibYUVBaseTest, SizeOfConstants) {
TEST_F(libyuvTest, SizeOfConstants) {
EXPECT_EQ(8u, sizeof(INT64_C(0)));
EXPECT_EQ(8u, sizeof(UINT64_C(0)));
EXPECT_EQ(8u, sizeof(INT64_C(0x1234567887654321)));


@ -16,7 +16,7 @@
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
#include "libyuv/row.h"
namespace libyuv {
@ -31,10 +31,10 @@ static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
return hash;
}
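// Standalone restatement (illustrative) of the djb2 recurrence computed by
// the truncated ReferenceHashDjb2 above: hash = hash * 33 + byte, with the
// tests' customary seed of 5381; uint32 wraparound is intended.
#include <stdio.h>
static unsigned int HashDjb2Ref(const unsigned char* src,
                                unsigned long long count,
                                unsigned int seed) {
  unsigned int hash = seed;
  unsigned long long i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];
  }
  return hash;
}
int main(void) {
  const unsigned char msg[3] = {'f', 'o', 'x'};
  printf("%u\n", HashDjb2Ref(msg, 3, 5381));
  return 0;
}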
TEST_F(LibYUVBaseTest, Djb2_Test) {
TEST_F(libyuvTest, Djb2_Test) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest);
align_buffer_page_end(src_b, kMaxTest);
align_buffer_64(src_a, kMaxTest);
align_buffer_64(src_b, kMaxTest);
const char* fox = "The quick brown fox jumps over the lazy dog"
" and feels as if he were in the seventh heaven of typography"
@ -44,8 +44,8 @@ TEST_F(LibYUVBaseTest, Djb2_Test) {
EXPECT_EQ(kExpectedFoxHash, foxhash);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i] = (fastrand() & 0xff);
src_b[i] = (fastrand() & 0xff);
src_a[i] = (random() & 0xff);
src_b[i] = (random() & 0xff);
}
// Compare different buffers. Expect hash is different.
uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
@ -111,13 +111,13 @@ TEST_F(LibYUVBaseTest, Djb2_Test) {
h2 = HashDjb2(src_a, kMaxTest / 2, 0);
EXPECT_EQ(h1, h2);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, BenchmarkDjb2_Opt) {
TEST_F(libyuvTest, BenchmarkDjb2_Opt) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest);
align_buffer_64(src_a, kMaxTest);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i] = i;
@ -128,12 +128,12 @@ TEST_F(LibYUVBaseTest, BenchmarkDjb2_Opt) {
h1 = HashDjb2(src_a, kMaxTest, 5381);
}
EXPECT_EQ(h1, h2);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_64(src_a);
}
TEST_F(LibYUVBaseTest, BenchmarkDjb2_Unaligned) {
TEST_F(libyuvTest, BenchmarkDjb2_Unaligned) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest + 1);
align_buffer_64(src_a, kMaxTest + 1);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i + 1] = i;
}
@ -143,68 +143,13 @@ TEST_F(LibYUVBaseTest, BenchmarkDjb2_Unaligned) {
h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
}
EXPECT_EQ(h1, h2);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_64(src_a);
}
TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Opt) {
uint32 fourcc;
const int kMaxTest = benchmark_width_ * benchmark_height_ * 4;
align_buffer_page_end(src_a, kMaxTest);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i] = 255;
}
src_a[0] = 0;
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
src_a[0] = 255;
src_a[3] = 0;
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
src_a[3] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
}
EXPECT_EQ(0, fourcc);
free_aligned_buffer_page_end(src_a);
}
TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
uint32 fourcc;
const int kMaxTest = benchmark_width_ * benchmark_height_ * 4 + 1;
align_buffer_page_end(src_a, kMaxTest);
for (int i = 1; i < kMaxTest; ++i) {
src_a[i] = 255;
}
src_a[0 + 1] = 0;
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
src_a[0 + 1] = 255;
src_a[3 + 1] = 0;
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
src_a[3 + 1] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
}
EXPECT_EQ(0, fourcc);
free_aligned_buffer_page_end(src_a);
}
TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
TEST_F(libyuvTest, BenchmarkSumSquareError_Opt) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
align_buffer_64(src_a, kMaxWidth);
align_buffer_64(src_b, kMaxWidth);
memset(src_a, 0, kMaxWidth);
memset(src_b, 0, kMaxWidth);
@ -228,14 +173,14 @@ TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
EXPECT_EQ(0, h1);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, SumSquareError) {
TEST_F(libyuvTest, SumSquareError) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
align_buffer_64(src_a, kMaxWidth);
align_buffer_64(src_b, kMaxWidth);
memset(src_a, 0, kMaxWidth);
memset(src_b, 0, kMaxWidth);
@ -255,32 +200,34 @@ TEST_F(LibYUVBaseTest, SumSquareError) {
EXPECT_EQ(kMaxWidth * 3 * 3, err);
srandom(time(NULL));
for (int i = 0; i < kMaxWidth; ++i) {
src_a[i] = (fastrand() & 0xff);
src_b[i] = (fastrand() & 0xff);
src_a[i] = (random() & 0xff);
src_b[i] = (random() & 0xff);
}
MaskCpuFlags(disable_cpu_flags_);
MaskCpuFlags(0);
uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
EXPECT_EQ(c_err, opt_err);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
TEST_F(libyuvTest, BenchmarkPsnr_Opt) {
align_buffer_64(src_a, benchmark_width_ * benchmark_height_);
align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
src_a[i] = i;
src_b[i] = i;
}
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
@ -293,43 +240,18 @@ TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
EXPECT_EQ(0, 0);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, BenchmarkPsnr_Unaligned) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_ + 1);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
src_a[i + 1] = i;
src_b[i] = i;
}
MaskCpuFlags(benchmark_cpu_info_);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a + 1, benchmark_width_,
src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
EXPECT_EQ(0, 0);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
}
TEST_F(LibYUVBaseTest, Psnr) {
TEST_F(libyuvTest, Psnr) {
const int kSrcWidth = benchmark_width_;
const int kSrcHeight = benchmark_height_;
const int b = 128;
const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
const int kSrcStride = 2 * b + kSrcWidth;
align_buffer_page_end(src_a, kSrcPlaneSize);
align_buffer_page_end(src_b, kSrcPlaneSize);
align_buffer_64(src_a, kSrcPlaneSize);
align_buffer_64(src_b, kSrcPlaneSize);
memset(src_a, 0, kSrcPlaneSize);
memset(src_b, 0, kSrcPlaneSize);
@ -370,24 +292,26 @@ TEST_F(LibYUVBaseTest, Psnr) {
EXPECT_LT(err, 6.0);
}
srandom(time(NULL));
memset(src_a, 0, kSrcPlaneSize);
memset(src_b, 0, kSrcPlaneSize);
for (int i = b; i < (kSrcHeight + b); ++i) {
for (int j = b; j < (kSrcWidth + b); ++j) {
src_a[(i * kSrcStride) + j] = (fastrand() & 0xff);
src_b[(i * kSrcStride) + j] = (fastrand() & 0xff);
src_a[(i * kSrcStride) + j] = (random() & 0xff);
src_b[(i * kSrcStride) + j] = (random() & 0xff);
}
}
MaskCpuFlags(disable_cpu_flags_);
MaskCpuFlags(0);
double c_err, opt_err;
c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
@ -395,19 +319,19 @@ TEST_F(LibYUVBaseTest, Psnr) {
EXPECT_EQ(opt_err, c_err);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
TEST_F(libyuvTest, DISABLED_BenchmarkSsim_Opt) {
align_buffer_64(src_a, benchmark_width_ * benchmark_height_);
align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
src_a[i] = i;
src_b[i] = i;
}
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
@ -420,18 +344,18 @@ TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
EXPECT_EQ(0, 0); // Pass if we get this far.
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, Ssim) {
TEST_F(libyuvTest, Ssim) {
const int kSrcWidth = benchmark_width_;
const int kSrcHeight = benchmark_height_;
const int b = 128;
const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
const int kSrcStride = 2 * b + kSrcWidth;
align_buffer_page_end(src_a, kSrcPlaneSize);
align_buffer_page_end(src_b, kSrcPlaneSize);
align_buffer_64(src_a, kSrcPlaneSize);
align_buffer_64(src_b, kSrcPlaneSize);
memset(src_a, 0, kSrcPlaneSize);
memset(src_b, 0, kSrcPlaneSize);
@ -482,21 +406,22 @@ TEST_F(LibYUVBaseTest, Ssim) {
EXPECT_LT(err, 0.01);
}
srandom(time(NULL));
for (int i = b; i < (kSrcHeight + b); ++i) {
for (int j = b; j < (kSrcWidth + b); ++j) {
src_a[(i * kSrcStride) + j] = (fastrand() & 0xff);
src_b[(i * kSrcStride) + j] = (fastrand() & 0xff);
src_a[(i * kSrcStride) + j] = (random() & 0xff);
src_b[(i * kSrcStride) + j] = (random() & 0xff);
}
}
MaskCpuFlags(disable_cpu_flags_);
MaskCpuFlags(0);
double c_err, opt_err;
c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
@ -506,8 +431,8 @@ TEST_F(LibYUVBaseTest, Ssim) {
EXPECT_EQ(opt_err, c_err);
}
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
} // namespace libyuv

File diff suppressed because it is too large.


@ -18,7 +18,7 @@
namespace libyuv {
TEST_F(LibYUVBaseTest, TestCpuHas) {
TEST_F(libyuvTest, TestCpuHas) {
int cpu_flags = TestCpuFlag(-1);
printf("Cpu Flags %x\n", cpu_flags);
int has_arm = TestCpuFlag(kCpuHasARM);
@ -43,39 +43,17 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
printf("Has ERMS %x\n", has_erms);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
printf("Has FMA3 %x\n", has_fma3);
int has_avx3 = TestCpuFlag(kCpuHasAVX3);
printf("Has AVX3 %x\n", has_avx3);
int has_mips = TestCpuFlag(kCpuHasMIPS);
printf("Has MIPS %x\n", has_mips);
int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
printf("Has DSPR2 %x\n", has_dspr2);
}
TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
#if defined(__aarch64__)
printf("Arm64 build\n");
#endif
#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)
printf("Neon build enabled\n");
#endif
#if defined(__x86_64__) || defined(_M_X64)
printf("x64 build\n");
#endif
#ifdef _MSC_VER
printf("_MSC_VER %d\n", _MSC_VER);
#endif
#if !defined(LIBYUV_DISABLE_X86) && (defined(GCC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(VISUALC_HAS_AVX2))
printf("Has AVX2 1\n");
#else
printf("Has AVX2 0\n");
// If the compiler does not support AVX2, the following function is not expected:
#endif
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
printf("Has MIPS DSP %x\n", has_mips_dsp);
int has_mips_dspr2 = TestCpuFlag(kCpuHasMIPS_DSPR2);
printf("Has MIPS DSPR2 %x\n", has_mips_dspr2);
}
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
TEST_F(LibYUVBaseTest, TestCpuId) {
TEST_F(libyuvTest, TestCpuId) {
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
uint32 cpu_info[4];
@ -114,25 +92,16 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
}
#endif
static int FileExists(const char* file_name) {
FILE* f = fopen(file_name, "r");
if (!f) {
return 0;
}
fclose(f);
return 1;
}
TEST_F(LibYUVBaseTest, TestLinuxNeon) {
if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
TEST_F(libyuvTest, TestLinuxNeon) {
int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
if (testdata) {
EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt"));
} else {
printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
}
#if defined(__linux__) && defined(__ARM_NEON__)
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("/proc/cpuinfo"));
EXPECT_NE(0, ArmCpuCaps("/proc/cpuinfo"));
#endif
}


@ -10,17 +10,17 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
TEST_F(LibYUVBaseTest, TestFixedDiv) {
TEST_F(libyuvTest, TestFixedDiv) {
int num[1280];
int div[1280];
int result_opt[1280];
@ -65,6 +65,7 @@ TEST_F(LibYUVBaseTest, TestFixedDiv) {
}
EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
srandom(time(NULL));
MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {
@ -84,12 +85,13 @@ TEST_F(LibYUVBaseTest, TestFixedDiv) {
}
}
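// Illustrative restatement (not the real implementation, which uses
// platform divide tricks) of the contract the EXPECT_EQ above pins down:
// FixedDiv returns the 16.16 fixed-point quotient.
#include <stdio.h>
static int FixedDivRef(int num, int div) {
  return (int)(((long long)num << 16) / div);
}
int main(void) {
  printf("%d\n", FixedDivRef(123, 1) == 123 * 65536);  // prints 1
  printf("half = 0x%x\n", FixedDivRef(1, 2));          // 0x8000
  return 0;
}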
TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
TEST_F(libyuvTest, TestFixedDiv_Opt) {
int num[1280];
int div[1280];
int result_opt[1280];
int result_c[1280];
srandom(time(NULL));
MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {
@ -118,12 +120,13 @@ TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
}
}
TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
TEST_F(libyuvTest, TestFixedDiv1_Opt) {
int num[1280];
int div[1280];
int result_opt[1280];
int result_c[1280];
srandom(time(NULL));
MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {

File diff suppressed because it is too large.


@ -9,9 +9,11 @@
*/
#include <stdlib.h>
#include <time.h>
#include "libyuv/cpu_id.h"
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
@ -20,8 +22,6 @@ void TestRotateBpp(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info,
const int kBpp) {
if (src_width < 1) {
src_width = 1;
@ -36,38 +36,38 @@ void TestRotateBpp(int src_width, int src_height,
dst_height = 1;
}
int src_stride_argb = src_width * kBpp;
int src_argb_plane_size = src_stride_argb * abs(src_height);
align_buffer_page_end(src_argb, src_argb_plane_size);
int src_argb_plane_size = src_stride_argb * src_height;
align_buffer_64(src_argb, src_argb_plane_size);
for (int i = 0; i < src_argb_plane_size; ++i) {
src_argb[i] = fastrand() & 0xff;
src_argb[i] = random() & 0xff;
}
int dst_stride_argb = dst_width * kBpp;
int dst_argb_plane_size = dst_stride_argb * dst_height;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
align_buffer_64(dst_argb_c, dst_argb_plane_size);
align_buffer_64(dst_argb_opt, dst_argb_plane_size);
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
if (kBpp == 1) {
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
RotatePlane(src_argb, src_stride_argb,
dst_argb_c, dst_stride_argb,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
RotatePlane(src_argb, src_stride_argb,
dst_argb_opt, dst_stride_argb,
src_width, src_height, mode);
}
} else if (kBpp == 4) {
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
ARGBRotate(src_argb, src_stride_argb,
dst_argb_c, dst_stride_argb,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBRotate(src_argb, src_stride_argb,
dst_argb_opt, dst_stride_argb,
@ -80,117 +80,123 @@ void TestRotateBpp(int src_width, int src_height,
EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
}
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
free_aligned_buffer_page_end(src_argb);
free_aligned_buffer_64(dst_argb_c);
free_aligned_buffer_64(dst_argb_opt);
free_aligned_buffer_64(src_argb);
}
static void ARGBTestRotate(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
int benchmark_iterations) {
TestRotateBpp(src_width, src_height,
dst_width, dst_height,
mode, benchmark_iterations,
disable_cpu_flags, benchmark_cpu_info, 4);
mode, benchmark_iterations, 4);
}
TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) {
TEST_F(libyuvTest, ARGBRotate0) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) {
TEST_F(libyuvTest, ARGBRotate90) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) {
TEST_F(libyuvTest, ARGBRotate180) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) {
TEST_F(libyuvTest, ARGBRotate270) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
TEST_F(libyuvTest, ARGBRotate0_Odd) {
ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_);
}
TEST_F(libyuvTest, ARGBRotate90_Odd) {
ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_);
}
TEST_F(libyuvTest, ARGBRotate180_Odd) {
ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_);
}
TEST_F(libyuvTest, ARGBRotate270_Odd) {
ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_);
}
static void TestRotatePlane(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
int benchmark_iterations) {
TestRotateBpp(src_width, src_height,
dst_width, dst_height,
mode, benchmark_iterations,
disable_cpu_flags, benchmark_cpu_info, 1);
mode, benchmark_iterations, 1);
}
TEST_F(LibYUVRotateTest, RotatePlane0_Opt) {
TEST_F(libyuvTest, RotatePlane0) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, RotatePlane90_Opt) {
TEST_F(libyuvTest, RotatePlane90) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, RotatePlane180_Opt) {
TEST_F(libyuvTest, RotatePlane180) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
TEST_F(libyuvTest, RotatePlane270) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
TEST_F(libyuvTest, RotatePlane0_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
TEST_F(libyuvTest, RotatePlane90_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
TEST_F(libyuvTest, RotatePlane180_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
TEST_F(libyuvTest, RotatePlane270_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
} // namespace libyuv

View File

@@ -9,9 +9,11 @@
*/
#include <stdlib.h>
#include <time.h>
#include "libyuv/cpu_id.h"
#include "libyuv/rotate.h"
#include "libyuv/row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
@@ -19,12 +21,11 @@ namespace libyuv {
static void I420TestRotate(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
int benchmark_iterations) {
if (src_width < 1) {
src_width = 1;
}
if (src_height == 0) {
if (src_height < 1) {
src_height = 1;
}
if (dst_width < 1) {
@@ -33,23 +34,23 @@ static void I420TestRotate(int src_width, int src_height,
if (dst_height < 1) {
dst_height = 1;
}
int src_i420_y_size = src_width * Abs(src_height);
int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
int src_i420_y_size = src_width * src_height;
int src_i420_uv_size = ((src_width + 1) / 2) * ((src_height + 1) / 2);
int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
align_buffer_page_end(src_i420, src_i420_size);
align_buffer_64(src_i420, src_i420_size);
for (int i = 0; i < src_i420_size; ++i) {
src_i420[i] = fastrand() & 0xff;
src_i420[i] = random() & 0xff;
}
int dst_i420_y_size = dst_width * dst_height;
int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
align_buffer_page_end(dst_i420_c, dst_i420_size);
align_buffer_page_end(dst_i420_opt, dst_i420_size);
align_buffer_64(dst_i420_c, dst_i420_size);
align_buffer_64(dst_i420_opt, dst_i420_size);
memset(dst_i420_c, 2, dst_i420_size);
memset(dst_i420_opt, 3, dst_i420_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
I420Rotate(src_i420, src_width,
src_i420 + src_i420_y_size, (src_width + 1) / 2,
src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
@@ -59,7 +60,7 @@ static void I420TestRotate(int src_width, int src_height,
(dst_width + 1) / 2,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
I420Rotate(src_i420, src_width,
src_i420 + src_i420_y_size, (src_width + 1) / 2,
@@ -77,79 +78,67 @@ static void I420TestRotate(int src_width, int src_height,
EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
}
free_aligned_buffer_page_end(dst_i420_c);
free_aligned_buffer_page_end(dst_i420_opt);
free_aligned_buffer_page_end(src_i420);
free_aligned_buffer_64(dst_i420_c);
free_aligned_buffer_64(dst_i420_opt);
free_aligned_buffer_64(src_i420);
}
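The size arithmetic above encodes the I420 memory layout: a full-resolution Y plane followed by U and V planes subsampled 2x2, with (x + 1) / 2 rounding odd dimensions up. A small sketch of the resulting offsets, under the same rounding rules:
// I420 plane sizes and offsets within one contiguous buffer (sketch).
void I420Offsets(int width, int height,
                 int* u_offset, int* v_offset, int* total) {
  int y_size = width * height;
  int uv_size = ((width + 1) / 2) * ((height + 1) / 2);  // Quarter size.
  *u_offset = y_size;            // U plane follows Y.
  *v_offset = y_size + uv_size;  // V plane follows U.
  *total = y_size + 2 * uv_size;
}
For a 3x3 image this gives 9 Y bytes plus two 2x2 chroma planes, 17 bytes total, matching the src_i420_size computation above.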
TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
TEST_F(libyuvTest, I420Rotate0) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
TEST_F(libyuvTest, I420Rotate90) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
TEST_F(libyuvTest, I420Rotate180) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
TEST_F(libyuvTest, I420Rotate270) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
// TODO(fbarchard): Remove odd width tests.
// Odd width tests work but are disabled because they use C code, which can be
// tested by passing an odd width on the command line or via environment variable.
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
TEST_F(libyuvTest, I420Rotate0_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
TEST_F(libyuvTest, I420Rotate90_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
TEST_F(libyuvTest, I420Rotate180_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
TEST_F(libyuvTest, I420Rotate270_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
static void NV12TestRotate(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
int benchmark_iterations) {
if (src_width < 1) {
src_width = 1;
}
if (src_height == 0) { // allow negative for inversion test.
if (src_height < 1) {
src_height = 1;
}
if (dst_width < 1) {
@@ -158,24 +147,23 @@ static void NV12TestRotate(int src_width, int src_height,
if (dst_height < 1) {
dst_height = 1;
}
int src_nv12_y_size = src_width * Abs(src_height);
int src_nv12_uv_size =
((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2;
int src_nv12_y_size = src_width * src_height;
int src_nv12_uv_size = ((src_width + 1) / 2) * ((src_height + 1) / 2) * 2;
int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
align_buffer_page_end(src_nv12, src_nv12_size);
align_buffer_64(src_nv12, src_nv12_size);
for (int i = 0; i < src_nv12_size; ++i) {
src_nv12[i] = fastrand() & 0xff;
src_nv12[i] = random() & 0xff;
}
int dst_i420_y_size = dst_width * dst_height;
int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
align_buffer_page_end(dst_i420_c, dst_i420_size);
align_buffer_page_end(dst_i420_opt, dst_i420_size);
align_buffer_64(dst_i420_c, dst_i420_size);
align_buffer_64(dst_i420_opt, dst_i420_size);
memset(dst_i420_c, 2, dst_i420_size);
memset(dst_i420_opt, 3, dst_i420_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
NV12ToI420Rotate(src_nv12, src_width,
src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
dst_i420_c, dst_width,
@@ -184,7 +172,7 @@ static void NV12TestRotate(int src_width, int src_height,
(dst_width + 1) / 2,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
NV12ToI420Rotate(src_nv12, src_width,
src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
@@ -200,97 +188,57 @@ static void NV12TestRotate(int src_width, int src_height,
EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
}
free_aligned_buffer_page_end(dst_i420_c);
free_aligned_buffer_page_end(dst_i420_opt);
free_aligned_buffer_page_end(src_nv12);
free_aligned_buffer_64(dst_i420_c);
free_aligned_buffer_64(dst_i420_opt);
free_aligned_buffer_64(src_nv12);
}
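NV12 differs from I420 only in chroma storage: one interleaved UV plane rather than two planar ones, hence the factor of 2 in src_nv12_uv_size and the even UV stride (src_width + 1) & ~1 passed to NV12ToI420Rotate. A sizing sketch under those rules:
// NV12 buffer size (sketch): full Y plane plus one interleaved UV plane.
int NV12Size(int width, int height) {
  int uv_stride = (width + 1) & ~1;   // U,V byte pairs require an even stride.
  int uv_height = (height + 1) / 2;   // Chroma is vertically halved.
  return width * height + uv_stride * uv_height;
}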
TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
TEST_F(libyuvTest, NV12Rotate0) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
TEST_F(libyuvTest, NV12Rotate90) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
TEST_F(libyuvTest, NV12Rotate180) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
TEST_F(libyuvTest, NV12Rotate270) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
TEST_F(libyuvTest, NV12Rotate0_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
TEST_F(libyuvTest, NV12Rotate90_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
TEST_F(libyuvTest, NV12Rotate180_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
TEST_F(libyuvTest, NV12Rotate270_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
} // namespace libyuv

View File

@@ -11,64 +11,48 @@
#include <stdlib.h>
#include <time.h>
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale_argb.h"
#include "libyuv/video_common.h"
#include "libyuv/row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int ARGBTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
FilterMode f, int benchmark_iterations) {
const int b = 128;
int i, j;
const int b = 0; // 128 to test for padding/stride.
int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2) * 4LL;
int src_argb_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2) * 4;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
align_buffer_page_end(src_argb, src_argb_plane_size);
if (!src_argb) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
align_buffer_64(src_argb, src_argb_plane_size);
srandom(time(NULL));
MemRandomize(src_argb, src_argb_plane_size);
int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
int dst_stride_argb = (b * 2 + dst_width) * 4;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
if (!dst_argb_c || !dst_argb_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
align_buffer_64(dst_argb_c, dst_argb_plane_size);
align_buffer_64(dst_argb_opt, dst_argb_plane_size);
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
// Warm up both versions for consistent benchmarks.
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
double c_time = get_time();
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
@@ -77,7 +61,7 @@ static int ARGBTestFilter(int src_width, int src_height,
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < benchmark_iterations; ++i) {
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
@@ -106,9 +90,9 @@ static int ARGBTestFilter(int src_width, int src_height,
}
}
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
free_aligned_buffer_page_end(src_argb);
free_aligned_buffer_64(dst_argb_c);
free_aligned_buffer_64(dst_argb_opt);
free_aligned_buffer_64(src_argb);
return max_diff;
}
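The constant b above gives every buffer a guard band: the scaler is handed a pointer b rows down and b pixels in, so any read or write past the requested region lands in the band and surfaces as a mismatch. A sketch of the interior-pointer arithmetic for 4-byte ARGB pixels:
// Interior pointer into a bordered ARGB buffer (sketch).
inline uint8* InteriorARGB(uint8* buf, int stride, int b) {
  return buf + (stride * b)   // Skip b border rows.
             + (b * 4);       // Skip b border pixels (4 bytes each).
}
Note the restored code sets b = 128 while the backed-out version used b = 0, trading guard-band coverage for speed.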
@@ -146,38 +130,28 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
static int ARGBClipTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
const int b = 128;
int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
int src_argb_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2) * 4;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
align_buffer_page_end(src_argb, src_argb_plane_size);
if (!src_argb) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
align_buffer_64(src_argb, src_argb_plane_size);
memset(src_argb, 1, src_argb_plane_size);
int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
int dst_stride_argb = (b * 2 + dst_width) * 4;
srandom(time(NULL));
int i, j;
for (i = b; i < (Abs(src_height) + b); ++i) {
for (j = b; j < (Abs(src_width) + b) * 4; ++j) {
src_argb[(i * src_stride_argb) + j] = (fastrand() & 0xff);
src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
}
}
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
if (!dst_argb_c || !dst_argb_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
align_buffer_64(dst_argb_c, dst_argb_plane_size);
align_buffer_64(dst_argb_opt, dst_argb_plane_size);
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
@@ -215,81 +189,66 @@ static int ARGBClipTestFilter(int src_width, int src_height,
}
}
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
free_aligned_buffer_page_end(src_argb);
free_aligned_buffer_64(dst_argb_c);
free_aligned_buffer_64(dst_argb_opt);
free_aligned_buffer_64(src_argb);
return max_diff;
}
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \
TEST_F(libyuvTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
Abs(benchmark_width_) * hfactor, \
Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_); \
TEST_F(libyuvTest, ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
Abs(benchmark_width_) * hfactor, \
Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// Test a scale factor with 2 filters. Expect unfiltered to be exact, but
// filtering uses different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, 3)
#define TEST_FACTOR(name, hfactor, vfactor) \
TEST_FACTOR1(name, None, hfactor, vfactor, 2) \
TEST_FACTOR1(name, Linear, hfactor, vfactor, 2) \
TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 2) \
TEST_FACTOR1(name, Box, hfactor, vfactor, 2)
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
TEST_FACTOR(8, 1, 8)
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
TEST_FACTOR(2, 1 / 2, 1 / 2)
TEST_FACTOR(4, 1 / 4, 1 / 4)
TEST_FACTOR(8, 1 / 8, 1 / 8)
TEST_FACTOR(3by4, 3 / 4, 3 / 4)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX
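Concretely, with the benchmark default of 1280 and the 3/4 factor (nom = 3, denom = 4), integer division gives 1280 / 3 = 426, so SX yields a source of 426 * 4 = 1704 and DX a destination of 426 * 3 = 1278; 1278 / 1704 is exactly 3/4, so the scaler is exercised at the precise factor. The same check in code:
// Worked example of the SX/DX dimension adjustment above (sketch).
int nom = 3, denom = 4, x = 1280;
int src = (x / nom) * denom;       // SX: 426 * 4 = 1704
int dst = (abs(x) / nom) * nom;    // DX: 426 * 3 = 1278
// dst : src == 1278 : 1704 == 3 : 4 exactly.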
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \
int diff = ARGBTestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##ClipTo##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##ClipFrom##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(width, height, \
Abs(benchmark_width_), \
Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
@@ -297,166 +256,15 @@ TEST_FACTOR(3, 1, 3)
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 3) \
TEST_SCALETO1(name, width, height, Bilinear, 3)
TEST_SCALETO1(name, width, height, Bilinear, 3) \
TEST_SCALETO1(name, width, height, Box, 3)
TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 352, 288)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
TEST_SCALETO(ARGBScale, 1280, 720)
#undef TEST_SCALETO1
#undef TEST_SCALETO
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleReference2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y,
int clip_width, int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = static_cast<uint8*>(malloc(src_width * src_height * 4));
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
free(argb_buffer);
return r;
}
static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) {
int rv = v;
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
*buf++ = v;
v += dx;
if (v < 0 || v > 255) {
dx = -dx;
v += dx;
}
}
v = rv + dy;
if (v < 0 || v > 255) {
dy = -dy;
v += dy;
}
rv = v;
}
}
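FillRamp advances v by dx across each row and by dy down the rows, reflecting at 0 and 255 so every byte stays in range while neighboring pixels differ by at most 1. For example, an 8-pixel row seeded at 253 with dx = 1 produces 253 254 255 255 254 253 252 251; the boundary value repeats once because the reflection happens after the store. A usage sketch:
// One-row ramp demonstrating the reflection at 255 (sketch).
uint8 row[8];
FillRamp(row, 8, 1, 253, 1, 1);   // row = {253,254,255,255,254,253,252,251}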
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int YUVToARGBTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
int64 src_y_plane_size = Abs(src_width) * Abs(src_height);
int64 src_uv_plane_size = ((Abs(src_width) + 1) / 2) *
((Abs(src_height) + 1) / 2);
int src_stride_y = Abs(src_width);
int src_stride_uv = (Abs(src_width) + 1) / 2;
align_buffer_page_end(src_y, src_y_plane_size);
align_buffer_page_end(src_u, src_uv_plane_size);
align_buffer_page_end(src_v, src_uv_plane_size);
int64 dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
int dst_stride_argb = (dst_width) * 4;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
// Fill the YUV image with a continuous ramp, which is less sensitive to
// subsampling and filtering differences for test purposes.
FillRamp(src_y, Abs(src_width), Abs(src_height), 128, 1, 1);
FillRamp(src_u, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 3, 1, 1);
FillRamp(src_v, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 4, 1, 1);
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
YUVToARGBScaleReference2(src_y, src_stride_y,
src_u, src_stride_uv,
src_v, src_stride_uv,
libyuv::FOURCC_I420,
src_width, src_height,
dst_argb_c, dst_stride_argb,
libyuv::FOURCC_I420,
dst_width, dst_height,
0, 0, dst_width, dst_height,
f);
for (int i = 0; i < benchmark_iterations; ++i) {
YUVToARGBScaleClip(src_y, src_stride_y,
src_u, src_stride_uv,
src_v, src_stride_uv,
libyuv::FOURCC_I420,
src_width, src_height,
dst_argb_opt, dst_stride_argb,
libyuv::FOURCC_I420,
dst_width, dst_height,
0, 0, dst_width, dst_height,
f);
}
int max_diff = 0;
for (int i = 0; i < dst_height; ++i) {
for (int j = 0; j < dst_width * 4; ++j) {
int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
dst_argb_opt[(i * dst_stride_argb) + j]);
if (abs_diff > max_diff) {
printf("error %d at %d,%d c %d opt %d",
abs_diff,
j, i,
dst_argb_c[(i * dst_stride_argb) + j],
dst_argb_opt[(i * dst_stride_argb) + j]);
EXPECT_LE(abs_diff, 40);
max_diff = abs_diff;
}
}
}
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
free_aligned_buffer_page_end(src_y);
free_aligned_buffer_page_end(src_u);
free_aligned_buffer_page_end(src_v);
return max_diff;
}
TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
int diff = YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
benchmark_width_ * 3 / 2,
benchmark_height_ * 3 / 2,
libyuv::kFilterBilinear,
benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
EXPECT_LE(diff, 10);
}
TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
int diff = YUVToARGBTestFilter(benchmark_width_ * 3 / 2,
benchmark_height_ * 3 / 2,
benchmark_width_, benchmark_height_,
libyuv::kFilterBilinear,
benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
EXPECT_LE(diff, 10);
}
} // namespace libyuv

View File

@@ -15,27 +15,19 @@
#include "libyuv/scale.h"
#include "../unit_test/unit_test.h"
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int TestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
FilterMode f, int benchmark_iterations) {
int i, j;
const int b = 0; // 128 to test for padding/stride.
const int b = 128;
int src_width_uv = (Abs(src_width) + 1) >> 1;
int src_height_uv = (Abs(src_height) + 1) >> 1;
int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
int src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
int src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
int src_stride_y = b * 2 + Abs(src_width);
int src_stride_uv = b * 2 + src_width_uv;
@@ -43,10 +35,7 @@ static int TestFilter(int src_width, int src_height,
align_buffer_page_end(src_y, src_y_plane_size)
align_buffer_page_end(src_u, src_uv_plane_size)
align_buffer_page_end(src_v, src_uv_plane_size)
if (!src_y || !src_u || !src_v) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
srandom(time(NULL));
MemRandomize(src_y, src_y_plane_size);
MemRandomize(src_u, src_uv_plane_size);
MemRandomize(src_v, src_uv_plane_size);
@@ -54,8 +43,8 @@ static int TestFilter(int src_width, int src_height,
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
int64 dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
int64 dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
int dst_stride_y = b * 2 + dst_width;
int dst_stride_uv = b * 2 + dst_width_uv;
@@ -66,13 +55,9 @@ static int TestFilter(int src_width, int src_height,
align_buffer_page_end(dst_y_opt, dst_y_plane_size)
align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
if (!dst_y_c || !dst_u_c || !dst_v_c ||
!dst_y_opt || !dst_u_opt || !dst_v_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
double c_time = get_time();
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
@@ -84,7 +69,7 @@ static int TestFilter(int src_width, int src_height,
dst_width, dst_height, f);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < benchmark_iterations; ++i) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
@@ -147,223 +132,54 @@ static int TestFilter(int src_width, int src_height,
return max_diff;
}
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
static int TestFilter_16(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
int i, j;
const int b = 0; // 128 to test for padding/stride.
int src_width_uv = (Abs(src_width) + 1) >> 1;
int src_height_uv = (Abs(src_height) + 1) >> 1;
int64 src_y_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2);
int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
int src_stride_y = b * 2 + Abs(src_width);
int src_stride_uv = b * 2 + src_width_uv;
align_buffer_page_end(src_y, src_y_plane_size)
align_buffer_page_end(src_u, src_uv_plane_size)
align_buffer_page_end(src_v, src_uv_plane_size)
align_buffer_page_end(src_y_16, src_y_plane_size * 2)
align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16);
uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16);
MemRandomize(src_y, src_y_plane_size);
MemRandomize(src_u, src_uv_plane_size);
MemRandomize(src_v, src_uv_plane_size);
for (i = b; i < src_height + b; ++i) {
for (j = b; j < src_width + b; ++j) {
p_src_y_16[(i * src_stride_y) + j] = src_y[(i * src_stride_y) + j];
}
}
for (i = b; i < (src_height_uv + b); ++i) {
for (j = b; j < (src_width_uv + b); ++j) {
p_src_u_16[(i * src_stride_uv) + j] = src_u[(i * src_stride_uv) + j];
p_src_v_16[(i * src_stride_uv) + j] = src_v[(i * src_stride_uv) + j];
}
}
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
int dst_stride_y = b * 2 + dst_width;
int dst_stride_uv = b * 2 + dst_width_uv;
align_buffer_page_end(dst_y_8, dst_y_plane_size)
align_buffer_page_end(dst_u_8, dst_uv_plane_size)
align_buffer_page_end(dst_v_8, dst_uv_plane_size)
align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16);
uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16);
uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16);
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
for (i = 0; i < benchmark_iterations; ++i) {
I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y,
p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv,
p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv,
p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
}
// Expect an exact match
int max_diff = 0;
for (i = b; i < (dst_height + b); ++i) {
for (j = b; j < (dst_width + b); ++j) {
int abs_diff = Abs(dst_y_8[(i * dst_stride_y) + j] -
p_dst_y_16[(i * dst_stride_y) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
}
for (i = b; i < (dst_height_uv + b); ++i) {
for (j = b; j < (dst_width_uv + b); ++j) {
int abs_diff = Abs(dst_u_8[(i * dst_stride_uv) + j] -
p_dst_u_16[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
abs_diff = Abs(dst_v_8[(i * dst_stride_uv) + j] -
p_dst_v_16[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
}
free_aligned_buffer_page_end(dst_y_8)
free_aligned_buffer_page_end(dst_u_8)
free_aligned_buffer_page_end(dst_v_8)
free_aligned_buffer_page_end(dst_y_16)
free_aligned_buffer_page_end(dst_u_16)
free_aligned_buffer_page_end(dst_v_16)
free_aligned_buffer_page_end(src_y)
free_aligned_buffer_page_end(src_u)
free_aligned_buffer_page_end(src_v)
free_aligned_buffer_page_end(src_y_16)
free_aligned_buffer_page_end(src_u_16)
free_aligned_buffer_page_end(src_v_16)
return max_diff;
}
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
// The factor of 2 accounts for chroma subsampling.
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
int diff = TestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \
int diff = TestFilter_16(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_); \
#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \
TEST_F(libyuvTest, ScaleDownBy##name##_##filter) { \
int diff = TestFilter(benchmark_width_, benchmark_height_, \
Abs(benchmark_width_) * hfactor, \
Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering uses different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom, boxdiff) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, boxdiff)
#define TEST_FACTOR(name, hfactor, vfactor) \
TEST_FACTOR1(name, None, hfactor, vfactor, 0) \
TEST_FACTOR1(name, Linear, hfactor, vfactor, 3) \
TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 3) \
TEST_FACTOR1(name, Box, hfactor, vfactor, 3) \
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
TEST_FACTOR(8, 1, 8, 0)
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
TEST_FACTOR(2, 1 / 2, 1 / 2)
TEST_FACTOR(4, 1 / 4, 1 / 4)
TEST_FACTOR(8, 1 / 8, 1 / 8)
TEST_FACTOR(3by4, 3 / 4, 3 / 4)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX
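The SX/DX pair above adds a halving-and-doubling step the ARGB version lacks: the working quotient is halved with rounding up and then doubled, so both adjusted dimensions come out even and the 2x2-subsampled chroma planes divide cleanly. For nom = 3, denom = 4, x = 1280: 1280 / 3 = 426, (426 + 1) / 2 = 213, giving src = 213 * 4 * 2 = 1704 and dst = 213 * 3 * 2 = 1278, again exactly 3/4 with even dimensions. In code:
// Worked example of the chroma-aware adjustment (sketch).
int nom = 3, denom = 4, x = 1280;
int m = (x / nom + 1) / 2;     // 426 -> 213
int src = m * denom * 2;       // SX: 1704 (even)
int dst = m * nom * 2;         // DX: 1278 (even)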
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \
int diff = TestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \
int diff = TestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
DISABLED_##name##To##width##x##height##_##filter##_16) { \
int diff = TestFilter_16(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
DISABLED_##name##From##width##x##height##_##filter##_16) { \
int diff = TestFilter_16(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 0) \
TEST_SCALETO1(name, width, height, Bilinear, 0) \
TEST_SCALETO1(name, width, height, Box, 0)
TEST_SCALETO1(name, width, height, Linear, 3) \
TEST_SCALETO1(name, width, height, Bilinear, 3) \
TEST_SCALETO1(name, width, height, Box, 3)
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 352, 288)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
TEST_SCALETO(Scale, 1280, 720)
#undef TEST_SCALETO1

View File

@@ -14,343 +14,42 @@
#include <cstring>
#include "gflags/gflags.h"
// Change this to 1000 for benchmarking.
// TODO(fbarchard): Add command line parsing to pass this as option.
#define BENCHMARK_ITERATIONS 1
unsigned int fastrand_seed = 0xfb;
DEFINE_int32(libyuv_width, 0, "width of test image.");
DEFINE_int32(libyuv_height, 0, "height of test image.");
DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
DEFINE_int32(libyuv_flags, 0,
"cpu flags for reference code. 1 = C, -1 = SIMD");
DEFINE_int32(libyuv_cpu_info, 0,
"cpu flags for benchmark code. 1 = C, -1 = SIMD");
// For quicker unittests, default is 128 x 72. But when benchmarking,
// default to 720p. Allow the size to be specified.
// Set flags to -1 for benchmarking to avoid slower C code.
LibYUVConvertTest::LibYUVConvertTest() :
libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
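The two trailing members precompute benchmark work counters: total pixels across all iterations divided by 256 and by 1280, with the + 255.0 and + 1279.0 terms supplying ceiling rounding. At, say, 1280 x 720 over 1000 iterations that is 921,600,000 pixels, giving 3,600,000 and 720,000 respectively. The same ceiling division in integer form, assuming 64-bit intermediates:
// Ceiling division behind benchmark_pixels_div256_/div1280_ (sketch).
int64 CeilDiv(int64 pixels, int64 unit) {
  return (pixels + unit - 1) / unit;
}
// CeilDiv(1280LL * 720 * 1000, 256)  == 3600000
// CeilDiv(1280LL * 720 * 1000, 1280) == 720000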
LibYUVColorTest::LibYUVColorTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
LibYUVScaleTest::LibYUVScaleTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
LibYUVRotateTest::LibYUVRotateTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
LibYUVPlanarTest::LibYUVPlanarTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
LibYUVBaseTest::LibYUVBaseTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
benchmark_height_(72) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
// For quicker unittests, default is 128 x 72. But when benchmarking,
// default to 720p. Allow the size to be specified.
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
// AllowCommandLineReparsing allows us to ignore flags passed on to us by
// Chromium build bots without having to explicitly disable them.
google::AllowCommandLineReparsing();
google::ParseCommandLineFlags(&argc, &argv, true);
return RUN_ALL_TESTS();
}

View File

@@ -22,54 +22,15 @@
#include "libyuv/basic_types.h"
#ifndef SIMD_ALIGNED
#if defined(_MSC_VER) && !defined(__CLR_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#elif defined(__GNUC__) && !defined(__pnacl__)
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#else
#define SIMD_ALIGNED(var) var
#endif
#endif
static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
#define OFFBY 0
// Scaling uses 16.16 fixed point to step through the source image, so a
// maximum size of 32767.999 can be expressed. 32768 is valid because
// the step is 1 beyond the image but not used.
// Destination size is mainly constrained by a valid scale step, not the
// absolute size, so it may be possible to relax the destination size
// constraint.
// Source size is unconstrained for most specialized scalers, e.g. an
// image of 65536 scaled to half size would be valid. The test
// could be relaxed for special scale factors.
// If this test is removed, the scaling function should gracefully
// fail with a return code. The test could be changed to verify that
// libyuv failed in a controlled way.
static const int kMaxWidth = 32768;
static const int kMaxHeight = 32768;
static inline bool SizeValid(int src_width, int src_height,
int dst_width, int dst_height) {
if (src_width > kMaxWidth || src_height > kMaxHeight ||
dst_width > kMaxWidth || dst_height > kMaxHeight) {
printf("Warning - size too large to test. Skipping\n");
return false;
}
return true;
}
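The 32768 bound follows from the stepping scheme the comment describes: scalers walk the source with a 16.16 fixed-point coordinate, so only source positions below 32768 fit in the 16 integer bits. A nearest-neighbor row sketch of that stepping:
// 16.16 fixed-point source stepping, nearest-neighbor (sketch).
void ScaleRowNearest(const uint8* src, uint8* dst,
                     int src_width, int dst_width) {
  int dx = (src_width << 16) / dst_width;  // Source step per dest pixel.
  int x = dx >> 1;                         // Start half a step in.
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];                 // x >> 16 must stay below 32768.
    x += dx;
  }
}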
#define align_buffer_page_end(var, size) \
uint8* var; \
uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \
(size)) & ~63);
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095)); \
var = var##_mem + (-(size) & 4095);
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \
@@ -82,6 +43,9 @@ static inline double get_time() {
QueryPerformanceFrequency(&f);
return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart);
}
#define random rand
#define srandom srand
#else
static inline double get_time() {
struct timeval t;
@@ -91,109 +55,29 @@ static inline double get_time() {
}
#endif
#ifndef SIMD_ALIGNED
#if defined(_MSC_VER) && !defined(__CLR_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#elif defined(__GNUC__) && !defined(__pnacl__)
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#else
#define SIMD_ALIGNED(var) var
#endif
#endif
extern unsigned int fastrand_seed;
inline int fastrand() {
fastrand_seed = fastrand_seed * 214013u + 2531011u;
return static_cast<int>((fastrand_seed >> 16) & 0xffff);
}
static inline void MemRandomize(uint8* dst, int64 len) {
int64 i;
static inline void MemRandomize(uint8* dst, int len) {
int i;
for (i = 0; i < len - 1; i += 2) {
*reinterpret_cast<uint16*>(dst) = fastrand();
*reinterpret_cast<uint16*>(dst) = random();
dst += 2;
}
for (; i < len; ++i) {
*dst++ = fastrand();
*dst++ = random();
}
}
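The fastrand helper above is the MSVC-style LCG (seed * 214013 + 2531011, top 16 bits returned) with a fixed startup seed of 0xfb, so MemRandomize fills buffers identically on every run and platform, while the restored path uses random() seeded from time(NULL) (see the srandom calls earlier), so patterns vary per run. Since each call yields 16 bits, MemRandomize stores two bytes at a time and mops up a trailing odd byte. A usage sketch against the fastrand variant:
// Deterministic fill with the fastrand variant (sketch).
uint8 buf[17];                 // Odd length exercises the tail loop.
fastrand_seed = 0xfb;          // Reset to the fixed default seed.
MemRandomize(buf, sizeof(buf));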
class LibYUVColorTest : public ::testing::Test {
class libyuvTest : public ::testing::Test {
protected:
LibYUVColorTest();
libyuvTest();
const int rotate_max_w_;
const int rotate_max_h_;
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVConvertTest : public ::testing::Test {
protected:
LibYUVConvertTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVScaleTest : public ::testing::Test {
protected:
LibYUVScaleTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVRotateTest : public ::testing::Test {
protected:
LibYUVRotateTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVPlanarTest : public ::testing::Test {
protected:
LibYUVPlanarTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVBaseTest : public ::testing::Test {
protected:
LibYUVBaseTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
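All of the fixtures above expose the same benchmark knobs; a minimal sketch of how a test body consumes them (the test name and the work done are hypothetical):

  TEST_F(LibYUVBaseTest, SketchBenchmark) {
    for (int i = 0; i < benchmark_iterations_; ++i) {
      // Run the function under test on a benchmark_width_ x benchmark_height_ frame.
    }
  }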
#endif // UNIT_TEST_UNIT_TEST_H_ NOLINT

View File

@@ -41,9 +41,8 @@ static bool TestValidFourCC(uint32 fourcc, int bpp) {
return true;
}
TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
TEST_F(libyuvTest, TestCanonicalFourCC) {
EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_IYUV));
EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_YU12));
EXPECT_EQ(FOURCC_I422, CanonicalFourCC(FOURCC_YU16));
EXPECT_EQ(FOURCC_I444, CanonicalFourCC(FOURCC_YU24));
EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUYV));
@@ -52,6 +51,7 @@ TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_2VUY));
EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_JPEG));
EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_DMB1));
EXPECT_EQ(FOURCC_BGGR, CanonicalFourCC(FOURCC_BA81));
EXPECT_EQ(FOURCC_RAW, CanonicalFourCC(FOURCC_RGB3));
EXPECT_EQ(FOURCC_24BG, CanonicalFourCC(FOURCC_BGR3));
EXPECT_EQ(FOURCC_BGRA, CanonicalFourCC(FOURCC_CM32));
@@ -61,7 +61,7 @@ TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_5551));
}
TEST_F(LibYUVBaseTest, TestFourCC) {
TEST_F(libyuvTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
@@ -73,7 +73,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));
EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
@@ -83,6 +83,10 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGGB, FOURCC_BPP_RGGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGGR, FOURCC_BPP_BGGR));
EXPECT_TRUE(TestValidFourCC(FOURCC_GRBG, FOURCC_BPP_GRBG));
EXPECT_TRUE(TestValidFourCC(FOURCC_GBRG, FOURCC_BPP_GBRG));
EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));

View File

@@ -1,6 +1,6 @@
psnr: psnr.cc ssim.cc psnr_main.cc
ifeq ($(CXX),icl)
$(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
else
$(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
endif

View File

@@ -66,8 +66,10 @@ int main(int argc, const char* argv[]) {
printf("Has NEON %x\n", has_neon);
}
if (has_mips) {
int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
printf("Has DSPR2 %x\n", has_dspr2);
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
int has_mips_dspr2 = TestCpuFlag(kCpuHasMIPS_DSPR2);
printf("Has MIPS DSP %x\n", has_mips_dsp);
printf("Has MIPS DSPR2 %x\n", has_mips_dspr2);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@@ -76,7 +78,6 @@ int main(int argc, const char* argv[]) {
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
int has_avx = TestCpuFlag(kCpuHasAVX);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_avx3 = TestCpuFlag(kCpuHasAVX3);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
printf("Has SSE2 %x\n", has_sse2);
@@ -85,7 +86,6 @@ int main(int argc, const char* argv[]) {
printf("Has SSE4.2 %x\n", has_sse42);
printf("Has AVX %x\n", has_avx);
printf("Has AVX2 %x\n", has_avx2);
printf("Has AVX3 %x\n", has_avx3);
printf("Has ERMS %x\n", has_erms);
printf("Has FMA3 %x\n", has_fma3);
}
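These flags are typically consumed as runtime dispatch gates; a minimal sketch:

  if (TestCpuFlag(kCpuHasSSE2)) {
    // Take the SSE2-optimized code path.
  } else {
    // Fall back to the portable C path.
  }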

View File

@@ -10,6 +10,8 @@
#include "./psnr.h" // NOLINT
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
@@ -32,22 +34,26 @@ typedef unsigned long long uint64;  // NOLINT
#endif // __LP64__
#endif // _MSC_VER
// libyuv provides this function when the library is linked with jpeg support.
#if !defined(HAVE_JPEG)
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
double ComputePSNR(double sse, double size) {
const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.);
if (sse <= kMINSSE)
sse = kMINSSE; // Produces max PSNR of 128
return 10.0 * log10(65025.0 * size / sse);
}
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
asm volatile ( // NOLINT
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"1: \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
@@ -73,42 +79,6 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked)
@@ -206,8 +176,7 @@ static __inline void __cpuid(int cpu_info[4], int info_type) {
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
#elif defined(__i386__) || defined(__x86_64__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile ( // NOLINT
"cpuid \n"
@@ -272,16 +241,6 @@ double ComputeSumSquareError(const uint8* src_a,
}
return static_cast<double>(sse);
}
#endif
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size) {
const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
if (sse <= kMINSSE)
sse = kMINSSE; // Produces max PSNR of 128
return 10.0 * log10(255.0 * 255.0 * size / sse);
}
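A worked sketch combining the two entry points (buffer contents assumed): if org and rec are 100 bytes each and differ by 1 everywhere, sse = 100 and psnr = 10 * log10(65025 * 100 / 100) = 10 * log10(65025), roughly 48.13 dB; an sse of 0 clamps to kMINSSE and yields kMaxPSNR = 128.

  double sse = ComputeSumSquareError(org, rec, 100);
  double psnr = ComputePSNR(sse, 100.0);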
#ifdef __cplusplus
} // extern "C"

View File

@@ -13,8 +13,6 @@
#ifndef UTIL_PSNR_H_ // NOLINT
#define UTIL_PSNR_H_
#include <math.h> // For log10()
#ifdef __cplusplus
extern "C" {
#endif
@@ -26,17 +24,13 @@ typedef unsigned char uint8;
static const double kMaxPSNR = 128.0;
// libyuv provides this function when the library is linked with jpeg support.
// TODO(fbarchard): make psnr lib compatible subset of libyuv.
#if !defined(HAVE_JPEG)
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse).
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size);
// Compute the Sum of Squared Error (SSE).
// Pass the result to ComputePSNR for the final value.
double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
#endif
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size);
#ifdef __cplusplus
} // extern "C"

View File

@@ -32,10 +32,6 @@
#include "./psnr.h"
#include "./ssim.h"
#ifdef HAVE_JPEG
#include "libyuv/compare.h"
#include "libyuv/convert.h"
#endif
struct metric {
double y, u, v, all;
@@ -79,29 +75,6 @@ bool ExtractResolutionFromFilename(const char* name,
}
}
}
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
FILE* const file_org = fopen(name, "rb");
if (file_org == NULL) {
fprintf(stderr, "Cannot open %s\n", name);
return false;
}
fseek(file_org, 0, SEEK_END);
size_t total_size = ftell(file_org);
fseek(file_org, 0, SEEK_SET);
uint8* const ch_org = new uint8[total_size];
memset(ch_org, 0, total_size);
size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
fclose(file_org);
if (bytes_org == total_size) {
if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) {
delete[] ch_org;
return true;
}
}
delete[] ch_org;
#endif // HAVE_JPEG
return false;
}
@@ -121,9 +94,6 @@ double GetMSE(double sse, double size) {
void PrintHelp(const char * program) {
printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
#ifdef HAVE_JPEG
printf("jpeg or raw YUV 420 supported.\n");
#endif
printf("options:\n");
printf(" -s <width> <height> .... specify YUV size, mandatory if none of the "
"sequences have the\n");
@@ -245,18 +215,9 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset);
const uint8* const v_rec = ch_rec + y_size + uv_size;
if (do_psnr) {
#ifdef HAVE_JPEG
double y_err = static_cast<double>(
libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
double u_err = static_cast<double>(
libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
double v_err = static_cast<double>(
libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
#else
double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
double v_err = ComputeSumSquareError(v_org, v_rec, uv_size);
#endif
const double total_err = y_err + u_err + v_err;
cur_distortion_psnr->global_y += y_err;
cur_distortion_psnr->global_u += u_err;
@@ -269,10 +230,10 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
static_cast<double>(total_size));
} else {
distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2,
(image_height + 1) / 2);
distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2,
(image_height + 1) / 2);
distorted_frame->u = CalcSSIM(u_org, u_rec, image_width / 2,
image_height / 2);
distorted_frame->v = CalcSSIM(v_org, v_rec, image_width / 2,
image_height / 2);
distorted_frame->all =
(distorted_frame->y + distorted_frame->u + distorted_frame->v)
/ total_size;
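The chroma rounding difference above only matters for odd dimensions; a worked one-liner:

  int w = 7;
  int chroma_w = (w + 1) / 2;  // == 4, keeps the half-sampled last column; w / 2 == 3 drops it.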
@@ -425,62 +386,14 @@ int main(int argc, const char* argv[]) {
break;
size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
if (bytes_org < total_size) {
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
uint8* const ch_jpeg = new uint8[bytes_org];
memcpy(ch_jpeg, ch_org, bytes_org);
memset(ch_org, 0, total_size);
if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org,
ch_org,
image_width,
ch_org + y_size,
(image_width + 1) / 2,
ch_org + y_size + uv_size,
(image_width + 1) / 2,
image_width,
image_height,
image_width,
image_height)) {
delete[] ch_jpeg;
break;
}
delete[] ch_jpeg;
#else
if (bytes_org < total_size)
break;
#endif // HAVE_JPEG
}
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
size_t bytes_rec = fread(ch_rec, sizeof(uint8),
total_size, file_rec[cur_rec]);
if (bytes_rec < total_size) {
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
uint8* const ch_jpeg = new uint8[bytes_rec];
memcpy(ch_jpeg, ch_rec, bytes_rec);
memset(ch_rec, 0, total_size);
if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec,
ch_rec,
image_width,
ch_rec + y_size,
(image_width + 1) / 2,
ch_rec + y_size + uv_size,
(image_width + 1) / 2,
image_width,
image_height,
image_width,
image_height)) {
delete[] ch_jpeg;
break;
}
delete[] ch_jpeg;
#else
if (bytes_rec < total_size)
break;
#endif // HAVE_JPEG
}
if (verbose) {
printf("%5d", number_of_frames);

View File

@@ -10,6 +10,7 @@
#include "../util/ssim.h" // NOLINT
#include <math.h>
#include <string.h>
#ifdef __cplusplus

View File

@@ -13,8 +13,6 @@
#ifndef UTIL_SSIM_H_ // NOLINT
#define UTIL_SSIM_H_
#include <math.h> // For log10()
#ifdef __cplusplus
extern "C" {
#endif
@@ -27,6 +25,7 @@ typedef unsigned char uint8;
double CalcSSIM(const uint8* org, const uint8* rec,
const int image_width, const int image_height);
// Returns -10.0 * log10(1.0 - ssim).
double CalcLSSIM(double ssim);
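A worked usage sketch (input value assumed): an SSIM of 0.99 maps to -10.0 * log10(1.0 - 0.99) = -10.0 * log10(0.01) = 20.0.

  double lssim = CalcLSSIM(0.99);  // == 20.0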
#ifdef __cplusplus

View File

@@ -1,5 +1,4 @@
# This is a generic makefile for libyuv for Windows ARM.
# call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
# nmake /f winarm.mk
# make -f winarm.mk
# nmake /f winarm.mk clean
@@ -20,15 +19,13 @@ LOCAL_OBJ_FILES = \
source/convert_to_argb.o\
source/convert_to_i420.o\
source/cpu_id.o\
source/format_conversion.o\
source/planar_functions.o\
source/rotate.o\
source/rotate_any.o\
source/rotate_argb.o\
source/rotate_common.o\
source/row_any.o\
source/row_common.o\
source/scale.o\
source/scale_any.o\
source/scale_argb.o\
source/scale_common.o\
source/video_common.o