Backed out changeset 65ee637b7e20 (bug 1284803)

Iris Hsiao 2016-07-26 10:22:51 +08:00
parent 601f1a35d5
commit 2841a7655a
79 changed files with 11873 additions and 16060 deletions

View File

@@ -8,8 +8,7 @@ LOCAL_CPP_EXTENSION := .cc
LOCAL_SRC_FILES := \
source/compare.cc \
source/compare_common.cc \
source/compare_neon64.cc \
source/compare_gcc.cc \
source/compare_posix.cc \
source/convert.cc \
source/convert_argb.cc \
source/convert_from.cc \
@@ -17,26 +16,20 @@ LOCAL_SRC_FILES := \
source/convert_to_argb.cc \
source/convert_to_i420.cc \
source/cpu_id.cc \
source/format_conversion.cc \
source/planar_functions.cc \
source/rotate.cc \
source/rotate_any.cc \
source/rotate_argb.cc \
source/rotate_common.cc \
source/rotate_mips.cc \
source/rotate_neon64.cc \
source/rotate_gcc.cc \
source/row_any.cc \
source/row_common.cc \
source/row_mips.cc \
source/row_neon64.cc \
source/row_gcc.cc \
source/row_posix.cc \
source/scale.cc \
source/scale_any.cc \
source/scale_argb.cc \
source/scale_common.cc \
source/scale_mips.cc \
source/scale_neon64.cc \
source/scale_gcc.cc \
source/scale_posix.cc \
source/video_common.cc
# TODO(fbarchard): Enable mjpeg encoder.

View File

@@ -1,42 +1,130 @@
use_relative_paths = True
vars = {
"libyuv_trunk" : "https://libyuv.googlecode.com/svn/trunk",
# Override root_dir in your .gclient's custom_vars to specify a custom root
# folder name.
'root_dir': 'libyuv',
'extra_gyp_flag': '-Dextra_gyp_flag=0',
'chromium_git': 'https://chromium.googlesource.com',
"root_dir": "trunk",
"extra_gyp_flag": "-Dextra_gyp_flag=0",
# Roll the Chromium Git hash to pick up newer versions of all the
# dependencies and tools linked to in setup_links.py.
'chromium_revision': '2a818f54130d8c93f81490adce5a1e87307bf5f0',
# Use this googlecode_url variable only if there is an internal mirror for it.
# If you do not know, use the full path while defining your new deps entry.
"googlecode_url": "http://%s.googlecode.com/svn",
"chromium_trunk" : "http://src.chromium.org/svn/trunk",
# chrome://version/ for revision of canary Chrome.
"chromium_revision": "232627",
}
# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
# https; the latter can cause problems for users behind proxies.
deps = {
Var('root_dir') + '/third_party/gflags/src':
Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca',
"../chromium_deps":
File(Var("chromium_trunk") + "/src/DEPS@" + Var("chromium_revision")),
"build":
Var("chromium_trunk") + "/src/build@" + Var("chromium_revision"),
# Needed by common.gypi.
"google_apis/build":
Var("chromium_trunk") + "/src/google_apis/build@" + Var("chromium_revision"),
"testing":
Var("chromium_trunk") + "/src/testing@" + Var("chromium_revision"),
"testing/gtest":
From("chromium_deps", "src/testing/gtest"),
"tools/clang":
Var("chromium_trunk") + "/src/tools/clang@" + Var("chromium_revision"),
"tools/gyp":
From("chromium_deps", "src/tools/gyp"),
"tools/python":
Var("chromium_trunk") + "/src/tools/python@" + Var("chromium_revision"),
"tools/valgrind":
Var("chromium_trunk") + "/src/tools/valgrind@" + Var("chromium_revision"),
# Needed by build/common.gypi.
"tools/win/supalink":
Var("chromium_trunk") + "/src/tools/win/supalink@" + Var("chromium_revision"),
"third_party/libjpeg_turbo":
From("chromium_deps", "src/third_party/libjpeg_turbo"),
# Yasm assembler required for libjpeg_turbo
"third_party/yasm":
Var("chromium_trunk") + "/src/third_party/yasm@" + Var("chromium_revision"),
"third_party/yasm/source/patched-yasm":
Var("chromium_trunk") + "/deps/third_party/yasm/patched-yasm@" + Var("chromium_revision"),
}
# Define rules for which include paths are allowed in our source.
include_rules = [ '+gflags' ]
deps_os = {
"win": {
# Use WebRTC's stripped-down version of Cygwin (required by GYP).
"third_party/cygwin":
(Var("googlecode_url") % "webrtc") + "/deps/third_party/cygwin@2672",
# Used by libjpeg-turbo.
# TODO(fbarchard): Remove binaries and run yasm from build folder.
"third_party/yasm/binaries":
Var("chromium_trunk") + "/deps/third_party/yasm/binaries@" + Var("chromium_revision"),
"third_party/yasm": None,
},
"unix": {
"third_party/gold":
From("chromium_deps", "src/third_party/gold"),
},
"android": {
"third_party/android_tools":
From("chromium_deps", "src/third_party/android_tools"),
"third_party/libjpeg":
From("chromium_deps", "src/third_party/libjpeg"),
},
"ios": {
# NSS, for SSLClientSocketNSS.
"third_party/nss":
From("chromium_deps", "src/third_party/nss"),
"net/third_party/nss":
Var("chromium_trunk") + "/src/net/third_party/nss@" + Var("chromium_revision"),
# class-dump utility to generate header files for undocumented SDKs.
"testing/iossim/third_party/class-dump":
From("chromium_deps", "src/testing/iossim/third_party/class-dump"),
# Helper for running under the simulator.
"testing/iossim":
Var("chromium_trunk") + "/src/testing/iossim@" + Var("chromium_revision"),
},
}
hooks = [
{
# Clone chromium and its deps.
'name': 'sync chromium',
'pattern': '.',
'action': ['python', '-u', Var('root_dir') + '/sync_chromium.py',
'--target-revision', Var('chromium_revision')],
},
{
# Create links to shared dependencies in Chromium.
'name': 'setup_links',
'pattern': '.',
'action': ['python', Var('root_dir') + '/setup_links.py'],
# Pull clang on mac. If nothing changed, or on non-mac platforms, this takes
# zero seconds to run. If something changed, it downloads a prebuilt clang.
"pattern": ".",
"action": ["python", Var("root_dir") + "/tools/clang/scripts/update.py",
"--mac-only"],
},
{
# A change to a .gyp, .gypi, or to GYP itself should run the generator.
'pattern': '.',
'action': ['python', Var('root_dir') + '/gyp_libyuv'],
"pattern": ".",
"action": ["python", Var("root_dir") + "/build/gyp_chromium",
"--depth=" + Var("root_dir"), Var("root_dir") + "/all.gyp",
Var("extra_gyp_flag")],
},
{
# Update the cygwin mount on Windows.
# This is necessary to get the correct mapping between e.g. /bin and the
# cygwin path on Windows. Without it we can't run bash scripts in actions.
# Ideally this should be solved in "pylib/gyp/msvs_emulation.py".
"pattern": ".",
"action": ["python", Var("root_dir") + "/build/win/setup_cygwin_mount.py",
"--win-only"],
},
]

View File

@@ -1,13 +1,2 @@
fbarchard@chromium.org
magjed@chromium.org
torbjorng@chromium.org
per-file *.gyp=kjellander@chromium.org
per-file *.gn=kjellander@chromium.org
per-file .gitignore=*
per-file AUTHORS=*
per-file DEPS=*
per-file PRESUBMIT.py=kjellander@chromium.org
per-file gyp_libyuv.py=kjellander@chromium.org
per-file setup_links.py=*
per-file sync_chromium.py=kjellander@chromium.org
fbarchard@chromium.org
mflodman@chromium.org

View File

@@ -1,8 +1,9 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1602
Version: 971
License: BSD
License File: LICENSE
Description:
libyuv is an open source project that includes YUV conversion and scaling functionality.
libyuv is an open source project that includes
YUV conversion and scaling functionality.

View File

@@ -1,10 +1,9 @@
# This file is used by gcl to get repository specific information.
CODE_REVIEW_SERVER: codereview.chromium.org
# The LibYuv code review is via WebRtc's code review
CODE_REVIEW_SERVER: webrtc-codereview.appspot.com
#CC_LIST:
VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
VIEW_VC: https://code.google.com/p/libyuv/source/detail?r=
#STATUS:
FORCE_HTTPS_COMMIT_URL: True
PROJECT: libyuv
TRY_ON_UPLOAD: False
TRYSERVER_ROOT: src
TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-libyuv

View File

@@ -18,6 +18,7 @@
#include "libyuv/convert_from.h"
#include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#include "libyuv/mjpeg_decoder.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"

View File

@@ -13,12 +13,26 @@
#include <stddef.h> // for NULL, size_t
#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
#include <sys/types.h> // for uintptr_t on x86
#else
#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
#include <stdint.h> // for uintptr_t
#endif
typedef uint64_t uint64;
typedef int64_t int64;
#if defined(_MSC_VER)
// nsprpub/pr/include/obsolete/protypes.h defines these weirdly
typedef long int32;
typedef unsigned long uint32;
#else
typedef uint32_t uint32;
typedef int32_t int32;
#endif
typedef uint16_t uint16;
typedef int16_t int16;
typedef uint8_t uint8;
typedef int8_t int8;
#define INT_TYPES_DEFINED 1
#ifndef GG_LONGLONG
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
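
A note on the hunk above: the backout restores libyuv's own fixed-width aliases, with an MSVC branch that maps int32/uint32 to long for compatibility with nsprpub's protypes.h. A minimal compile-time sketch (not part of the patch; the checks are illustrative) of the invariant both branches must satisfy:

// Sketch only: the long-based MSVC branch and the stdint branch must
// produce aliases with identical widths.
#include "libyuv/basic_types.h"
static_assert(sizeof(int32) == 4, "int32 must be 4 bytes");
static_assert(sizeof(uint32) == 4, "uint32 must be 4 bytes");
static_assert(sizeof(int64) == 8, "int64 must be 8 bytes");
static_assert(sizeof(uint8) == 1, "uint8 must be 1 byte");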

View File

@@ -22,11 +22,6 @@ extern "C" {
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
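
For orientation, the two entry points kept here are typically used as below (a hedged sketch: SketchCompare is a hypothetical helper, and the remaining ComputeSumSquareError parameters follow the upstream signature that this hunk truncates):

// Sketch: fingerprint one buffer, then measure raw error between two.
#include "libyuv/compare.h"

uint64 SketchCompare(const uint8* frame_a, const uint8* frame_b, int size) {
  uint32 hash = libyuv::HashDjb2(frame_a, size, 5381);  // 5381: classic djb2 seed
  (void)hash;  // a real caller would key a frame cache on this
  // Mean square error is sse / size; PSNR can be derived from that.
  return libyuv::ComputeSumSquareError(frame_a, frame_b, size);
}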

View File

@@ -12,8 +12,10 @@
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
#include "libyuv/rotate.h" // For enum RotationMode.
// TODO(fbarchard): Remove the following header includes.
#include "libyuv/convert_from.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#ifdef __cplusplus
namespace libyuv {
@@ -69,8 +71,6 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
@@ -113,6 +113,15 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert Q420 to I420.
LIBYUV_API
int Q420ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
@@ -202,6 +211,8 @@ int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height);
#endif
// Note Bayer formats (BGGR) To I420 are in format_conversion.h
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_y" number of bytes in a row of the dst_y plane.
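
As a usage sketch for the converters above (SketchNV12ToI420 is a hypothetical helper; strides assume tightly packed planes):

// Sketch: NV12 (Y plane + interleaved UV) to planar I420.
#include "libyuv/convert.h"

int SketchNV12ToI420(const uint8* src_y, const uint8* src_uv,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v,
                     int width, int height) {
  return libyuv::NV12ToI420(src_y, width,       // Y stride == width when packed
                            src_uv, width,      // interleaved UV rows are width bytes
                            dst_y, width,
                            dst_u, width / 2,   // chroma planes are half width
                            dst_v, width / 2,
                            width, height);
}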

View File

@@ -12,10 +12,13 @@
#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
#include "libyuv/basic_types.h"
#include "libyuv/rotate.h" // For enum RotationMode.
// TODO(fbarchard): Remove the following header includes
#include "libyuv/convert_from.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
// TODO(fbarchard): This set of functions should exactly match convert.h
// Add missing Q420.
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and or to I420 and compare.
// TODO(fbarchard): Some of these functions lack parameter setting.
@@ -58,22 +61,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I444 to ABGR.
LIBYUV_API
int I444ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert I411 to ARGB.
LIBYUV_API
int I411ToARGB(const uint8* src_y, int src_stride_y,
@@ -82,38 +69,20 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I420 with Alpha to preattenuated ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int attenuate);
// Convert I420 with Alpha to preattenuated ABGR.
LIBYUV_API
int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height, int attenuate);
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
// Convert I400 (grey) to ARGB.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J400 (jpeg grey) to ARGB.
LIBYUV_API
int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alias.
#define YToARGB I400ToARGB
#define YToARGB I400ToARGB_Reference
// Convert I400 to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert NV12 to ARGB.
LIBYUV_API
@@ -135,6 +104,13 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// TODO(fbarchard): Convert Q420 to ARGB.
// LIBYUV_API
// int Q420ToARGB(const uint8* src_y, int src_stride_y,
// const uint8* src_yuy2, int src_stride_yuy2,
// uint8* dst_argb, int dst_stride_argb,
// int width, int height);
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
@@ -147,70 +123,6 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert H420 to ARGB.
LIBYUV_API
int H420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert H422 to ARGB.
LIBYUV_API
int H422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert H420 to ABGR.
LIBYUV_API
int H420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert H422 to ABGR.
LIBYUV_API
int H422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
@@ -272,6 +184,8 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int dst_width, int dst_height);
#endif
// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
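
A usage sketch for the packed-to-ARGB converters that survive the backout (the helper name and sizes are illustrative):

// Sketch: unpack packed 4:2:2 YUY2 into 32-bit ARGB.
#include "libyuv/convert_argb.h"

int SketchYUY2ToARGB(const uint8* src_yuy2, uint8* dst_argb,
                     int width, int height) {
  return libyuv::YUY2ToARGB(src_yuy2, width * 2,  // YUY2: 2 bytes per pixel
                            dst_argb, width * 4,  // ARGB: 4 bytes per pixel
                            width, height);
}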

View File

@@ -56,6 +56,9 @@ int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// TODO(fbarchard): I420ToM420
// TODO(fbarchard): I420ToQ420
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -135,17 +138,6 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
const uint8* dither4x4, int width, int height);
LIBYUV_API
int I420ToARGB1555(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -160,6 +152,8 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
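
For the I420ToNV12 declaration kept above, a hedged sketch (buffers are assumed preallocated; the helper is hypothetical):

// Sketch: repack planar I420 into NV12 (Y plane plus interleaved UV).
#include "libyuv/convert_from.h"

int SketchI420ToNV12(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                     uint8* dst_y, uint8* dst_uv, int width, int height) {
  return libyuv::I420ToNV12(src_y, width,
                            src_u, width / 2,
                            src_v, width / 2,
                            dst_y, width,
                            dst_uv, width,  // interleaved UV rows are width bytes
                            width, height);
}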

View File

@@ -25,22 +25,24 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert ARGB To BGRA.
// Convert ARGB To BGRA. (alias)
#define ARGBToBGRA BGRAToARGB
LIBYUV_API
int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
uint8* dst_bgra, int dst_stride_bgra,
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert ARGB To ABGR.
// Convert ARGB To ABGR. (alias)
#define ARGBToABGR ABGRToARGB
LIBYUV_API
int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
uint8* dst_abgr, int dst_stride_abgr,
int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert ARGB To RGBA.
LIBYUV_API
int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgba, int dst_stride_rgba,
int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert ARGB To RGB24.
@@ -61,16 +63,6 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
// const uint8(*dither)[4][4];
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither4x4, int width, int height);
// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
@@ -115,14 +107,6 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert ARGB to J422.
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert ARGB To I411.
LIBYUV_API
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
@@ -143,12 +127,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
LIBYUV_API
int ARGBToG(const uint8* src_argb, int src_stride_argb,
uint8* dst_g, int dst_stride_g,
int width, int height);
// Convert ARGB To NV12.
LIBYUV_API
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,

View File

@@ -18,8 +18,9 @@ namespace libyuv {
extern "C" {
#endif
// TODO(fbarchard): Consider overlapping bits for different architectures.
// Internal flag to indicate cpuid requires initialization.
static const int kCpuInitialized = 0x1;
#define kCpuInit 0x1
// These flags are only valid on ARM processors.
static const int kCpuHasARM = 0x2;
@@ -36,12 +37,12 @@ static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasAVX3 = 0x2000;
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasDSPR2 = 0x20000;
static const int kCpuHasMIPS_DSP = 0x20000;
static const int kCpuHasMIPS_DSPR2 = 0x40000;
// Internal function used to auto-init.
LIBYUV_API
@@ -56,13 +57,13 @@ int ArmCpuCaps(const char* cpuinfo_name);
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API extern int cpu_info_;
return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
}
// For testing, allow CPU flags to be disabled.
// e.g. MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// MaskCpuFlags(-1) to enable all cpu specific optimizations.
// MaskCpuFlags(1) to disable all cpu specific optimizations.
// MaskCpuFlags(0) to disable all cpu specific optimizations.
LIBYUV_API
void MaskCpuFlags(int enable_flags);
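
The inline TestCpuFlag above lazily calls InitCpuFlags on first use, and MaskCpuFlags exists so tests can constrain which code paths get picked. A small sketch (hypothetical helper):

// Sketch: query a SIMD capability, then restrict flags for testing.
#include "libyuv/cpu_id.h"

void SketchCpuFlags() {
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    // SSSE3-optimized row functions will be selected internally.
  }
  libyuv::MaskCpuFlags(~libyuv::kCpuHasSSSE3);  // disable SSSE3 only
  libyuv::MaskCpuFlags(-1);                     // restore full detection
}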

View File

@@ -43,17 +43,6 @@ enum JpegSubsamplingType {
kJpegUnknown
};
struct Buffer {
const uint8* data;
int len;
};
struct BufferVector {
Buffer* buffers;
int len;
int pos;
};
struct SetJmpErrorMgr;
// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
@@ -153,6 +142,27 @@ class LIBYUV_API MJpegDecoder {
int* subsample_x, int* subsample_y, int number_of_components);
private:
struct Buffer {
const uint8* data;
int len;
};
struct BufferVector {
Buffer* buffers;
int len;
int pos;
};
// Methods that are passed to jpeglib.
static int fill_input_buffer(jpeg_decompress_struct* cinfo);
static void init_source(jpeg_decompress_struct* cinfo);
static void skip_input_data(jpeg_decompress_struct* cinfo,
long num_bytes); // NOLINT
static void term_source(jpeg_decompress_struct* cinfo);
static void ErrorHandler(jpeg_common_struct* cinfo);
static void OutputHandler(jpeg_common_struct* cinfo);
void AllocOutputBuffers(int num_outbufs);
void DestroyOutputBuffers();

View File

@@ -28,11 +28,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
LIBYUV_API
void CopyPlane_16(const uint16* src_y, int src_stride_y,
uint16* dst_y, int dst_stride_y,
int width, int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
@@ -45,7 +40,6 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
#define J400ToJ400 I400ToI400
// Copy I422 to I422.
#define I422ToI422 I422Copy
@@ -85,18 +79,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_v, int dst_stride_v,
int width, int height);
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
LIBYUV_API
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
@@ -106,7 +88,6 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
int width, int height);
// Alias
#define J420ToJ400 I420ToI400
#define I420ToI420Mirror I420Mirror
// I420 mirror.
@@ -145,6 +126,13 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert NV21 to RGB565.
LIBYUV_API
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
@@ -170,14 +158,6 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
// Alias
#define RGB24ToRAW RAWToRGB24
LIBYUV_API
int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height);
// Draw a rectangle into I420.
LIBYUV_API
int I420Rect(uint8* dst_y, int dst_stride_y,
@@ -282,19 +262,13 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Copy Alpha channel of ARGB to alpha of ARGB.
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Extract the alpha channel from ARGB.
LIBYUV_API
int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_a, int dst_stride_a,
int width, int height);
// Copy Y channel to Alpha of ARGB.
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
@@ -308,7 +282,6 @@ LIBYUV_API
ARGBBlendRow GetARGBBlend();
// Alpha Blend ARGB images and store to destination.
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@@ -316,31 +289,6 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alpha Blend plane and store to destination.
// Source is not pre-multiplied by alpha.
LIBYUV_API
int BlendPlane(const uint8* src_y0, int src_stride_y0,
const uint8* src_y1, int src_stride_y1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Alpha Blend YUV images and store to destination.
// Source is not pre-multiplied by alpha.
// Alpha is full width x height and subsampled to half size to apply to UV.
LIBYUV_API
int I420Blend(const uint8* src_y0, int src_stride_y0,
const uint8* src_u0, int src_stride_u0,
const uint8* src_v0, int src_stride_v0,
const uint8* src_y1, int src_stride_y1,
const uint8* src_u1, int src_stride_u1,
const uint8* src_v1, int src_stride_v1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
@@ -390,6 +338,12 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert MJPG to ARGB.
LIBYUV_API
int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* argb, int argb_stride,
int w, int h, int dw, int dh);
// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
@@ -416,63 +370,36 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
// Interpolate between two images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
// and 255 means 1% src0 and 99% src1.
LIBYUV_API
int InterpolatePlane(const uint8* src0, int src_stride0,
const uint8* src1, int src_stride1,
uint8* dst, int dst_stride,
int width, int height, int interpolation);
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
// and 255 means 1% src_argb0 and 99% src_argb1.
// Internally uses ARGBScale bilinear filtering.
// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation);
// Interpolate between two YUV images using specified amount of interpolation
// Internally calls InterpolatePlane on each plane where the U and V planes
// are half width and half height.
LIBYUV_API
int I420Interpolate(const uint8* src0_y, int src0_stride_y,
const uint8* src0_u, int src0_stride_u,
const uint8* src0_v, int src0_stride_v,
const uint8* src1_y, int src1_stride_y,
const uint8* src1_u, int src1_stride_u,
const uint8* src1_v, int src1_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height, int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR)
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBAFFINEROW_SSE2
#endif
// Row function for copying pixels from a source with a slope to a row
// Row functions for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
#define HAS_ARGBAFFINEROW_SSE2
#endif // LIBYUV_DISABLE_X86
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
// shuffler is 16 bytes and must be aligned.
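
The interpolation comments above read easiest with a concrete value: 'interpolation' is an 8-bit fraction, so 0 yields src_argb0, 255 (nearly) src_argb1, and 128 an even blend. A sketch with a hypothetical helper:

// Sketch: 50/50 cross-fade between two equally sized ARGB frames.
#include "libyuv/planar_functions.h"

int SketchCrossFade(const uint8* argb0, const uint8* argb1,
                    uint8* dst, int width, int height) {
  return libyuv::ARGBInterpolate(argb0, width * 4,
                                 argb1, width * 4,
                                 dst, width * 4,
                                 width, height, 128);  // 128/256: halfway blend
}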

File diff suppressed because it is too large

View File

@@ -34,13 +34,6 @@ void ScalePlane(const uint8* src, int src_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
LIBYUV_API
void ScalePlane_16(const uint16* src, int src_stride,
int src_width, int src_height,
uint16* dst, int dst_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
@@ -62,17 +55,6 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_width, int dst_height,
enum FilterMode filtering);
LIBYUV_API
int I420Scale_16(const uint16* src_y, int src_stride_y,
const uint16* src_u, int src_stride_u,
const uint16* src_v, int src_stride_v,
int src_width, int src_height,
uint16* dst_y, int dst_stride_y,
uint16* dst_u, int dst_stride_u,
uint16* dst_v, int dst_stride_v,
int dst_width, int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
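
A usage sketch for the I420Scale entry point retained above (dimensions, strides and the helper are illustrative; kFilterBilinear is one of the FilterMode values this header keeps):

// Sketch: downscale an I420 frame to half size with bilinear filtering.
#include "libyuv/scale.h"

int SketchHalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                    int sw, int sh,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dw = sw / 2;
  const int dh = sh / 2;
  return libyuv::I420Scale(src_y, sw, src_u, sw / 2, src_v, sw / 2, sw, sh,
                           dst_y, dw, dst_u, dw / 2, dst_v, dw / 2, dw, dh,
                           libyuv::kFilterBilinear);
}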

View File

@@ -35,6 +35,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering);
// TODO(fbarchard): Implement this.
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,

View File

@@ -12,94 +12,54 @@
#define INCLUDE_LIBYUV_SCALE_ROW_H_
#include "libyuv/basic_types.h"
#include "libyuv/scale.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR)
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
#define GCC_HAS_AVX2 1
#endif // GNUC >= 4.7
#endif // __GNUC__
// clang >= 3.4.0 required for AVX2.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
#define CLANG_HAS_AVX2 1
#endif // clang >= 3.4
#endif // __clang__
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
#define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN4_SSE2
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3
#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#define HAS_SCALEADDROWS_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_FIXEDDIV_X86
#define HAS_FIXEDDIV1_X86
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SCALEROWDOWN2_DSPR2
#define HAS_SCALEROWDOWN4_DSPR2
#define HAS_SCALEROWDOWN34_DSPR2
#define HAS_SCALEROWDOWN38_DSPR2
#define HAS_SCALEROWDOWN2_MIPS_DSPR2
#define HAS_SCALEROWDOWN4_MIPS_DSPR2
#define HAS_SCALEROWDOWN34_MIPS_DSPR2
#define HAS_SCALEROWDOWN38_MIPS_DSPR2
#endif
// Scale ARGB vertically with bilinear interpolation.
@@ -110,13 +70,6 @@ void ScalePlaneVertical(int src_height,
int x, int y, int dy,
int bpp, enum FilterMode filtering);
void ScalePlaneVertical_16(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_argb, uint16* dst_argb,
int x, int y, int dy,
int wpp, enum FilterMode filtering);
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
int dst_width, int dst_height,
@@ -144,70 +97,37 @@ void ScaleSlope(int src_width, int src_height,
void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width);
void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width);
void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int, int);
void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int, int);
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
@@ -234,28 +154,25 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
@@ -272,128 +189,46 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height);
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
// ARGB Column functions
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
// Row functions.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
@@ -401,8 +236,7 @@ void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
@@ -437,63 +271,27 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32 -> 12
void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#ifdef __cplusplus
} // extern "C"
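
The HAS_* blocks above only advertise compile-time availability; selection still happens at runtime. A simplified sketch of the dispatch pattern (alignment and width guards omitted; the helper is hypothetical):

// Sketch: pick a row scaler from the HAS_ macro plus the runtime CPU flag,
// falling back to the portable C version.
#include "libyuv/cpu_id.h"
#include "libyuv/scale_row.h"

void SketchScaleRow(const uint8* src, ptrdiff_t stride,
                    uint8* dst, int dst_width) {
  void (*row)(const uint8*, ptrdiff_t, uint8*, int) = libyuv::ScaleRowDown2_C;
#if defined(HAS_SCALEROWDOWN2_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    row = libyuv::ScaleRowDown2_NEON;
  }
#endif
  row(src, stride, dst, dst_width);
}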

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1602
#define LIBYUV_VERSION 971
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@@ -62,7 +62,7 @@ enum FourCC {
// 2 Secondary YUV formats: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -75,7 +75,7 @@ enum FourCC {
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
// 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
// 4 Secondary RGB formats: 4 Bayer Patterns.
FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
@@ -90,8 +90,7 @@ enum FourCC {
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
FOURCC_J400 = FOURCC('J', '4', '0', '0'),
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -151,7 +150,6 @@ enum FourCCBpp {
FOURCC_BPP_YU12 = 12,
FOURCC_BPP_J420 = 12,
FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,
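
The alias comment above notes that CanonicalFourCC() maps auxiliary fourccs onto canonical ones; the usual pattern is to normalize an externally supplied fourcc before switching on it. A sketch (hypothetical helper):

// Sketch: IYUV is an alias that CanonicalFourCC folds onto FOURCC_I420.
#include "libyuv/video_common.h"

uint32 SketchNormalizeFourCC() {
  uint32 raw = FOURCC('I', 'Y', 'U', 'V');  // packs four chars little-endian
  return libyuv::CanonicalFourCC(raw);      // yields libyuv::FOURCC_I420
}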

View File

@@ -10,83 +10,92 @@
'includes': [
'libyuv.gypi',
],
# Make sure that if we are being compiled to an xcodeproj, nothing tries to
# include a .pch.
'xcode_settings': {
'GCC_PREFIX_HEADER': '',
'GCC_PRECOMPILE_PREFIX_HEADER': 'NO',
},
'variables': {
'use_system_libjpeg%': 0,
'libyuv_disable_jpeg%': 0,
# 'chromium_code' treats libyuv as internal and increases warning level.
'chromium_code': 1,
# clang compiler default variable usable by other apps that include libyuv.
'clang%': 0,
# Link-Time Optimizations.
'use_lto%': 0,
'yuv_disable_asm%': 0,
'yuv_disable_avx2%': 0,
'build_neon': 0,
'conditions': [
['(target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
and (arm_neon == 1 or arm_neon_optional == 1)',
{
['target_arch == "arm" and arm_version >= 7 and (arm_neon == 1 or arm_neon_optional == 1)', {
'build_neon': 1,
}],
],
},
'conditions': [
[ 'build_neon != 0', {
'targets': [
# The NEON-specific components.
{
'target_name': 'libyuv_neon',
'type': 'static_library',
'standalone_static_library': 1,
'defines': [
'LIBYUV_NEON',
],
# TODO(noahric): This should remove whatever mfpu is set, not
# just vfpv3-d16.
'cflags!': [
'-mfpu=vfp',
'-mfpu=vfpv3',
'-mfpu=vfpv3-d16',
],
# XXX Doesn't work currently
'cflags_mozilla!': [
'-mfpu=vfp',
'-mfpu=vfpv3',
'-mfpu=vfpv3-d16',
],
'cflags': [
'-mfpu=neon',
],
'cflags_mozilla': [
'-mfpu=neon',
],
'include_dirs': [
'include',
'.',
],
'direct_dependent_settings': {
'include_dirs': [
'include',
'.',
],
},
'sources': [
# sources.
'source/compare_neon.cc',
'source/rotate_neon.cc',
'source/row_neon.cc',
'source/scale_neon.cc',
],
},
],
}],
],
'targets': [
{
'target_name': 'libyuv',
# Change type to 'shared_library' to build .so or .dll files.
'type': 'static_library',
'variables': {
'optimize': 'max', # enable O2 and ltcg.
},
# Allows libyuv.a redistributable library without external dependencies.
'standalone_static_library': 1,
# 'standalone_static_library': 1,
'conditions': [
# Disable -Wunused-parameter
['clang == 1', {
'cflags': [
'-Wno-unused-parameter',
],
}],
['build_neon != 0', {
# TODO(fbarchard): Use gyp define to enable jpeg.
[ 'build_with_mozilla==1', {
'defines': [
'LIBYUV_NEON',
'HAVE_JPEG'
],
'cflags!': [
'-mfpu=vfp',
'-mfpu=vfpv3',
'-mfpu=vfpv3-d16',
# '-mthumb', # arm32 not thumb
],
'conditions': [
# Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
['clang == 0 and use_lto == 1', {
'cflags!': [
'-flto',
'-ffat-lto-objects',
],
}],
# arm64 does not need -mfpu=neon option as neon is not optional
['target_arch != "arm64"', {
'cflags': [
'-mfpu=neon',
# '-marm', # arm32 not thumb
],
}],
'cflags_mozilla': [
'$(MOZ_JPEG_CFLAGS)',
],
}],
['OS != "ios" and libyuv_disable_jpeg != 1', {
[ 'OS != "ios" and build_with_mozilla!=1', {
'defines': [
'HAVE_JPEG'
],
'conditions': [
# Caveat: system jpeg support may not include motion jpeg.
[ 'use_system_libjpeg == 1', {
# Android uses libjpeg for system jpeg support.
[ 'OS == "android" and use_system_libjpeg == 1', {
'dependencies': [
'<(DEPTH)/third_party/libjpeg/libjpeg.gyp:libjpeg',
],
@ -104,15 +113,37 @@
}],
],
}],
], #conditions
[ 'build_neon != 0', {
'dependencies': [
'libyuv_neon',
],
'defines': [
'LIBYUV_NEON',
]
}],
[ 'yuv_disable_asm!=0', {
'defines': [
# Enable the following 3 macros to turn off assembly for specified CPU.
'LIBYUV_DISABLE_X86',
'LIBYUV_DISABLE_NEON',
'LIBYUV_DISABLE_MIPS',
],
}],
[ 'yuv_disable_avx2==1', {
'defines': [
'LIBYUV_DISABLE_AVX2',
]
}],
],
'defines': [
# Enable the following 3 macros to turn off assembly for specified CPU.
# 'LIBYUV_DISABLE_X86',
# 'LIBYUV_DISABLE_NEON',
# 'LIBYUV_DISABLE_MIPS',
# This disables AVX2 (Haswell) support, overriding compiler checks
# 'LIBYUV_DISABLE_AVX2',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
# TODO(fbarchard): Make these into gyp defines.
],
'include_dirs': [
'include',
@ -123,18 +154,6 @@
'include',
'.',
],
'conditions': [
['OS == "android" and target_arch == "arm64"', {
'ldflags': [
'-Wl,--dynamic-linker,/system/bin/linker64',
],
}],
['OS == "android" and target_arch != "arm64"', {
'ldflags': [
'-Wl,--dynamic-linker,/system/bin/linker',
],
}],
], #conditions
},
'sources': [
'<@(libyuv_sources)',

View File

@ -18,11 +18,11 @@
'include/libyuv/convert_from.h',
'include/libyuv/convert_from_argb.h',
'include/libyuv/cpu_id.h',
'include/libyuv/format_conversion.h',
'include/libyuv/mjpeg_decoder.h',
'include/libyuv/planar_functions.h',
'include/libyuv/rotate.h',
'include/libyuv/rotate_argb.h',
'include/libyuv/rotate_row.h',
'include/libyuv/row.h',
'include/libyuv/scale.h',
'include/libyuv/scale_argb.h',
@ -33,9 +33,7 @@
# sources.
'source/compare.cc',
'source/compare_common.cc',
'source/compare_gcc.cc',
'source/compare_neon.cc',
'source/compare_neon64.cc',
'source/compare_posix.cc',
'source/compare_win.cc',
'source/convert.cc',
'source/convert_argb.cc',
@ -45,33 +43,23 @@
'source/convert_to_argb.cc',
'source/convert_to_i420.cc',
'source/cpu_id.cc',
'source/format_conversion.cc',
'source/mjpeg_decoder.cc',
'source/mjpeg_validate.cc',
'source/planar_functions.cc',
'source/rotate.cc',
'source/rotate_any.cc',
'source/rotate_argb.cc',
'source/rotate_common.cc',
'source/rotate_gcc.cc',
'source/rotate_mips.cc',
'source/rotate_neon.cc',
'source/rotate_neon64.cc',
'source/rotate_win.cc',
'source/row_any.cc',
'source/row_common.cc',
'source/row_gcc.cc',
'source/row_mips.cc',
'source/row_neon.cc',
'source/row_neon64.cc',
'source/row_posix.cc',
'source/row_win.cc',
'source/scale.cc',
'source/scale_any.cc',
'source/scale_argb.cc',
'source/scale_common.cc',
'source/scale_gcc.cc',
'source/scale_mips.cc',
'source/scale_neon.cc',
'source/scale_neon64.cc',
'source/scale_posix.cc',
'source/scale_win.cc',
'source/video_common.cc',
],

View File

@ -21,6 +21,9 @@
'build_newlib': 0,
'build_pnacl_newlib': 1,
},
'dependencies': [
'../../native_client/tools.gyp:prep_toolchain',
],
'include_dirs': [
'include',
],

View File

@ -7,25 +7,24 @@
# be found in the AUTHORS file in the root of the source tree.
{
'variables': {
'libyuv_disable_jpeg%': 0,
},
'targets': [
{
'target_name': 'libyuv_unittest',
'type': '<(gtest_target_type)',
'type': 'executable',
'dependencies': [
'libyuv.gyp:libyuv',
# The tests are based on gtest
'testing/gtest.gyp:gtest',
'third_party/gflags/gflags.gyp:gflags',
'testing/gtest.gyp:gtest_main',
],
'direct_dependent_settings': {
'defines': [
'GTEST_RELATIVE_PATH',
],
},
'export_dependent_settings': [
'<(DEPTH)/testing/gtest.gyp:gtest',
'defines': [
'LIBYUV_SVNREVISION="<!(svnversion -n)"',
# Enable the following 3 macros to turn off assembly for specified CPU.
# 'LIBYUV_DISABLE_X86',
# 'LIBYUV_DISABLE_NEON',
# 'LIBYUV_DISABLE_MIPS',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
],
'sources': [
# headers
@ -34,7 +33,6 @@
# sources
'unit_test/basictypes_test.cc',
'unit_test/compare_test.cc',
'unit_test/color_test.cc',
'unit_test/convert_test.cc',
'unit_test/cpu_test.cc',
'unit_test/math_test.cc',
@ -45,6 +43,7 @@
'unit_test/scale_test.cc',
'unit_test/unit_test.cc',
'unit_test/video_common_test.cc',
'unit_test/version_test.cc',
],
'conditions': [
['OS=="linux"', {
@ -52,55 +51,14 @@
'-fexceptions',
],
}],
[ 'OS == "ios" and target_subarch == 64', {
'defines': [
'LIBYUV_DISABLE_NEON'
],
}],
[ 'OS == "ios"', {
'xcode_settings': {
'DEBUGGING_SYMBOLS': 'YES',
'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
# Work around compile issue with isosim.mm, see
# https://code.google.com/p/libyuv/issues/detail?id=548 for details.
'WARNING_CFLAGS': [
'-Wno-sometimes-uninitialized',
],
},
'cflags': [
'-Wno-sometimes-uninitialized',
],
}],
[ 'OS != "ios" and libyuv_disable_jpeg != 1', {
[ 'OS != "ios"', {
'defines': [
'HAVE_JPEG',
],
}],
['OS=="android"', {
'dependencies': [
'<(DEPTH)/testing/android/native_test.gyp:native_test_native_code',
],
}],
# TODO(YangZhang): These lines can be removed when high accuracy
# YUV to RGB conversion is ported to Neon.
[ '(target_arch == "armv7" or target_arch == "armv7s" \
or (target_arch == "arm" and arm_version >= 7) \
or target_arch == "arm64") \
and (arm_neon == 1 or arm_neon_optional == 1)', {
'defines': [
'LIBYUV_NEON'
],
}],
], # conditions
'defines': [
# Enable the following 3 macros to turn off assembly for specified CPU.
# 'LIBYUV_DISABLE_X86',
# 'LIBYUV_DISABLE_NEON',
# 'LIBYUV_DISABLE_MIPS',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
],
},
{
'target_name': 'compare',
'type': 'executable',
@ -147,24 +105,7 @@
'util/psnr.cc',
'util/ssim.cc',
],
'dependencies': [
'libyuv.gyp:libyuv',
],
'conditions': [
[ 'OS == "ios" and target_subarch == 64', {
'defines': [
'LIBYUV_DISABLE_NEON'
],
}],
[ 'OS != "ios" and libyuv_disable_jpeg != 1', {
'defines': [
'HAVE_JPEG',
],
}],
], # conditions
},
{
'target_name': 'cpuid',
'type': 'executable',
@ -177,50 +118,6 @@
],
},
], # targets
'conditions': [
['OS=="android"', {
'targets': [
{
# TODO(kjellander): Figure out what to change in build/apk_test.gypi
# so it can be used instead of the copied code below. Using it in its
# current version was not possible, since the target starts with 'lib',
# which somewhere confuses the variables.
'target_name': 'libyuv_unittest_apk',
'type': 'none',
'variables': {
# These are used to configure java_apk.gypi included below.
'test_type': 'gtest',
'apk_name': 'libyuv_unittest',
'test_suite_name': 'libyuv_unittest',
'intermediate_dir': '<(PRODUCT_DIR)/libyuv_unittest_apk',
'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
'final_apk_path': '<(intermediate_dir)/libyuv_unittest-debug.apk',
'java_in_dir': '<(DEPTH)/testing/android/native_test/java',
'test_runner_path': '<(DEPTH)/util/android/test_runner.py',
'native_lib_target': 'libyuv_unittest',
'gyp_managed_install': 0,
},
'includes': [
'build/android/test_runner.gypi',
'build/java_apk.gypi',
],
'dependencies': [
'<(DEPTH)/base/base.gyp:base_java',
# TODO(kjellander): Figure out why base_build_config_gen is needed
# here. It really shouldn't be, since it's a dependency of base_java
# above, but zero tests run if it's missing.
'<(DEPTH)/base/base.gyp:base_build_config_gen',
'<(DEPTH)/build/android/pylib/device/commands/commands.gyp:chromium_commands',
'<(DEPTH)/build/android/pylib/remote/device/dummy/dummy.gyp:remote_device_dummy_apk',
'<(DEPTH)/testing/android/appurify_support.gyp:appurify_support_java',
'<(DEPTH)/testing/android/on_device_instrumentation.gyp:reporter_java',
'<(DEPTH)/tools/android/android_tools.gyp:android_tools',
'libyuv_unittest',
],
},
],
}],
],
}
# Local Variables:

View File

@ -1,81 +1,48 @@
# This is a generic makefile for libyuv for gcc.
# make -f linux.mk CXX=clang++
# make -f linux.mk CC=clang++
CC?=gcc
CFLAGS?=-O2 -fomit-frame-pointer
CFLAGS+=-Iinclude/
CXX?=g++
CXXFLAGS?=-O2 -fomit-frame-pointer
CXXFLAGS+=-Iinclude/
CC=g++
CCFLAGS=-O2 -fomit-frame-pointer -Iinclude/
LOCAL_OBJ_FILES := \
source/compare.o \
source/compare_common.o \
source/compare_gcc.o \
source/compare_neon64.o \
source/compare_neon.o \
source/compare_win.o \
source/convert_argb.o \
source/convert.o \
source/convert_from_argb.o \
source/convert_from.o \
source/convert_jpeg.o \
source/convert_to_argb.o \
source/convert_to_i420.o \
source/cpu_id.o \
source/mjpeg_decoder.o \
source/mjpeg_validate.o \
source/planar_functions.o \
source/rotate_any.o \
source/rotate_argb.o \
source/rotate.o \
source/rotate_common.o \
source/rotate_gcc.o \
source/rotate_mips.o \
source/rotate_neon64.o \
source/rotate_neon.o \
source/rotate_win.o \
source/row_any.o \
source/row_common.o \
source/row_gcc.o \
source/row_mips.o \
source/row_neon64.o \
source/row_neon.o \
source/row_win.o \
source/scale_any.o \
source/scale_argb.o \
source/scale.o \
source/scale_common.o \
source/scale_gcc.o \
source/scale_mips.o \
source/scale_neon64.o \
source/scale_neon.o \
source/scale_win.o \
source/video_common.o
source/compare.o \
source/compare_common.o \
source/compare_posix.o \
source/convert.o \
source/convert_argb.o \
source/convert_from.o \
source/convert_from_argb.o \
source/convert_to_argb.o \
source/convert_to_i420.o \
source/cpu_id.o \
source/format_conversion.o \
source/planar_functions.o \
source/rotate.o \
source/rotate_argb.o \
source/rotate_mips.o \
source/row_any.o \
source/row_common.o \
source/row_mips.o \
source/row_posix.o \
source/scale.o \
source/scale_argb.o \
source/scale_common.o \
source/scale_mips.o \
source/scale_posix.o \
source/video_common.o
.cc.o:
$(CXX) -c $(CXXFLAGS) $*.cc -o $*.o
$(CC) -c $(CCFLAGS) $*.cc -o $*.o
.c.o:
$(CC) -c $(CFLAGS) $*.c -o $*.o
all: libyuv.a convert linux.mk
all: libyuv.a convert cpuid psnr
libyuv.a: $(LOCAL_OBJ_FILES) linux.mk
$(AR) $(ARFLAGS) -o $@ $(LOCAL_OBJ_FILES)
libyuv.a: $(LOCAL_OBJ_FILES)
$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
# A C++ test utility that uses libyuv conversion.
convert: util/convert.cc libyuv.a
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
# A standalone test utility
psnr: util/psnr.cc
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
# A C test utility that exercises libyuv's CPU detection from C.
cpuid: util/cpuid.c libyuv.a
$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
# A test utility that uses libyuv conversion.
convert: util/convert.cc linux.mk
$(CC) $(CCFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
clean:
/bin/rm -f source/*.o *.ii *.s libyuv.a convert cpuid psnr
/bin/rm -f source/*.o *.ii *.s libyuv.a convert

View File

@ -17,23 +17,38 @@
#endif
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
// This module is for Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#if _MSC_VER >= 1700
#define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif
#endif // HAS_HASHDJB2_SSE41
// hash seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
HashDjb2_C;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
@ -63,53 +78,22 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
return seed;
}
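For readers of this hunk: HashDjb2_C referenced above is the classic djb2 hash (hash = hash * 33 + byte), and because the seed is chained the loop above can feed it 32 KB blocks. A scalar sketch equivalent in behavior (not the file's actual code):

// Scalar djb2; seed chaining is what lets HashDjb2 above hash block by block.
static uint32 HashDjb2_Sketch(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];
  }
  return hash;
}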
static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
}
if (width & 1) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
}
return 0;
}
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
uint32 fourcc = 0;
int h;
// Coalesce rows.
if (stride_argb == width * 4) {
width *= height;
height = 1;
stride_argb = 0;
}
for (h = 0; h < height && fourcc == 0; ++h) {
fourcc = ARGBDetectRow_C(argb, width);
argb += stride_argb;
}
return fourcc;
}
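A hypothetical usage sketch for ARGBDetect (values invented for illustration; per the row function above, 255 in byte 0 with byte 3 != 255 reports FOURCC_ARGB):

// 8x1 opaque image, 4 bytes per pixel, alpha stored in byte 0.
uint8 pixels[8 * 4];
int i;
for (i = 0; i < 8 * 4; i += 4) {
  pixels[i + 0] = 255;  // alpha
  pixels[i + 1] = 10;   // color channels, anything but 255
  pixels[i + 2] = 20;
  pixels[i + 3] = 30;
}
uint32 fourcc = ARGBDetect(pixels, 8 * 4, 8, 1);  // FOURCC_ARGB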
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
// Visual C 2012 required for AVX2.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
@ -130,7 +114,8 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
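For reference, the contract every SumSquareError variant dispatched above must satisfy is the plain sum of squared byte differences; a scalar sketch (the real C fallback lives in compare_common.cc):

static uint32 SumSquareError_Sketch(const uint8* src_a, const uint8* src_b,
                                    int count) {
  uint32 sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int diff = (int)src_a[i] - (int)src_b[i];
    sse += (uint32)(diff * diff);  // accumulate (a - b)^2 per byte.
  }
  return sse;
}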

View File

@ -10,8 +10,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {

View File

@ -10,16 +10,12 @@
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
@ -29,10 +25,9 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
@ -58,7 +53,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"

View File

@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@ -18,8 +16,7 @@ namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
@ -30,11 +27,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0
pxor xmm5, xmm5
align 4
wloop:
movdqu xmm1, [eax]
movdqa xmm1, [eax]
lea eax, [eax + 16]
movdqu xmm2, [edx]
movdqa xmm2, [edx]
lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
@ -46,7 +45,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
@ -72,10 +70,12 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
align 4
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
@ -85,7 +85,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
@ -101,32 +100,41 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
}
#endif // _MSC_VER >= 1700
uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
uvec32 kHashMul0 = {
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
static uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
static uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
static uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
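These tables come from unrolling djb2 sixteen bytes at a time: hash' = hash * 33^16 + src[0]*33^15 + ... + src[15]*33^0, all modulo 2^32, which is what the pmulld/paddd sequences below evaluate. A standalone sketch checking the identity (illustrative names, not from the source):

#include <stdint.h>

// Sixteen scalar djb2 steps...
static uint32_t Djb2Scalar16(uint32_t hash, const uint8_t* s) {
  int i;
  for (i = 0; i < 16; ++i) hash = hash * 33u + s[i];
  return hash;
}

// ...equal one blocked step: hash * 33^16 plus a dot product with
// descending powers of 33 (the kHash16x33/kHashMul* constants above).
static uint32_t Djb2Block16(uint32_t hash, const uint8_t* s) {
  uint32_t pow = 1u, result;
  int i;
  for (i = 0; i < 16; ++i) pow *= 33u;  // 33^16 mod 2^32 == 0x92d9e201.
  result = hash * pow;
  pow = 1u;
  for (i = 15; i >= 0; --i) {
    result += s[i] * pow;  // s[15]*33^0 up through s[0]*33^15.
    pow *= 33u;
  }
  return result;
}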
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
@ -135,32 +143,34 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, xmmword ptr kHash16x33
movdqa xmm6, kHash16x33
align 4
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, xmmword ptr kHashMul0
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
movdqa xmm5, kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, xmmword ptr kHashMul1
pmulld(0xdd) // pmulld xmm3, xmm5
movdqa xmm5, kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, xmmword ptr kHashMul2
pmulld(0xe5) // pmulld xmm4, xmm5
movdqa xmm5, kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, xmmword ptr kHashMul3
pmulld(0xd5) // pmulld xmm2, xmm5
movdqa xmm5, kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
@ -168,7 +178,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
@ -183,38 +192,39 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
align 4
wloop:
vpmovzxbd xmm3, [eax] // src[0-3]
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, [eax + 4] // src[4-7]
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, [eax + 8] // src[8-11]
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, [eax + 12] // src[12-15]
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
pmulld xmm3, kHashMul0
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
pmulld xmm4, kHashMul1
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
pmulld xmm2, kHashMul2
lea eax, [eax + 16]
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
vpaddd xmm1, xmm1,xmm2
vpshufd xmm2, xmm1, 0x01
vpaddd xmm1, xmm1, xmm2
vpaddd xmm0, xmm0, xmm1
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
jg wloop
vmovd eax, xmm0 // return hash
vzeroupper
movd eax, xmm0 // return hash
ret
}
}
#endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -9,7 +9,6 @@
*/
#include "libyuv/convert.h"
#include "libyuv/convert_argb.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
@ -219,7 +218,7 @@ int MJPGToI420(const uint8* sample,
return 1;
}
}
return ret ? 0 : 1;
return ret ? 0 : -1;
}
#ifdef HAVE_JPEG
@ -381,7 +380,7 @@ int MJPGToARGB(const uint8* sample,
return 1;
}
}
return ret ? 0 : 1;
return ret ? 0 : -1;
}
#endif

View File

@ -11,6 +11,7 @@
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
@ -23,7 +24,7 @@ namespace libyuv {
extern "C" {
#endif
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// src_width is used for source stride computation
// src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame.
@ -51,8 +52,8 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// also enable temporary buffer.
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
crop_argb == sample;
uint8* dest_argb = crop_argb;
int dest_argb_stride = argb_stride;
uint8* tmp_argb = crop_argb;
int tmp_argb_stride = argb_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
@ -66,13 +67,13 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
}
if (need_buf) {
int argb_size = crop_width * 4 * abs_crop_height;
int argb_size = crop_width * abs_crop_height * 4;
rotate_buffer = (uint8*)malloc(argb_size);
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
crop_argb = rotate_buffer;
argb_stride = crop_width * 4;
argb_stride = crop_width;
}
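Context for the sizing above, with made-up dimensions: ARGB is 4 bytes per pixel, so a packed row stride is measured in bytes, not pixels:

// e.g. crop_width = 640, abs_crop_height = 360:
int argb_size = 640 * 4 * 360;    // 921600 bytes for the whole buffer.
int argb_stride_bytes = 640 * 4;  // 2560 bytes per packed ARGB row.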
switch (format) {
@ -143,6 +144,36 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
// TODO(fbarchard): Support cropping Bayer by odd numbers
// by adjusting fourcc.
case FOURCC_BGGR:
src = sample + (src_width * crop_y + crop_x);
r = BayerBGGRToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GBRG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGBRGToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GRBG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGRBGToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGGB:
src = sample + (src_width * crop_y + crop_x);
r = BayerRGGBToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToARGB(src, src_width,
@ -174,8 +205,18 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
// case FOURCC_Q420:
// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
// src_width + crop_x * 2;
// r = Q420ToARGB(src, src_width * 3,
// src_uv, src_width * 3,
// crop_argb, argb_stride,
// crop_width, inv_crop_height);
// break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
@ -200,25 +241,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_width, inv_crop_height);
break;
}
case FOURCC_J420: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I422:
case FOURCC_YV16: {
const uint8* src_y = sample + src_width * crop_y + crop_x;
@ -290,7 +312,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
if (!r) {
r = ARGBRotate(crop_argb, argb_stride,
dest_argb, dest_argb_stride,
tmp_argb, tmp_argb_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);

View File

@ -12,6 +12,7 @@
#include "libyuv/convert.h"
#include "libyuv/format_conversion.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
@ -39,13 +40,12 @@ int ConvertToI420(const uint8* sample,
int aligned_src_width = (src_width + 1) & ~1;
const uint8* src;
const uint8* src_uv;
const int abs_src_height = (src_height < 0) ? -src_height : src_height;
// TODO(nisse): Why allow crop_height < 0?
const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int abs_src_height = (src_height < 0) ? -src_height : src_height;
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
format != FOURCC_NV12 && format != FOURCC_NV21 &&
format != FOURCC_YV12) || y == sample;
format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
uint8* tmp_y = y;
uint8* tmp_u = u;
uint8* tmp_v = v;
@ -53,14 +53,16 @@ int ConvertToI420(const uint8* sample,
int tmp_u_stride = u_stride;
int tmp_v_stride = v_stride;
uint8* rotate_buffer = NULL;
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (!y || !u || !v || !sample ||
src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
inv_crop_height = -inv_crop_height;
}
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
@ -171,6 +173,40 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
// TODO(fbarchard): Support cropping Bayer by odd numbers
// by adjusting fourcc.
case FOURCC_BGGR:
src = sample + (src_width * crop_y + crop_x);
r = BayerBGGRToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GBRG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGBRGToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GRBG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGRBGToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGGB:
src = sample + (src_width * crop_y + crop_x);
r = BayerRGGBToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToI420(src, src_width,
@ -182,8 +218,7 @@ int ConvertToI420(const uint8* sample,
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
@ -193,8 +228,7 @@ int ConvertToI420(const uint8* sample,
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
// Call NV12 but with u and v parameters swapped.
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
@ -211,8 +245,20 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_Q420:
src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
src_width + crop_x * 2;
r = Q420ToI420(src, src_width * 3,
src_uv, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;

View File

@ -10,12 +10,12 @@
#include "libyuv/cpu_id.h"
#if defined(_MSC_VER)
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
!defined(__native_client__) && defined(_M_X64) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
@ -36,22 +36,20 @@ extern "C" {
// For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid the additional check.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
!defined(__clang__)
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
#endif
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
(defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER)
// Visual C version uses intrinsic or inline x86 assembly.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#if defined(_MSC_VER) && !defined(__clang__)
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86)
__asm {
@ -64,17 +62,16 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
mov [edi + 8], ecx
mov [edi + 12], edx
}
#else // Visual C but not x86
#else
if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax);
} else {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
}
#endif
// GCC version uses inline x86 assembly.
#else // defined(_MSC_VER)
uint32 info_ebx, info_edx;
asm volatile (
asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
@ -92,78 +89,76 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[3] = info_edx;
#endif // defined(_MSC_VER)
}
#else // (defined(_M_IX86) || defined(_M_X64) ...
#if !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect whether the OS saves the high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
xcr0 = (uint32)(_xgetbv(_XCR_XFEATURE_ENABLED_MASK));
#elif defined(_MSC_VER) && defined(_M_IX86)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // !defined(__native_client__)
#else
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif
// For VS2010 and earlier, _asm _emit can be used:
// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
// __asm {
// xor ecx, ecx // xcr 0
// xgetbv
// mov xcr0, eax
// }
// For VS2013 and earlier 32 bit, the optimizer produces bad code for _xgetbv(0).
// https://code.google.com/p/libyuv/issues/detail?id=529
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect whether the OS saves the high parts of ymm registers.
int GetXCR0() {
uint32 xcr0 = 0u;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return xcr0;
}
#endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", on)
#endif
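For reference on the masks tested against XCR0 in this file: bit 1 is SSE (XMM) state and bit 2 is AVX (YMM) state, so (xcr0 & 6) == 6 means the OS context-switches both; the AVX-512 state bits together form 0xe0. A small sketch of the bit names (an aside, not from the source):

// XCR0 feature-state bits (Intel SDM naming):
enum {
  kXcr0Sse = 1 << 1,       // XMM state.
  kXcr0Avx = 1 << 2,       // YMM state; (xcr0 & 6) == 6 enables AVX.
  kXcr0Opmask = 1 << 5,    // AVX-512 k-registers.
  kXcr0ZmmHi256 = 1 << 6,  // AVX-512 upper halves of zmm0-15.
  kXcr0Hi16Zmm = 1 << 7    // AVX-512 zmm16-31; 0xe0 covers all three.
};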
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS
int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
// Assume Neon if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasNEON;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
fclose(f);
return kCpuHasNEON;
}
// aarch64 uses asimd for Neon.
p = strstr(cpuinfo_line, " asimd");
if (p && (p[6] == ' ' || p[6] == '\n')) {
fclose(f);
return kCpuHasNEON;
if (f) {
char cpuinfo_line[512];
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
fclose(f);
return kCpuHasNEON;
}
}
}
fclose(f);
}
fclose(f);
return 0;
}
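To make the parsing above concrete, a hypothetical Features line and call (real /proc/cpuinfo contents vary by kernel and device):

// A cpuinfo line such as:
//   Features : half thumb fastmult vfp edsp neon vfpv3 tls
// matches the " neon" search above. Usage sketch:
int caps = ArmCpuCaps("/proc/cpuinfo");
if (caps & kCpuHasNEON) {
  // Neon code paths may be selected.
}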
#if defined(__mips__) && defined(__linux__)
static int MipsCpuCaps(const char* search_string) {
const char* file_name = "/proc/cpuinfo";
char cpuinfo_line[256];
FILE* f = NULL;
if ((f = fopen(file_name, "r")) != NULL) {
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
if (strstr(cpuinfo_line, search_string) != NULL) {
fclose(f);
return kCpuHasMIPS_DSP;
}
}
fclose(f);
}
/* Did not find string in the proc file, or not Linux ELF. */
return 0;
}
#endif
// CPU detect function for SIMD instruction sets.
LIBYUV_API
int cpu_info_ = 0; // cpu_info is not initialized yet.
int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero
// value disables; zero is ignored, to make the variable easy to toggle.
@ -186,109 +181,93 @@ static LIBYUV_BOOL TestEnv(const char*) {
LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
// TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
}
cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
CpuId(7, 0, cpu_info7);
cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
#ifdef HAS_XGETBV
// AVX requires that the CPU has AVX, XSAVE and OSXSave for xgetbv.
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
((GetXCR0() & 6) == 6)) { // Test that the OS saves YMM registers.
cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
}
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSXSave
TestOsSaveYmm()) { // Saves YMM.
cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
kCpuHasAVX;
}
#endif
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info &= ~kCpuHasX86;
cpu_info_ &= ~kCpuHasX86;
}
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info &= ~kCpuHasSSE2;
cpu_info_ &= ~kCpuHasSSE2;
}
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info &= ~kCpuHasSSSE3;
cpu_info_ &= ~kCpuHasSSSE3;
}
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info &= ~kCpuHasSSE41;
cpu_info_ &= ~kCpuHasSSE41;
}
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info &= ~kCpuHasSSE42;
cpu_info_ &= ~kCpuHasSSE42;
}
if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info &= ~kCpuHasAVX;
cpu_info_ &= ~kCpuHasAVX;
}
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info &= ~kCpuHasAVX2;
cpu_info_ &= ~kCpuHasAVX2;
}
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
cpu_info &= ~kCpuHasERMS;
cpu_info_ &= ~kCpuHasERMS;
}
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info &= ~kCpuHasFMA3;
cpu_info_ &= ~kCpuHasFMA3;
}
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
cpu_info &= ~kCpuHasAVX3;
}
#endif
#if defined(__mips__) && defined(__linux__)
#elif defined(__mips__) && defined(__linux__)
// On Linux mips, parse the text file for dsp detection.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
cpu_info |= kCpuHasDSPR2;
cpu_info_ |= kCpuHasMIPS_DSPR2;
#endif
cpu_info |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_DSPR2")) {
cpu_info &= ~kCpuHasDSPR2;
cpu_info_ |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_MIPS")) {
cpu_info_ &= ~kCpuHasMIPS;
}
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested, but without it assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info = kCpuHasNEON;
// For aarch64 (arm64), the feature list in /proc/cpuinfo is incomplete,
// e.g. it has no neon flag.
// So for aarch64, Neon is hard coded on here.
#endif
#if defined(__aarch64__)
cpu_info = kCpuHasNEON;
#else
if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
cpu_info_ &= ~kCpuHasMIPS_DSP;
}
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
#elif defined(__arm__)
#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \
!defined(__native_client__)
// On Linux arm, parse the text file for neon detection.
cpu_info = ArmCpuCaps("/proc/cpuinfo");
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__) || defined(__native_client__)
// gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it.
cpu_info_ = kCpuHasNEON;
#endif
cpu_info |= kCpuHasARM;
cpu_info_ |= kCpuHasARM;
if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info &= ~kCpuHasNEON;
cpu_info_ &= ~kCpuHasNEON;
}
#endif // __arm__
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
cpu_info_ = 0;
}
cpu_info |= kCpuInitialized;
cpu_info_ = cpu_info;
return cpu_info;
return cpu_info_;
}
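A hypothetical usage sketch of the override hooks above; the environment is read once, when the flags are first initialized:

#include <stdlib.h>  // setenv (POSIX).

// Disable AVX2 for an A/B timing run; any non-zero value disables.
// Must run before the first TestCpuFlag()/InitCpuFlags() call.
setenv("LIBYUV_DISABLE_AVX2", "1", 1);
int flags = InitCpuFlags();
// Or mask features programmatically (not thread safe, per the note below):
MaskCpuFlags(flags & ~kCpuHasAVX2);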
// Note that use of this function is not thread safe.
LIBYUV_API
void MaskCpuFlags(int enable_flags) {
cpu_info_ = InitCpuFlags() & enable_flags;

View File

@ -13,20 +13,13 @@
#ifdef HAVE_JPEG
#include <assert.h>
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
!defined(TARGET_IPHONE_SIMULATOR)
// Must be included before jpeglib.
#include <setjmp.h>
#define HAVE_SETJMP
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#endif
#endif
struct FILE; // For jpeglib.h.
// C++ build requires extern C for jpeg internals.
#ifdef __cplusplus
extern "C" {
@ -56,13 +49,6 @@ const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
// Methods that are passed to jpeglib.
boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
void init_source(jpeg_decompress_struct* cinfo);
void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
void term_source(jpeg_decompress_struct* cinfo);
void ErrorHandler(jpeg_common_struct* cinfo);
MJpegDecoder::MJpegDecoder()
: has_scanline_padding_(LIBYUV_FALSE),
num_outbufs_(0),
@ -77,6 +63,9 @@ MJpegDecoder::MJpegDecoder()
decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
// Override standard exit()-based error handler.
error_mgr_->base.error_exit = &ErrorHandler;
#ifndef DEBUG_MJPEG
error_mgr_->base.output_message = &OutputHandler;
#endif
#endif
decompress_struct_->client_data = NULL;
source_mgr_->init_source = &init_source;
@ -106,7 +95,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
}
buf_.data = src;
buf_.len = static_cast<int>(src_len);
buf_.len = (int)(src_len);
buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
@ -411,12 +400,12 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
return FinishDecode();
}
void init_source(j_decompress_ptr cinfo) {
void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
fill_input_buffer(cinfo);
}
boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
assert(0 && "No more data");
// ERROR: No more data
@ -428,16 +417,17 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
return TRUE;
}
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
long num_bytes) { // NOLINT
cinfo->src->next_input_byte += num_bytes;
}
void term_source(j_decompress_ptr cinfo) {
void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
// Nothing to do.
}
#ifdef HAVE_SETJMP
void ErrorHandler(j_common_ptr cinfo) {
void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
// This is called when a jpeglib command experiences an error. Unfortunately
// jpeglib's error handling model is not very flexible, because it expects the
// error handler to not return--i.e., it wants the program to terminate. To
@ -451,12 +441,18 @@ void ErrorHandler(j_common_ptr cinfo) {
// ERROR: Error in jpeglib: buf
#endif
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
// This rewinds the call stack to the point of the corresponding setjmp()
// and causes it to return (for a second time) with value 1.
longjmp(mgr->setjmp_buffer, 1);
}
#ifndef DEBUG_MJPEG
void MJpegDecoder::OutputHandler(j_common_ptr cinfo) {
// silently eat messages
}
#endif
#endif // HAVE_SETJMP
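The handler above relies on jpeglib's standard recovery pattern: the caller arms setjmp() before each jpeg_* call, and ErrorHandler longjmp()s back rather than letting jpeglib call exit(). A generic sketch of that pattern (illustrative names, not the decoder's actual fields):

#include <setjmp.h>

// The jmp_buf lives in the error manager so the handler can find it via
// cinfo->err, as SetJmpErrorMgr does above.
// Caller side, before any jpeg_read_header()/jpeg_start_decompress():
//   if (setjmp(error_mgr->setjmp_buffer)) {
//     return LIBYUV_FALSE;  // ErrorHandler longjmp'd back: decode failed.
//   }
//   ... proceed with jpeglib calls ...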
void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
if (num_outbufs != num_outbufs_) {
@ -503,11 +499,11 @@ LIBYUV_BOOL MJpegDecoder::StartDecode() {
decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
decompress_struct_->dither_mode = JDITHER_NONE;
// Not applicable to 'raw':
decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
decompress_struct_->do_fancy_upsampling = LIBYUV_FALSE;
// Only for buffered mode:
decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
decompress_struct_->enable_2pass_quant = LIBYUV_FALSE;
// Blocky but fast:
decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
decompress_struct_->do_block_smoothing = LIBYUV_FALSE;
if (!jpeg_start_decompress(decompress_struct_)) {
// ERROR: Couldn't start JPEG decompressor";

View File

@ -10,58 +10,34 @@
#include "libyuv/mjpeg_decoder.h"
#include <string.h> // For memchr.
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Helper function to scan for EOI marker (0xff 0xd9).
static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
if (sample_size >= 2) {
const uint8* end = sample + sample_size - 1;
const uint8* it = sample;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
if (it == NULL) {
break;
}
if (it[1] == 0xd9) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
++it; // Skip over current 0xff.
}
}
// ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE;
}
// Helper function to validate the jpeg appears intact.
// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// Maximum size that ValidateJpeg will consider valid.
const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024;
if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
size_t i;
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE;
}
if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
// Look for the End Of Image (EOI) marker near the end of the buffer.
if (sample_size > kBackSearchSize) {
if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
for (i = sample_size - 2; i > 1;) {
if (sample[i] != 0xd9) {
if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image
return LIBYUV_TRUE; // Success: Valid jpeg.
}
--i;
}
// Reduce search size for forward search.
sample_size = sample_size - kBackSearchSize + 1;
--i;
}
// Step over SOI marker and scan for EOI.
return ScanEOI(sample + 2, sample_size - 2);
// ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE;
}
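A small usage sketch for ValidateJpeg: only the SOI/EOI markers and a minimum size are checked, not the full stream. Buffer contents invented for illustration:

// 64 bytes is the minimum size; SOI (0xff 0xd8) must open the buffer and
// an EOI (0xff 0xd9) must appear before the end.
uint8 jpg[64] = { 0xff, 0xd8 };  // rest zero-initialized.
jpg[62] = 0xff;
jpg[63] = 0xd9;
LIBYUV_BOOL ok = ValidateJpeg(jpg, sizeof(jpg));  // LIBYUV_TRUE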
#ifdef __cplusplus

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -27,31 +27,36 @@ extern "C" {
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
int src_stepx,
uint8* dst_ptr, int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
int src_stepx,
uint8* dst_ptr, int dst_width);
#endif
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
int src_stepx, uint8* dst_ptr, int dst_width);
int src_stepx,
uint8* dst_ptr, int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
uint8* dst, int dst_stride,
int width, int height) {
int i;
int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
IS_ALIGNED(src, 4)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
}
#endif
@ -64,7 +69,8 @@ static void ARGBTranspose(const uint8* src, int src_stride,
}
void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@ -74,7 +80,8 @@ void ARGBRotate90(const uint8* src, int src_stride,
}
void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@ -84,7 +91,8 @@ void ARGBRotate270(const uint8* src, int src_stride,
}
void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
uint8* dst, int dst_stride,
int width, int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1);
@ -94,38 +102,38 @@ void ARGBRotate180(const uint8* src, int src_stride,
void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
ARGBMirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSE2;
}
#if defined(HAS_ARGBMIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ARGBMirrorRow = ARGBMirrorRow_SSSE3;
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
#endif
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
CopyRow = CopyRow_SSE2;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@ -133,11 +141,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
@ -159,7 +162,8 @@ void ARGBRotate180(const uint8* src, int src_stride,
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, int width, int height,
uint8* dst_argb, int dst_stride_argb,
int width, int height,
enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1;

View File

@ -9,7 +9,6 @@
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
@ -19,11 +18,11 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -106,8 +105,9 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
);
}
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set noat \n"
".set push \n"
@ -303,15 +303,17 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4",
"s5", "s6", "s7"
);
}
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

View File

@ -9,7 +9,6 @@
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
@ -18,42 +17,32 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
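Before the NEON body, the contract TransposeWx8 implements, as a scalar sketch: an 8-row band is written out transposed, dst[x * dst_stride + y] = src[y * src_stride + x].

// Scalar reference for the 8-row transpose the asm below vectorizes.
static void TransposeWx8_Sketch(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
  int x, y;
  for (x = 0; x < width; ++x) {
    for (y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}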
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %5, #8 \n"
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r9, %0 \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d7}, [%0] \n"
"vld1.8 {d0}, [r9], %1 \n"
"vld1.8 {d1}, [r9], %1 \n"
"vld1.8 {d2}, [r9], %1 \n"
"vld1.8 {d3}, [r9], %1 \n"
"vld1.8 {d4}, [r9], %1 \n"
"vld1.8 {d5}, [r9], %1 \n"
"vld1.8 {d6}, [r9], %1 \n"
"vld1.8 {d7}, [r9] \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
@ -75,65 +64,48 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0] \n"
"vst1.8 {d1}, [r9], %3 \n"
"vst1.8 {d0}, [r9], %3 \n"
"vst1.8 {d3}, [r9], %3 \n"
"vst1.8 {d2}, [r9], %3 \n"
"vst1.8 {d5}, [r9], %3 \n"
"vst1.8 {d4}, [r9], %3 \n"
"vst1.8 {d7}, [r9], %3 \n"
"vst1.8 {d6}, [r9] \n"
"add %1, #8 \n" // src += 8
"add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
"subs %5, #8 \n" // w -= 8
"add %0, #8 \n" // src += 8
"add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
"subs %4, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %5, #8 \n"
"adds %4, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %5, #2 \n"
"cmp %4, #2 \n"
"blt 3f \n"
"cmp %5, #4 \n"
"cmp %4, #4 \n"
"blt 2f \n"
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[1]}, [%0] \n"
"mov r9, %0 \n"
"vld1.32 {d0[0]}, [r9], %1 \n"
"vld1.32 {d0[1]}, [r9], %1 \n"
"vld1.32 {d1[0]}, [r9], %1 \n"
"vld1.32 {d1[1]}, [r9], %1 \n"
"vld1.32 {d2[0]}, [r9], %1 \n"
"vld1.32 {d2[1]}, [r9], %1 \n"
"vld1.32 {d3[0]}, [r9], %1 \n"
"vld1.32 {d3[1]}, [r9] \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(6)
"vld1.8 {q3}, [%6] \n"
"vld1.8 {q3}, [%5] \n"
"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
@ -142,101 +114,73 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
MEMACCESS(0)
"vst1.32 {d4[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d4[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[1]}, [%0] \n"
"vst1.32 {d4[0]}, [r9], %3 \n"
"vst1.32 {d4[1]}, [r9], %3 \n"
"vst1.32 {d5[0]}, [r9], %3 \n"
"vst1.32 {d5[1]}, [r9] \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d0[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d0[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[1]}, [%0] \n"
"add r9, %2, #4 \n"
"vst1.32 {d0[0]}, [r9], %3 \n"
"vst1.32 {d0[1]}, [r9], %3 \n"
"vst1.32 {d1[0]}, [r9], %3 \n"
"vst1.32 {d1[1]}, [r9] \n"
"add %1, #4 \n" // src += 4
"add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
"subs %5, #4 \n" // w -= 4
"add %0, #4 \n" // src += 4
"add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
"subs %4, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %5, #2 \n"
"cmp %4, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.16 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[3]}, [%0] \n"
"mov r9, %0 \n"
"vld1.16 {d0[0]}, [r9], %1 \n"
"vld1.16 {d1[0]}, [r9], %1 \n"
"vld1.16 {d0[1]}, [r9], %1 \n"
"vld1.16 {d1[1]}, [r9], %1 \n"
"vld1.16 {d0[2]}, [r9], %1 \n"
"vld1.16 {d1[2]}, [r9], %1 \n"
"vld1.16 {d0[3]}, [r9], %1 \n"
"vld1.16 {d1[3]}, [r9] \n"
"vtrn.8 d0, d1 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0] \n"
"vst1.64 {d0}, [r9], %3 \n"
"vst1.64 {d1}, [r9] \n"
"add %1, #2 \n" // src += 2
"add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
"subs %5, #2 \n" // w -= 2
"add %0, #2 \n" // src += 2
"add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
"subs %4, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld1.8 {d0[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[7]}, [%1] \n"
"vld1.8 {d0[0]}, [%0], %1 \n"
"vld1.8 {d0[1]}, [%0], %1 \n"
"vld1.8 {d0[2]}, [%0], %1 \n"
"vld1.8 {d0[3]}, [%0], %1 \n"
"vld1.8 {d0[4]}, [%0], %1 \n"
"vld1.8 {d0[5]}, [%0], %1 \n"
"vld1.8 {d0[6]}, [%0], %1 \n"
"vld1.8 {d0[7]}, [%0] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
"vst1.64 {d0}, [%2] \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst), // %3
"+r"(dst_stride), // %4
"+r"(width) // %5
: "r"(&kVTbl4x4Transpose) // %6
: "memory", "cc", "q0", "q1", "q2", "q3"
: "+r"(src), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_stride), // %3
"+r"(width) // %4
: "r"(&kVTbl4x4Transpose) // %5
: "memory", "cc", "r9", "q0", "q1", "q2", "q3"
);
}
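
Both transpose kernels here share the counter idiom the comments describe: start the counter at w - 8, run full 8-wide blocks while it stays non-negative, then add 8 back so the 0-7 leftover columns fall through to the 4x8, 2x8 and 1x8 tails. A minimal C sketch of that counting scheme (block bodies elided; callers guarantee w >= 8 on entry):

void ProcessInBlocksOf8(int w) {
  w -= 8;                        // "sub %4, #8"
  do {
    /* full 8-wide block */
    w -= 8;                      // "subs %4, #8"
  } while (w >= 0);              // "bge 1b"
  w += 8;                        // "adds %4, #8": 0 means no residuals
  if (w >= 4) { /* 4-wide tail */ w -= 4; }
  if (w >= 2) { /* 2-wide tail */ w -= 2; }
  if (w >= 1) { /* 1-wide tail */ }
}
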
@ -247,33 +191,25 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %7, #8 \n"
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r9, %0 \n"
MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d2, d3}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d4, d5}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d6, d7}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d16, d17}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d18, d19}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d20, d21}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d22, d23}, [%0] \n"
"vld2.8 {d0, d1}, [r9], %1 \n"
"vld2.8 {d2, d3}, [r9], %1 \n"
"vld2.8 {d4, d5}, [r9], %1 \n"
"vld2.8 {d6, d7}, [r9], %1 \n"
"vld2.8 {d16, d17}, [r9], %1 \n"
"vld2.8 {d18, d19}, [r9], %1 \n"
"vld2.8 {d20, d21}, [r9], %1 \n"
"vld2.8 {d22, d23}, [r9] \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
@ -299,84 +235,59 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d18}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d16}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d22}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d20}, [%0] \n"
"vst1.8 {d2}, [r9], %3 \n"
"vst1.8 {d0}, [r9], %3 \n"
"vst1.8 {d6}, [r9], %3 \n"
"vst1.8 {d4}, [r9], %3 \n"
"vst1.8 {d18}, [r9], %3 \n"
"vst1.8 {d16}, [r9], %3 \n"
"vst1.8 {d22}, [r9], %3 \n"
"vst1.8 {d20}, [r9] \n"
"mov %0, %5 \n"
"mov r9, %4 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d19}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d17}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d23}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d21}, [%0] \n"
"vst1.8 {d3}, [r9], %5 \n"
"vst1.8 {d1}, [r9], %5 \n"
"vst1.8 {d7}, [r9], %5 \n"
"vst1.8 {d5}, [r9], %5 \n"
"vst1.8 {d19}, [r9], %5 \n"
"vst1.8 {d17}, [r9], %5 \n"
"vst1.8 {d23}, [r9], %5 \n"
"vst1.8 {d21}, [r9] \n"
"add %1, #8*2 \n" // src += 8*2
"add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %7, #8 \n" // w -= 8
"add %0, #8*2 \n" // src += 8*2
"add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %6, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %7, #8 \n"
"adds %6, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %7, #2 \n"
"cmp %6, #2 \n"
"blt 3f \n"
"cmp %7, #4 \n"
"cmp %6, #4 \n"
"blt 2f \n"
// TODO(frkoenig): Clean this up
//TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.64 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d7}, [%0] \n"
"mov r9, %0 \n"
"vld1.64 {d0}, [r9], %1 \n"
"vld1.64 {d1}, [r9], %1 \n"
"vld1.64 {d2}, [r9], %1 \n"
"vld1.64 {d3}, [r9], %1 \n"
"vld1.64 {d4}, [r9], %1 \n"
"vld1.64 {d5}, [r9], %1 \n"
"vld1.64 {d6}, [r9], %1 \n"
"vld1.64 {d7}, [r9] \n"
MEMACCESS(8)
"vld1.8 {q15}, [%8] \n"
"vld1.8 {q15}, [%7] \n"
"vtrn.8 q0, q1 \n"
"vtrn.8 q2, q3 \n"
@ -390,142 +301,103 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"vtbl.8 d22, {d6, d7}, d30 \n"
"vtbl.8 d23, {d6, d7}, d31 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.32 {d16[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d16[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[1]}, [%0], %4 \n"
"vst1.32 {d16[0]}, [r9], %3 \n"
"vst1.32 {d16[1]}, [r9], %3 \n"
"vst1.32 {d17[0]}, [r9], %3 \n"
"vst1.32 {d17[1]}, [r9], %3 \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d20[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d20[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[1]}, [%0] \n"
"add r9, %2, #4 \n"
"vst1.32 {d20[0]}, [r9], %3 \n"
"vst1.32 {d20[1]}, [r9], %3 \n"
"vst1.32 {d21[0]}, [r9], %3 \n"
"vst1.32 {d21[1]}, [r9] \n"
"mov %0, %5 \n"
"mov r9, %4 \n"
MEMACCESS(0)
"vst1.32 {d18[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d18[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[1]}, [%0], %6 \n"
"vst1.32 {d18[0]}, [r9], %5 \n"
"vst1.32 {d18[1]}, [r9], %5 \n"
"vst1.32 {d19[0]}, [r9], %5 \n"
"vst1.32 {d19[1]}, [r9], %5 \n"
"add %0, %5, #4 \n"
MEMACCESS(0)
"vst1.32 {d22[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d22[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[1]}, [%0] \n"
"add r9, %4, #4 \n"
"vst1.32 {d22[0]}, [r9], %5 \n"
"vst1.32 {d22[1]}, [r9], %5 \n"
"vst1.32 {d23[0]}, [r9], %5 \n"
"vst1.32 {d23[1]}, [r9] \n"
"add %1, #4*2 \n" // src += 4 * 2
"add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %7, #4 \n" // w -= 4
"add %0, #4*2 \n" // src += 4 * 2
"add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %6, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %7, #2 \n"
"cmp %6, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[3], d3[3]}, [%0] \n"
"mov r9, %0 \n"
"vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
"vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
"vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
"vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
"vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
"vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
"vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
"vld2.16 {d1[3], d3[3]}, [r9] \n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d2, d3 \n"
"mov %0, %3 \n"
"mov r9, %2 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d2}, [%0] \n"
"vst1.64 {d0}, [r9], %3 \n"
"vst1.64 {d2}, [r9] \n"
"mov %0, %5 \n"
"mov r9, %4 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.64 {d3}, [%0] \n"
"vst1.64 {d1}, [r9], %5 \n"
"vst1.64 {d3}, [r9] \n"
"add %1, #2*2 \n" // src += 2 * 2
"add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %7, #2 \n" // w -= 2
"add %0, #2*2 \n" // src += 2 * 2
"add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %6, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[7], d1[7]}, [%1] \n"
"vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
"vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
"vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
"vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
"vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
"vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
"vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
"vld2.8 {d0[7], d1[7]}, [%0] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
MEMACCESS(5)
"vst1.64 {d1}, [%5] \n"
"vst1.64 {d0}, [%2] \n"
"vst1.64 {d1}, [%4] \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst_a), // %3
"+r"(dst_stride_a), // %4
"+r"(dst_b), // %5
"+r"(dst_stride_b), // %6
"+r"(width) // %7
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
: "+r"(src), // %0
"+r"(src_stride), // %1
"+r"(dst_a), // %2
"+r"(dst_stride_a), // %3
"+r"(dst_b), // %4
"+r"(dst_stride_b), // %5
"+r"(width) // %6
: "r"(&kVTbl4x4TransposeDi) // %7
: "memory", "cc", "r9",
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#endif
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -16,8 +16,13 @@ extern "C" {
#endif
// The following are available on MIPS platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
#include <sgidefs.h>
#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5)
#define HAS_MIPS_PREFETCH 1
#endif
#ifdef HAS_COPYROW_MIPS
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
@ -61,23 +66,31 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// Alternatively, for x=64 the last "safe" a1 address is "t0-96"
// we will use "pref 30,128(a1)", so "t0-160" is the limit
"subu $t9, $t0, 160 \n"
#ifdef HAS_MIPS_PREFETCH
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line of src
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
#endif
// If a1 > t9, don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $loop16w \n"
"nop \n"
// otherwise, start with using pref30
#ifdef HAS_MIPS_PREFETCH
"pref 30, 64(%[dst]) \n"
#endif
"$loop16w: \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 96(%[src]) \n"
#endif
"lw $t0, 0(%[src]) \n"
"bgtz $v1, $skip_pref30_96 \n" // skip
"lw $t1, 4(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 30, 96(%[dst]) \n" // continue
#endif
"$skip_pref30_96: \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
@ -85,7 +98,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 128(%[src]) \n"
#endif
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
@ -98,7 +113,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lw $t0, 32(%[src]) \n"
"bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
"lw $t1, 36(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 30, 128(%[dst]) \n" // set dest, addr 128
#endif
"$skip_pref30_128: \n"
"lw $t2, 40(%[src]) \n"
"lw $t3, 44(%[src]) \n"
@ -106,7 +123,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lw $t5, 52(%[src]) \n"
"lw $t6, 56(%[src]) \n"
"lw $t7, 60(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 160(%[src]) \n"
#endif
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
@ -126,7 +145,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// Here we have src and dest word-aligned but less than 64 bytes to go
"chk8w: \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 0x0(%[src]) \n"
#endif
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// t8 is the remainder count past 32 bytes
"beq %[count], $t8, chk1w \n"
@ -214,10 +235,12 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"addu $t0, %[dst], %[count] \n" // t0 "past the end"
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
#ifdef HAS_MIPS_PREFETCH
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line addr 32
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
#endif
// safe, as we have at least 64 bytes ahead
// If a1 > t9, don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
@ -225,15 +248,21 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// skip "pref 30,64(a1)" for too short arrays
" nop \n"
// otherwise, start with using pref30
#ifdef HAS_MIPS_PREFETCH
"pref 30, 64(%[dst]) \n"
#endif
"$ua_loop16w: \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 96(%[src]) \n"
#endif
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"bgtz $v1, $ua_skip_pref30_96 \n"
" lwl $t1, 7(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 30, 96(%[dst]) \n"
#endif
// continue setting up the dest, addr 96
"$ua_skip_pref30_96: \n"
"lwr $t2, 8(%[src]) \n"
@ -248,7 +277,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 128(%[src]) \n"
#endif
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
@ -263,7 +294,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lwr $t1, 36(%[src]) \n"
"bgtz $v1, ua_skip_pref30_128 \n"
" lwl $t1, 39(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 30, 128(%[dst]) \n"
#endif
// continue setting up the dest, addr 128
"ua_skip_pref30_128: \n"
@ -279,7 +312,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
"lwl $t6, 59(%[src]) \n"
"lwr $t7, 60(%[src]) \n"
"lwl $t7, 63(%[src]) \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 160(%[src]) \n"
#endif
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
@ -299,7 +334,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// Here we have src and dest word-aligned but less than 64 bytes to go
"ua_chk8w: \n"
#ifdef HAS_MIPS_PREFETCH
"pref 0, 0x0(%[src]) \n"
#endif
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// t8 is the remainder count
"beq %[count], $t8, $ua_chk1w \n"
@ -375,13 +412,11 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_MIPS
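
The prefetch bookkeeping above computes a "last safe" destination for the pref 30 (prepare-for-store) hint: prefetching a 32-byte line 128 bytes ahead must stop 160 bytes before the end of the buffer. A portable sketch of the same guard (__builtin_prefetch is only a stand-in; pref 30 also zero-fills the cache line, which the builtin does not):

void CopyWithPrefetchGuard(unsigned char* dst, const unsigned char* src,
                           int count) {
  unsigned char* end = dst + count;
  unsigned char* last_safe = end - 160;      // "subu $t9, $t0, 160"
  int i;
  while (dst + 64 <= end) {
    if (dst <= last_safe) {
      __builtin_prefetch(dst + 128, 1, 0);   // stand-in for "pref 30, 128(a1)"
    }
    __builtin_prefetch(src + 96, 0, 0);      // stand-in for "pref 0, 96(a0)"
    for (i = 0; i < 64; ++i) dst[i] = src[i];
    dst += 64;
    src += 64;
  }
  while (dst < end) *dst++ = *src++;         // residual bytes
}
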
// DSPR2 functions
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
(__mips_dsp_rev >= 2)
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -389,6 +424,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
@ -446,7 +482,90 @@ void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
);
}
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
uint8* dst_v, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lwr $t0, 0(%[src_uv]) \n"
"lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
"lwr $t1, 4(%[src_uv]) \n"
"lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
"lwr $t2, 8(%[src_uv]) \n"
"lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
"lwr $t3, 12(%[src_uv]) \n"
"lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
"lwr $t5, 16(%[src_uv]) \n"
"lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
"lwr $t6, 20(%[src_uv]) \n"
"lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
"lwr $t7, 24(%[src_uv]) \n"
"lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
"lwr $t8, 28(%[src_uv]) \n"
"lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
"precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
"precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
"precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
"precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
"precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
"precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
"precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
"precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
"addiu %[src_uv], %[src_uv], 32 \n"
"swr $t9, 0(%[dst_v]) \n"
"swl $t9, 3(%[dst_v]) \n"
"swr $t0, 0(%[dst_u]) \n"
"swl $t0, 3(%[dst_u]) \n"
"swr $t1, 4(%[dst_v]) \n"
"swl $t1, 7(%[dst_v]) \n"
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n"
"swr $t3, 8(%[dst_v]) \n"
"swl $t3, 11(%[dst_v]) \n"
"swr $t5, 8(%[dst_u]) \n"
"swl $t5, 11(%[dst_u]) \n"
"swr $t6, 12(%[dst_v]) \n"
"swl $t6, 15(%[dst_v]) \n"
"swr $t7, 12(%[dst_u]) \n"
"swl $t7, 15(%[dst_u]) \n"
"addiu %[dst_u], %[dst_u], 16 \n"
"bgtz $t4, 1b \n"
" addiu %[dst_v], %[dst_v], 16 \n"
"beqz %[width], 3f \n"
" nop \n"
"2: \n"
"lbu $t0, 0(%[src_uv]) \n"
"lbu $t1, 1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], 2 \n"
"addiu %[width], %[width], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[width], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[width] "+r" (width),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6", "t7", "t8", "t9"
);
}
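
Stripped of the precrq.qb.ph/precr.qb.ph packing, both split functions above compute a plain UV deinterleave 16 pixels at a time; the scalar reference (uint8 is libyuv's byte typedef) is:

void SplitUVRow_Ref(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[0];   // even bytes -> U plane
    dst_v[x] = src_uv[1];   // odd bytes  -> V plane
    src_uv += 2;
  }
}
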
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -456,6 +575,7 @@ void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
".p2align 2 \n"
"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
@ -496,10 +616,10 @@ void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
);
}
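
For reference, the operation being vectorized here is a simple byte reversal of the row:

void MirrorRow_Ref(const uint8* src, uint8* dst, int width) {
  int x;
  src += width - 1;
  for (x = 0; x < width; ++x) {
    dst[x] = *src--;        // copy the row back to front
  }
}
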
void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x;
int y;
void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x = 0;
int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -510,6 +630,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
@ -579,7 +700,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "=&r" (y)
[y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9"
@ -593,7 +714,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define YUVTORGB \
#define I422ToTransientMipsRGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
@ -652,13 +773,11 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
// TODO(fbarchard): accept yuv conversion constants.
void I422ToARGBRow_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -672,8 +791,9 @@ void I422ToARGBRow_DSPR2(const uint8* y_buf,
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
".p2align 2 \n"
"1: \n"
YUVTORGB
I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
@ -715,10 +835,136 @@ void I422ToARGBRow_DSPR2(const uint8* y_buf,
);
}
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
"precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
"precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
"precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
"precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
"precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
"or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
"or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
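
The repl.ph constants loaded above (74, -25, -52, 102, with 16 and 128 as the Y/UV offsets) are BT.601 studio-range coefficients in 6-bit fixed point. A scalar sketch of the per-pixel math; the U-to-blue coefficient is not visible in these hunks, so the 127 below is an assumption taken from the C reference of the same era:

static int Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

static void YuvToRgbRef(int y, int u, int v,
                        unsigned char* r, unsigned char* g,
                        unsigned char* b) {
  int y1 = (y - 16) * 74;   // $s0: |YG|YG| = |74|74| (1.164 * 64)
  *b = (unsigned char)Clamp255((y1 + (u - 128) * 127) >> 6);  // UB: assumed
  *g = (unsigned char)Clamp255((y1 + (u - 128) * -25          // $s1: UG
                                   + (v - 128) * -52) >> 6);  // $s2: VG
  *r = (unsigned char)Clamp255((y1 + (v - 128) * 102) >> 6);  // $s3: VR
}
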
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
"precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
"precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
"precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
"precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
"sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
"or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
"or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
"precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
"precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
"sll $t1, $t1, 16 \n"
"sll $t2, $t2, 16 \n"
"packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
"packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
// Bilinear filter 8x2 -> 8x1
void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
@ -729,6 +975,7 @@ void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
"replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n"
".p2align 2 \n"
"1: \n"
"lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n"

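The viewer cuts this hunk short, but the setup is already visible: t0 and t1 hold the two per-row weights, with y0_fraction = 256 - source_y_fraction. Per byte the kernel computes a weighted average of the two source rows; a scalar sketch (the exact rounding before the >> 8 varies between libyuv versions, so treat it as approximate):

void InterpolateRow_Ref(uint8* dst, const uint8* src, const uint8* src1,
                        int width, int source_y_fraction) {
  int y1_fraction = source_y_fraction;   // weight of the lower row
  int y0_fraction = 256 - y1_fraction;   // weight of the upper row
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8)((src[x] * y0_fraction + src1[x] * y1_fraction) >> 8);
  }
}
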
File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -53,27 +53,18 @@ static void ScaleARGBDown2(int src_width, int src_height,
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
ScaleARGBRowDown2Box_NEON);
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON;
}
#endif
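
Both sides of this hunk follow the same dispatch idiom used throughout the scalers: start from the portable C row function, upgrade to the "Any" SIMD variant when the CPU flag is present (it handles arbitrary widths), and upgrade again to the full-speed variant when the width (and, on the old side, pointer/stride alignment) allows. Condensed, with the names from the hunk above and libyuv's headers assumed in scope:

void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
                          uint8* dst_argb, int dst_width) =
    ScaleARGBRowDown2Box_C;                             // portable fallback
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
  ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;    // any width
  if (IS_ALIGNED(dst_width, 4)) {
    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;      // aligned fast path
  }
}
#endif
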
@ -97,7 +88,7 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
int x, int dx, int y, int dy) {
int j;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
@ -107,22 +98,17 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
@ -153,23 +139,16 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
#endif
@ -191,35 +170,42 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
int x, int dx, int y, int dy,
enum FilterMode filtering) {
int j;
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
int64 xlast = x + (int64)(dst_width - 1) * dx;
int64 xl = (dx >= 0) ? x : xlast;
int64 xr = (dx >= 0) ? xlast : x;
int clip_src_width;
xl = (xl >> 16) & ~3; // Left edge aligned.
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
if (xr > src_width) {
xr = src_width;
}
clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
xr = (xr >> 16) + 1; // Right most pixel used.
clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4; // Width aligned to 4.
src_argb += xl * 4;
x -= (int)(xl << 16);
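
A worked instance of the clip window computed above (new-side arithmetic, dx >= 0 assumed): a 2:1 horizontal downscale producing 100 output pixels reads 200 aligned source pixels, i.e. 800 bytes of ARGB per row.

int ClipSrcWidthExample(void) {
  int x = 0, dx = 2 << 16, dst_width = 100, src_width = 400;
  long long xlast = x + (long long)(dst_width - 1) * dx;  // 99 * (2 << 16)
  long long xl = (x >> 16) & ~3;                          // 0, 4-pixel aligned
  long long xr = (xlast >> 16) + 1;                       // 198 + 1: bilinear
  xr = (xr + 1 + 3) & ~3;                                 // 200, 4-pixel aligned
  if (xr > src_width) xr = src_width;
  return (int)(xr - xl) * 4;                              // 800 ARGB bytes
}
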
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(clip_src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@ -227,62 +213,52 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
{
align_buffer_64(row, clip_src_width * 4);
align_buffer_64(row, clip_src_width * 4);
const int max_y = (src_height - 1) << 16;
const int max_y = (src_height - 1) << 16;
for (j = 0; j < dst_height; ++j) {
if (y > max_y) {
y = max_y;
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(row, src, src_stride, clip_src_width, yf);
ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
}
dst_argb += dst_stride;
y += dy;
if (y > max_y) {
y = max_y;
}
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(row, src, src_stride, clip_src_width, yf);
ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
}
free_aligned_buffer_64(row);
dst_argb += dst_stride;
y += dy;
}
free_aligned_buffer_64(row);
}
// Scale ARGB up with bilinear interpolation.
@ -299,17 +275,30 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSSE3;
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@ -317,17 +306,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
if (src_width >= 32768) {
@ -339,86 +328,70 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
}
const int max_y = (src_height - 1) << 16;
if (y > max_y) {
y = max_y;
}
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
{
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
uint8* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
if (src_height > 1) {
src += src_stride;
}
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
if (src_height > 1) {
src += src_stride;
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
if (yi != lasty) {
if (y > max_y) {
y = max_y;
yi = y >> 16;
src = src_argb + yi * src_stride;
}
if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
src += src_stride;
}
}
if (filtering == kFilterLinear) {
InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
}
dst_argb += dst_stride;
y += dy;
}
free_aligned_buffer_64(row);
}
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
src += src_stride;
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
if (yi != lasty) {
if (y > max_y) {
y = max_y;
yi = y >> 16;
src = src_argb + yi * src_stride;
}
if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
src += src_stride;
}
}
if (filtering == kFilterLinear) {
InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
}
dst_argb += dst_stride;
y += dy;
}
free_aligned_buffer_64(row);
}
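
The rowptr/rowstride dance in the loop above is a zero-copy ping-pong between the two halves of the row buffer: stepping by rowstride moves onto the other half, and negating rowstride arms the step back, so the previously scaled row stays live for the vertical blend. A standalone demo of just the pointer movement:

#include <stdio.h>

int main(void) {
  char row[2 * 16];
  char* rowptr = row;
  int rowstride = 16;
  int j;
  for (j = 0; j < 4; ++j) {
    printf("filling half %d\n", (int)((rowptr - row) / 16));
    rowptr += rowstride;     // move onto the other half...
    rowstride = -rowstride;  // ...and arm the move back
  }
  return 0;                  // prints 0, 1, 0, 1
}
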
#ifdef YUVSCALEUP
@ -442,15 +415,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(src_width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@ -458,36 +434,50 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSSE3;
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@ -495,17 +485,17 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
@ -521,31 +511,17 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@ -563,7 +539,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
const uint8* src_row_v = src_v + uv_yi * src_stride_v;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
const int kRowSize = (dst_width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
// Allocate 1 row of ARGB for source conversion.
@ -648,19 +624,13 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBCols_NEON;
}
}
#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
@ -794,7 +764,6 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
if (!src_argb || src_width == 0 || src_height == 0 ||
!dst_argb || dst_width <= 0 || dst_height <= 0 ||
clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) {
return -1;
@ -813,7 +782,6 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
int dst_width, int dst_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -823,36 +791,6 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
free(argb_buffer);
return r;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -42,20 +42,6 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[1];
dst[1] = src_ptr[3];
dst += 2;
src_ptr += 4;
}
if (dst_width & 1) {
dst[0] = src_ptr[1];
}
}
void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
@ -71,21 +57,6 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
const uint16* s = src_ptr;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + 1) >> 1;
dst[1] = (s[2] + s[3] + 1) >> 1;
dst += 2;
s += 4;
}
if (dst_width & 1) {
dst[0] = (s[0] + s[1] + 1) >> 1;
}
}
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
@ -103,45 +74,6 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
dst_width -= 1;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
dst += 2;
s += 4;
t += 4;
}
if (dst_width & 1) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst += 1;
s += 2;
t += 2;
}
dst[0] = (s[0] + t[0] + 1) >> 1;
}
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
dst += 2;
s += 4;
t += 4;
}
if (dst_width & 1) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
}
}
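
All of the Box variants above bias the sum by half the divisor before shifting, so the average rounds to nearest instead of truncating:

static unsigned char Box2x2(int a, int b, int c, int d) {
  return (unsigned char)((a + b + c + d + 2) >> 2);  // +2 is half of 4
}
/* Box2x2(1, 2, 2, 2) == 2 (true mean 1.75); plain truncation would give 1. */
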
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
int x;
@ -156,20 +88,6 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[2];
dst[1] = src_ptr[6];
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = src_ptr[2];
}
}
void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
@ -206,42 +124,6 @@ void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[stride + 4] + src_ptr[stride + 5] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
8) >> 4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
}
}
void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
int x;
@ -255,19 +137,6 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
dst[1] = src_ptr[1];
dst[2] = src_ptr[3];
dst += 3;
src_ptr += 4;
}
}
// Filter rows 0 and 1 together, 3 : 1
void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
@ -291,28 +160,6 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 * 3 + b0 + 2) >> 2;
d[1] = (a1 * 3 + b1 + 2) >> 2;
d[2] = (a2 * 3 + b2 + 2) >> 2;
d += 3;
s += 4;
t += 4;
}
}
// Filter rows 1 and 2 together, 1 : 1
void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
@ -336,28 +183,6 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 + b0 + 1) >> 1;
d[1] = (a1 + b1 + 1) >> 1;
d[2] = (a2 + b2 + 1) >> 1;
d += 3;
s += 4;
t += 4;
}
}
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
@ -374,21 +199,6 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[0] = src_ptr[x >> 16];
x += dx;
dst_ptr[1] = src_ptr[x >> 16];
x += dx;
dst_ptr += 2;
}
if (dst_width & 1) {
dst_ptr[0] = src_ptr[x >> 16];
}
}
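// Illustrative sketch (not part of the patch): the x/dx convention used by
// all of these column scalers is 16.16 fixed point, so src_ptr[x >> 16] is
// a nearest-pixel fetch. The dx computation below is a simplification of
// what libyuv's slope setup produces.
#include <stdio.h>
int main(void) {
  unsigned char src[8] = {0, 10, 20, 30, 40, 50, 60, 70};
  int src_width = 8, dst_width = 5;
  int dx = (src_width << 16) / dst_width;  // step of 1.6 in 16.16
  int x = 0;
  int j;
  for (j = 0; j < dst_width; ++j, x += dx) {
    printf("%d ", src[x >> 16]);  // nearest source pixel
  }
  printf("\n");  // prints: 0 10 30 40 60
  return 0;
}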
// Scales a single row of pixels up by 2x using point sampling.
void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
@ -403,28 +213,9 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[1] = dst_ptr[0] = src_ptr[0];
src_ptr += 1;
dst_ptr += 2;
}
if (dst_width & 1) {
dst_ptr[0] = src_ptr[0];
}
}
// (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) (uint8)((int)(a) + \
((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
// Intel uses 7 bit math with rounding.
#define BLENDER(a, b, f) (uint8)((int)(a) + \
(((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
((int)(f) * ((int)(b) - (int)(a)) >> 16))
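// Illustrative check (not part of the patch) of the identity noted above:
// (1-f)a + fb == a + f(b-a), which is what lets BLENDER use one multiply.
// Like the ARM macro itself, this relies on arithmetic right shift for
// negative b - a.
#include <assert.h>
#include <stdio.h>
int main(void) {
  int a, b, f;
  for (a = 0; a <= 255; a += 51) {
    for (b = 0; b <= 255; b += 51) {
      for (f = 0; f < 65536; f += 4096) {
        int fast = a + ((f * (b - a) + 0x8000) >> 16);
        // Reference: round(((65536 - f) * a + f * b) / 65536).
        long long ref = ((long long)(65536 - f) * a +
                         (long long)f * b + 0x8000) >> 16;
        assert(fast == (int)ref);
      }
    }
  }
  printf("blend identity holds\n");
  return 0;
}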
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
@ -476,60 +267,6 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
}
#undef BLENDER
// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) (uint16)((int)(a) + \
((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
x += dx;
xi = x >> 16;
a = src_ptr[xi];
b = src_ptr[xi + 1];
dst_ptr[1] = BLENDER(a, b, x & 0xffff);
x += dx;
dst_ptr += 2;
}
if (dst_width & 1) {
int xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
}
}
void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x32, int dx) {
int64 x = (int64)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int64 xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
x += dx;
xi = x >> 16;
a = src_ptr[xi];
b = src_ptr[xi + 1];
dst_ptr[1] = BLENDER(a, b, x & 0xffff);
x += dx;
dst_ptr += 2;
}
if (dst_width & 1) {
int64 xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
}
}
#undef BLENDER
void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
int x;
@ -543,19 +280,6 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
int x;
assert(dst_width % 3 == 0);
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
dst[1] = src_ptr[3];
dst[2] = src_ptr[6];
dst += 3;
src_ptr += 8;
}
}
// 8x3 -> 3x1
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
@ -583,32 +307,6 @@ void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
}
}
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
src_ptr += 8;
dst_ptr += 3;
}
}
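// Illustrative check (not part of the patch): the (65536 / 9) and
// (65536 / 6) factors above are truncated reciprocals, so
// sum * (65536 / 9) >> 16 approximates sum / 9 without a divide. The
// truncation makes exact multiples of 9 undershoot by 1, e.g. nine white
// pixels average to 254 rather than 255.
#include <stdio.h>
int main(void) {
  const int kDiv9 = 65536 / 9;  // 7281, a truncated reciprocal
  int sum, undershoots = 0;
  for (sum = 0; sum <= 9 * 255; ++sum) {
    undershoots += (sum * kDiv9 >> 16) != sum / 9;
  }
  printf("%d sums undershoot by 1\n", undershoots);  // 255: multiples of 9
  return 0;
}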
// 8x2 -> 3x1
void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -630,51 +328,21 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
src_ptr += 8;
dst_ptr += 3;
}
}
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
int x;
assert(src_width > 0);
for (x = 0; x < src_width - 1; x += 2) {
dst_ptr[0] += src_ptr[0];
dst_ptr[1] += src_ptr[1];
src_ptr += 2;
dst_ptr += 2;
}
if (src_width & 1) {
dst_ptr[0] += src_ptr[0];
}
}
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
int x;
assert(src_width > 0);
for (x = 0; x < src_width - 1; x += 2) {
dst_ptr[0] += src_ptr[0];
dst_ptr[1] += src_ptr[1];
src_ptr += 2;
dst_ptr += 2;
}
if (src_width & 1) {
dst_ptr[0] += src_ptr[0];
assert(src_height > 0);
for (x = 0; x < src_width; ++x) {
const uint8* s = src_ptr + x;
unsigned int sum = 0u;
int y;
for (y = 0; y < src_height; ++y) {
sum += s[0];
s += src_stride;
}
// TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
dst_ptr[x] = sum < 65535u ? sum : 65535u;
}
}
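// Illustrative arithmetic (not part of the patch) behind the overflow TODO
// above: a column of src_height pixels at 255 sums to src_height * 255,
// which exceeds uint16 once src_height > 257, so the saturating store is
// needed; capping height at 256 would make it unnecessary.
#include <stdio.h>
int main(void) {
  int h;
  for (h = 255; h <= 260; ++h) {
    unsigned int sum = h * 255u;
    unsigned int stored = sum < 65535u ? sum : 65535u;  // same clamp
    printf("height %d: sum %u -> stored %u\n", h, sum, stored);
  }
  return 0;
}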
@ -816,7 +484,6 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
}
}
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \
@ -906,16 +573,32 @@ void ScalePlaneVertical(int src_height,
assert(dst_width > 0);
assert(dst_height > 0);
src_argb += (x >> 16) * bpp;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@ -923,20 +606,20 @@ void ScalePlaneVertical(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
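// Schematic sketch (function names here are hypothetical stand-ins, not the
// real symbols): the hunks above restore libyuv's usual dispatch ladder - a
// C fallback, then an "Any" SIMD variant once the row clears the vector
// width, then the full SIMD variant for exact multiples, with the older
// code adding a pointer/stride alignment tier on top.
#include <stdio.h>
typedef void (*RowFn)(unsigned char* dst, const unsigned char* src, int w);
static void RowFn_C(unsigned char* d, const unsigned char* s, int w) {
  (void)d; (void)s; (void)w;
}
static void RowFn_Any(unsigned char* d, const unsigned char* s, int w) {
  (void)d; (void)s; (void)w;
}
static void RowFn_Full(unsigned char* d, const unsigned char* s, int w) {
  (void)d; (void)s; (void)w;
}
static RowFn PickRow(int cpu_has_simd, int width_bytes) {
  RowFn fn = RowFn_C;                  // always-correct fallback
  if (cpu_has_simd && width_bytes >= 16) {
    fn = RowFn_Any;                    // handles ragged tails
    if (width_bytes % 16 == 0) {
      fn = RowFn_Full;                 // whole vectors only
    }
  }
  return fn;
}
int main(void) {
  printf("%d\n", PickRow(1, 48) == RowFn_Full);  // prints 1
  return 0;
}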
@ -954,80 +637,6 @@ void ScalePlaneVertical(int src_height,
y += dy;
}
}
void ScalePlaneVertical_16(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_argb, uint16* dst_argb,
int x, int y, int dy,
int wpp, enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_16_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
int j;
assert(wpp >= 1 && wpp <= 2);
assert(src_height != 0);
assert(dst_width > 0);
assert(dst_height > 0);
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(dst_width_words, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_DSPR2;
if (IS_ALIGNED(dst_width_words, 4)) {
InterpolateRow = InterpolateRow_16_DSPR2;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;
if (y > max_y) {
y = max_y;
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * src_stride,
src_stride, dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
}
}
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
@ -1044,6 +653,10 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
// If scaling to larger, switch from Box to Bilinear.
if (dst_width >= src_width || dst_height >= src_height) {
filtering = kFilterBilinear;
}
}
if (filtering == kFilterBilinear) {
if (src_height == 1) {
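// Condensed sketch (not the full function; the diff cuts it off mid-body)
// of the decisions visible in this hunk. The enum mirrors libyuv's
// FilterMode values.
#include <stdio.h>
enum FilterModeSketch { kNone, kLinear, kBilinear, kBox };
static enum FilterModeSketch Reduce(int sw, int sh, int dw, int dh,
                                    enum FilterModeSketch f) {
  if (f == kBox) {
    // Box only pays off when shrinking by more than 2x in both axes...
    if (dw * 2 >= sw && dh * 2 >= sh) f = kBilinear;
    // ...and never when enlarging in either axis (the added lines above).
    if (dw >= sw || dh >= sh) f = kBilinear;
  }
  return f;  // further reductions (1-pixel sources etc.) omitted
}
int main(void) {
  // Upscaling with Box requested degrades to Bilinear:
  printf("%d\n", Reduce(640, 360, 1280, 720, kBox) == kBilinear);  // 1
  return 0;
}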


@ -18,11 +18,10 @@ extern "C" {
// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
@ -31,6 +30,7 @@ void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -77,8 +77,8 @@ void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride;
__asm__ __volatile__ (
@ -89,6 +89,7 @@ void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bltz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -176,8 +177,8 @@ void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -186,6 +187,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -231,8 +233,8 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
@ -245,6 +247,7 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
@ -310,11 +313,12 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -356,13 +360,14 @@ void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@ -412,13 +417,14 @@ void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@ -464,12 +470,13 @@ void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -510,8 +517,8 @@ void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
@ -520,6 +527,7 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
@ -563,9 +571,9 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
@ -577,6 +585,7 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|


@ -16,8 +16,7 @@ extern "C" {
#endif
// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
// NEON downscalers with interpolation.
// Provided by Fritz Koenig
@ -26,12 +25,11 @@ extern "C" {
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -42,39 +40,15 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
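// Scalar equivalent (illustrative, not part of the patch) of the NEON loop
// above: vld2.8 de-interleaves even pixels into q0 and odd into q1, and
// only the odd register is stored - the same pick-the-odd-pixel convention
// as ScaleRowDown2_C.
#include <stdio.h>
static void ScaleRowDown2_Scalar(const unsigned char* src,
                                 unsigned char* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * 2 + 1];  // odd pixel, matching "vst1.8 {q1}"
  }
}
int main(void) {
  unsigned char src[8] = {0, 1, 2, 3, 4, 5, 6, 7}, dst[4];
  ScaleRowDown2_Scalar(src, dst, 4);
  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // 1 3 5 7
  return 0;
}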
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
"subs %2, %2, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // add adjacent
"vpaddl.u8 q1, q1 \n"
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
MEMACCESS(1)
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
@ -83,7 +57,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -98,11 +71,10 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -115,19 +87,16 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4
MEMACCESS(3)
"vld1.8 {q1}, [%3]! \n"
MEMACCESS(4)
"vld1.8 {q2}, [%4]! \n"
MEMACCESS(5)
"vld1.8 {q3}, [%5]! \n"
"vld1.8 {q1}, [r4]! \n"
"vld1.8 {q2}, [r5]! \n"
"vld1.8 {q3}, [%3]! \n"
"subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
@ -136,17 +105,13 @@ asm volatile (
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
MEMACCESS(1)
"vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_ptr1), // %3
"+r"(src_ptr2), // %4
"+r"(src_ptr3) // %5
:
: "q0", "q1", "q2", "q3", "memory", "cc"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
@ -157,12 +122,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -179,10 +143,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
@ -219,7 +182,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
@ -238,10 +200,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
@ -261,7 +222,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -290,17 +250,14 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q3}, [%3] \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d5[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -315,28 +272,22 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
asm volatile (
MEMACCESS(5)
"vld1.16 {q13}, [%5] \n"
MEMACCESS(6)
"vld1.8 {q14}, [%6] \n"
MEMACCESS(7)
"vld1.8 {q15}, [%7] \n"
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"vld1.8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
MEMACCESS(4)
"vld4.8 {d16, d17, d18, d19}, [%4]! \n"
"vld4.8 {d16, d17, d18, d19}, [r4]! \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to align the data
@ -413,20 +364,18 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride), // %3
"+r"(src_ptr1) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
: "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
@ -435,20 +384,17 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
MEMACCESS(4)
"vld1.16 {q13}, [%4] \n"
MEMACCESS(5)
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, %2, #12 \n"
@ -515,9 +461,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n"
MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -530,114 +474,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp;
asm volatile (
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
"veor q2, q2, q2 \n"
"veor q3, q3, q3 \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n"
"vaddw.u8 q2, q2, d0 \n"
"subs r12, r12, #1 \n"
"bgt 2b \n"
MEMACCESS(2)
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "=&r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
:
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
);
}
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// The NEON version mimics this formula:
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
asm volatile (
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q3, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"vadd.s32 q2, q1, q3 \n"
"vshl.i32 q0, q3, #1 \n" // 8 * dx
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
LOAD2_DATA8_LANE(3)
LOAD2_DATA8_LANE(4)
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
"vmov q10, q1 \n"
"vmov q11, q2 \n"
"vuzp.16 q10, q11 \n"
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vsubl.s16 q11, d18, d16 \n"
"vsubl.s16 q12, d19, d17 \n"
"vmovl.u16 q13, d20 \n"
"vmovl.u16 q10, d21 \n"
"vmul.s32 q11, q11, q13 \n"
"vmul.s32 q12, q12, q10 \n"
"vrshrn.s32 d18, q11, #16 \n"
"vrshrn.s32 d19, q12, #16 \n"
"vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0]! \n" // store pixels
"vadd.s32 q1, q1, q0 \n"
"vadd.s32 q2, q2, q0 \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13"
);
}
#undef LOAD2_DATA8_LANE
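// Illustrative scalar restatement (not part of the patch) of why
// LOAD2_DATA8_LANE exists: the x/dx stepping yields data-dependent gather
// addresses that NEON cannot load in one instruction, so each lane pays an
// address computation plus a two-byte lane load.
#include <stdio.h>
int main(void) {
  int x = 0, dx = 3 << 14;  // 0.75 step in 16.16 fixed point
  int lane;
  for (lane = 0; lane < 8; ++lane) {
    int xi = x >> 16;       // "lsr %5, %3, #16"
    printf("lane %d: src[%d], src[%d]\n", lane, xi, xi + 1);
    x += dx;                // "add %3, %3, %4"
  }
  return 0;
}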
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
@ -658,9 +494,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
@ -669,63 +503,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
MEMACCESS(0)
"vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@ -740,16 +561,13 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS(0)
"vld2.32 {q0, q1}, [%0]! \n"
MEMACCESS(0)
"vld2.32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels
MEMACCESS(1)
"vst1.8 {q3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -760,52 +578,21 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
"vrshrn.u16 d2, q2, #1 \n"
"vrshrn.u16 d3, q3, #1 \n"
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
MEMACCESS(1)
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
MEMACCESS(1)
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
@ -815,7 +602,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n"
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@ -833,17 +619,13 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile (
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], r12 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], r12 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
@ -862,22 +644,15 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
asm volatile (
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
"vld1.8 {d1}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d3}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d5}, [%1], r12 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], r12 \n"
MEMACCESS(1)
"vld1.8 {d7}, [%1], r12 \n"
"vaddl.u8 q0, d0, d1 \n"
"vaddl.u8 q1, d2, d3 \n"
@ -890,7 +665,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
"vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
@ -902,118 +676,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {"#dn"["#n"]}, [%6] \n"
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int tmp;
const uint8* src_tmp = src_argb;
asm volatile (
"1: \n"
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
MEMACCESS(0)
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"=&r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1"
);
}
#undef LOAD1_DATA32_LANE
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
asm volatile (
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q9, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
"vmov.i8 q3, #0x7f \n" // 0x7F
"vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q8, q1, q0 \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
LOAD2_DATA32_LANE(d0, d2, 1)
LOAD2_DATA32_LANE(d1, d3, 0)
LOAD2_DATA32_LANE(d1, d3, 1)
"vshrn.i32 d22, q8, #9 \n"
"vand.16 d22, d22, d30 \n"
"vdup.8 d24, d22[0] \n"
"vdup.8 d25, d22[2] \n"
"vdup.8 d26, d22[4] \n"
"vdup.8 d27, d22[6] \n"
"vext.8 d4, d24, d25, #4 \n"
"vext.8 d5, d26, d27, #4 \n" // f
"veor.8 q10, q2, q3 \n" // 0x7f ^ f
"vmull.u8 q11, d0, d20 \n"
"vmull.u8 q12, d1, d21 \n"
"vmull.u8 q13, d2, d4 \n"
"vmull.u8 q14, d3, d5 \n"
"vadd.i16 q11, q11, q13 \n"
"vadd.i16 q12, q12, q14 \n"
"vshrn.i16 d0, q11, #7 \n"
"vshrn.i16 d1, q12, #7 \n"
MEMACCESS(0)
"vst1.32 {d0, d1}, [%0]! \n" // store pixels
"vadd.s32 q8, q8, q9 \n"
"subs %2, %2, #4 \n" // 4 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
#undef LOAD2_DATA32_LANE
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large.


@ -25,7 +25,6 @@ struct FourCCAliasEntry {
static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
@ -34,7 +33,7 @@ static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_BA81, FOURCC_BGGR},
{FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
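// Usage sketch for the alias table above, via libyuv's public
// CanonicalFourCC() from video_common.h (illustrative, not part of the
// patch):
#include <stdio.h>
#include "libyuv/video_common.h"  // CanonicalFourCC, FOURCC
int main(void) {
  // IYUV is an alias of I420 per the table.
  uint32 fourcc = CanonicalFourCC(FOURCC('I', 'Y', 'U', 'V'));
  printf("canonical: %c%c%c%c\n",
         (char)(fourcc & 0xff), (char)((fourcc >> 8) & 0xff),
         (char)((fourcc >> 16) & 0xff), (char)((fourcc >> 24) & 0xff));
  return 0;  // prints "canonical: I420"
}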


@ -84,14 +84,6 @@ def main(_):
help='Additional arguments to --gtest_filter')
parser.add_option('', '--gtest_repeat',
help='Argument for --gtest_repeat')
parser.add_option("--gtest_shuffle", action="store_true", default=False,
help="Randomize tests' orders on every iteration.")
parser.add_option("--gtest_break_on_failure", action="store_true",
default=False,
help="Drop in to debugger on assertion failure. Also "
"useful for forcing tests to exit with a stack dump "
"on the first assertion failure when running with "
"--gtest_repeat=-1")
parser.add_option('-v', '--verbose', action='store_true', default=False,
help='Verbose output - enable debug log messages')
parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck',
@ -103,12 +95,6 @@ def main(_):
'instead of /tmp.\nThis can be useful for tool '
'developers/maintainers.\nPlease note that the <tool>'
'.logs directory will be clobbered on tool startup.'))
parser.add_option("--test-launcher-bot-mode", action="store_true",
help="run the tests with --test-launcher-bot-mode")
parser.add_option("--test-launcher-total-shards", type=int,
help="run the tests with --test-launcher-total-shards")
parser.add_option("--test-launcher-shard-index", type=int,
help="run the tests with --test-launcher-shard-index")
options, args = parser.parse_args()
if options.verbose:


@ -13,7 +13,7 @@
namespace libyuv {
TEST_F(LibYUVBaseTest, Endian) {
TEST_F(libyuvTest, Endian) {
uint16 v16 = 0x1234u;
uint8 first_byte = *reinterpret_cast<uint8*>(&v16);
#if defined(LIBYUV_LITTLE_ENDIAN)
@ -23,7 +23,7 @@ TEST_F(LibYUVBaseTest, Endian) {
#endif
}
TEST_F(LibYUVBaseTest, SizeOfTypes) {
TEST_F(libyuvTest, SizeOfTypes) {
int8 i8 = -1;
uint8 u8 = 1u;
int16 i16 = -1;
@ -50,7 +50,7 @@ TEST_F(LibYUVBaseTest, SizeOfTypes) {
EXPECT_LT(0u, u64);
}
TEST_F(LibYUVBaseTest, SizeOfConstants) {
TEST_F(libyuvTest, SizeOfConstants) {
EXPECT_EQ(8u, sizeof(INT64_C(0)));
EXPECT_EQ(8u, sizeof(UINT64_C(0)));
EXPECT_EQ(8u, sizeof(INT64_C(0x1234567887654321)));


@ -16,7 +16,7 @@
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
#include "libyuv/row.h"
namespace libyuv {
@ -31,10 +31,10 @@ static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
return hash;
}
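// Standalone restatement (illustrative) of the djb2 recurrence computed by
// the truncated ReferenceHashDjb2 above: hash = hash * 33 + byte, with the
// tests' customary seed of 5381; uint32 wraparound is intended.
#include <stdio.h>
static unsigned int HashDjb2Ref(const unsigned char* src,
                                unsigned long long count,
                                unsigned int seed) {
  unsigned int hash = seed;
  unsigned long long i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];
  }
  return hash;
}
int main(void) {
  const unsigned char msg[3] = {'f', 'o', 'x'};
  printf("%u\n", HashDjb2Ref(msg, 3, 5381));
  return 0;
}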
TEST_F(LibYUVBaseTest, Djb2_Test) {
TEST_F(libyuvTest, Djb2_Test) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest);
align_buffer_page_end(src_b, kMaxTest);
align_buffer_64(src_a, kMaxTest);
align_buffer_64(src_b, kMaxTest);
const char* fox = "The quick brown fox jumps over the lazy dog"
" and feels as if he were in the seventh heaven of typography"
@ -44,8 +44,8 @@ TEST_F(LibYUVBaseTest, Djb2_Test) {
EXPECT_EQ(kExpectedFoxHash, foxhash);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i] = (fastrand() & 0xff);
src_b[i] = (fastrand() & 0xff);
src_a[i] = (random() & 0xff);
src_b[i] = (random() & 0xff);
}
// Compare different buffers. Expect hash is different.
uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
@ -111,13 +111,13 @@ TEST_F(LibYUVBaseTest, Djb2_Test) {
h2 = HashDjb2(src_a, kMaxTest / 2, 0);
EXPECT_EQ(h1, h2);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, BenchmarkDjb2_Opt) {
TEST_F(libyuvTest, BenchmarkDjb2_Opt) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest);
align_buffer_64(src_a, kMaxTest);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i] = i;
@ -128,12 +128,12 @@ TEST_F(LibYUVBaseTest, BenchmarkDjb2_Opt) {
h1 = HashDjb2(src_a, kMaxTest, 5381);
}
EXPECT_EQ(h1, h2);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_64(src_a);
}
TEST_F(LibYUVBaseTest, BenchmarkDjb2_Unaligned) {
TEST_F(libyuvTest, BenchmarkDjb2_Unaligned) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest + 1);
align_buffer_64(src_a, kMaxTest + 1);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i + 1] = i;
}
@ -143,68 +143,13 @@ TEST_F(LibYUVBaseTest, BenchmarkDjb2_Unaligned) {
h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
}
EXPECT_EQ(h1, h2);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_64(src_a);
}
TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Opt) {
uint32 fourcc;
const int kMaxTest = benchmark_width_ * benchmark_height_ * 4;
align_buffer_page_end(src_a, kMaxTest);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i] = 255;
}
src_a[0] = 0;
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
src_a[0] = 255;
src_a[3] = 0;
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
src_a[3] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
}
EXPECT_EQ(0, fourcc);
free_aligned_buffer_page_end(src_a);
}
TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
uint32 fourcc;
const int kMaxTest = benchmark_width_ * benchmark_height_ * 4 + 1;
align_buffer_page_end(src_a, kMaxTest);
for (int i = 1; i < kMaxTest; ++i) {
src_a[i] = 255;
}
src_a[0 + 1] = 0;
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
src_a[0 + 1] = 255;
src_a[3 + 1] = 0;
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
src_a[3 + 1] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
}
EXPECT_EQ(0, fourcc);
free_aligned_buffer_page_end(src_a);
}
TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
TEST_F(libyuvTest, BenchmarkSumSquareError_Opt) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
align_buffer_64(src_a, kMaxWidth);
align_buffer_64(src_b, kMaxWidth);
memset(src_a, 0, kMaxWidth);
memset(src_b, 0, kMaxWidth);
@ -228,14 +173,14 @@ TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
EXPECT_EQ(0, h1);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, SumSquareError) {
TEST_F(libyuvTest, SumSquareError) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
align_buffer_64(src_a, kMaxWidth);
align_buffer_64(src_b, kMaxWidth);
memset(src_a, 0, kMaxWidth);
memset(src_b, 0, kMaxWidth);
@ -255,32 +200,34 @@ TEST_F(LibYUVBaseTest, SumSquareError) {
EXPECT_EQ(kMaxWidth * 3 * 3, err);
srandom(time(NULL));
for (int i = 0; i < kMaxWidth; ++i) {
src_a[i] = (fastrand() & 0xff);
src_b[i] = (fastrand() & 0xff);
src_a[i] = (random() & 0xff);
src_b[i] = (random() & 0xff);
}
MaskCpuFlags(disable_cpu_flags_);
MaskCpuFlags(0);
uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
EXPECT_EQ(c_err, opt_err);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
TEST_F(libyuvTest, BenchmarkPsnr_Opt) {
align_buffer_64(src_a, benchmark_width_ * benchmark_height_);
align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
src_a[i] = i;
src_b[i] = i;
}
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
@ -293,43 +240,18 @@ TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
EXPECT_EQ(0, 0);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, BenchmarkPsnr_Unaligned) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_ + 1);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
src_a[i + 1] = i;
src_b[i] = i;
}
MaskCpuFlags(benchmark_cpu_info_);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a + 1, benchmark_width_,
src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
EXPECT_EQ(0, 0);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
}
TEST_F(LibYUVBaseTest, Psnr) {
TEST_F(libyuvTest, Psnr) {
const int kSrcWidth = benchmark_width_;
const int kSrcHeight = benchmark_height_;
const int b = 128;
const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
const int kSrcStride = 2 * b + kSrcWidth;
align_buffer_page_end(src_a, kSrcPlaneSize);
align_buffer_page_end(src_b, kSrcPlaneSize);
align_buffer_64(src_a, kSrcPlaneSize);
align_buffer_64(src_b, kSrcPlaneSize);
memset(src_a, 0, kSrcPlaneSize);
memset(src_b, 0, kSrcPlaneSize);
@ -370,24 +292,26 @@ TEST_F(LibYUVBaseTest, Psnr) {
EXPECT_LT(err, 6.0);
}
srandom(time(NULL));
memset(src_a, 0, kSrcPlaneSize);
memset(src_b, 0, kSrcPlaneSize);
for (int i = b; i < (kSrcHeight + b); ++i) {
for (int j = b; j < (kSrcWidth + b); ++j) {
src_a[(i * kSrcStride) + j] = (fastrand() & 0xff);
src_b[(i * kSrcStride) + j] = (fastrand() & 0xff);
src_a[(i * kSrcStride) + j] = (random() & 0xff);
src_b[(i * kSrcStride) + j] = (random() & 0xff);
}
}
MaskCpuFlags(disable_cpu_flags_);
MaskCpuFlags(0);
double c_err, opt_err;
c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
@ -395,19 +319,19 @@ TEST_F(LibYUVBaseTest, Psnr) {
EXPECT_EQ(opt_err, c_err);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
TEST_F(libyuvTest, DISABLED_BenchmarkSsim_Opt) {
align_buffer_64(src_a, benchmark_width_ * benchmark_height_);
align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
src_a[i] = i;
src_b[i] = i;
}
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
@ -420,18 +344,18 @@ TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
EXPECT_EQ(0, 0); // Pass if we get this far.
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(LibYUVBaseTest, Ssim) {
TEST_F(libyuvTest, Ssim) {
const int kSrcWidth = benchmark_width_;
const int kSrcHeight = benchmark_height_;
const int b = 128;
const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
const int kSrcStride = 2 * b + kSrcWidth;
align_buffer_page_end(src_a, kSrcPlaneSize);
align_buffer_page_end(src_b, kSrcPlaneSize);
align_buffer_64(src_a, kSrcPlaneSize);
align_buffer_64(src_b, kSrcPlaneSize);
memset(src_a, 0, kSrcPlaneSize);
memset(src_b, 0, kSrcPlaneSize);
@ -482,21 +406,22 @@ TEST_F(LibYUVBaseTest, Ssim) {
EXPECT_LT(err, 0.01);
}
srandom(time(NULL));
for (int i = b; i < (kSrcHeight + b); ++i) {
for (int j = b; j < (kSrcWidth + b); ++j) {
src_a[(i * kSrcStride) + j] = (fastrand() & 0xff);
src_b[(i * kSrcStride) + j] = (fastrand() & 0xff);
src_a[(i * kSrcStride) + j] = (random() & 0xff);
src_b[(i * kSrcStride) + j] = (random() & 0xff);
}
}
MaskCpuFlags(disable_cpu_flags_);
MaskCpuFlags(0);
double c_err, opt_err;
c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
MaskCpuFlags(benchmark_cpu_info_);
MaskCpuFlags(-1);
opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
src_b + kSrcStride * b + b, kSrcStride,
@ -506,8 +431,8 @@ TEST_F(LibYUVBaseTest, Ssim) {
EXPECT_EQ(opt_err, c_err);
}
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
} // namespace libyuv

File diff suppressed because it is too large.


@ -18,7 +18,7 @@
namespace libyuv {
TEST_F(LibYUVBaseTest, TestCpuHas) {
TEST_F(libyuvTest, TestCpuHas) {
int cpu_flags = TestCpuFlag(-1);
printf("Cpu Flags %x\n", cpu_flags);
int has_arm = TestCpuFlag(kCpuHasARM);
@ -43,39 +43,17 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
printf("Has ERMS %x\n", has_erms);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
printf("Has FMA3 %x\n", has_fma3);
int has_avx3 = TestCpuFlag(kCpuHasAVX3);
printf("Has AVX3 %x\n", has_avx3);
int has_mips = TestCpuFlag(kCpuHasMIPS);
printf("Has MIPS %x\n", has_mips);
int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
printf("Has DSPR2 %x\n", has_dspr2);
}
TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
#if defined(__aarch64__)
printf("Arm64 build\n");
#endif
#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)
printf("Neon build enabled\n");
#endif
#if defined(__x86_64__) || defined(_M_X64)
printf("x64 build\n");
#endif
#ifdef _MSC_VER
printf("_MSC_VER %d\n", _MSC_VER);
#endif
#if !defined(LIBYUV_DISABLE_X86) && (defined(GCC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(VISUALC_HAS_AVX2))
printf("Has AVX2 1\n");
#else
printf("Has AVX2 0\n");
// If the compiler does not support AVX2, the following function is not expected:
#endif
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
printf("Has MIPS DSP %x\n", has_mips_dsp);
int has_mips_dspr2 = TestCpuFlag(kCpuHasMIPS_DSPR2);
printf("Has MIPS DSPR2 %x\n", has_mips_dspr2);
}
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
TEST_F(LibYUVBaseTest, TestCpuId) {
TEST_F(libyuvTest, TestCpuId) {
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
uint32 cpu_info[4];
@ -114,25 +92,16 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
}
#endif
static int FileExists(const char* file_name) {
FILE* f = fopen(file_name, "r");
if (!f) {
return 0;
}
fclose(f);
return 1;
}
TEST_F(LibYUVBaseTest, TestLinuxNeon) {
if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
TEST_F(libyuvTest, TestLinuxNeon) {
int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
if (testdata) {
EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt"));
} else {
printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
}
#if defined(__linux__) && defined(__ARM_NEON__)
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("/proc/cpuinfo"));
EXPECT_NE(0, ArmCpuCaps("/proc/cpuinfo"));
#endif
}


@ -10,17 +10,17 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
TEST_F(LibYUVBaseTest, TestFixedDiv) {
TEST_F(libyuvTest, TestFixedDiv) {
int num[1280];
int div[1280];
int result_opt[1280];
@ -65,6 +65,7 @@ TEST_F(LibYUVBaseTest, TestFixedDiv) {
}
EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
srandom(time(NULL));
MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {
@ -84,12 +85,13 @@ TEST_F(LibYUVBaseTest, TestFixedDiv) {
}
}
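// Illustrative restatement (not the real implementation, which uses
// platform divide tricks) of the contract the EXPECT_EQ above pins down:
// FixedDiv returns the 16.16 fixed-point quotient.
#include <stdio.h>
static int FixedDivRef(int num, int div) {
  return (int)(((long long)num << 16) / div);
}
int main(void) {
  printf("%d\n", FixedDivRef(123, 1) == 123 * 65536);  // prints 1
  printf("half = 0x%x\n", FixedDivRef(1, 2));          // 0x8000
  return 0;
}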
TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
TEST_F(libyuvTest, TestFixedDiv_Opt) {
int num[1280];
int div[1280];
int result_opt[1280];
int result_c[1280];
srandom(time(NULL));
MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {
@ -118,12 +120,13 @@ TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
}
}
TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
TEST_F(libyuvTest, TestFixedDiv1_Opt) {
int num[1280];
int div[1280];
int result_opt[1280];
int result_c[1280];
srandom(time(NULL));
MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {

File diff suppressed because it is too large.


@ -9,9 +9,11 @@
*/
#include <stdlib.h>
#include <time.h>
#include "libyuv/cpu_id.h"
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
@ -20,8 +22,6 @@ void TestRotateBpp(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info,
const int kBpp) {
if (src_width < 1) {
src_width = 1;
@ -36,38 +36,38 @@ void TestRotateBpp(int src_width, int src_height,
dst_height = 1;
}
int src_stride_argb = src_width * kBpp;
int src_argb_plane_size = src_stride_argb * abs(src_height);
align_buffer_page_end(src_argb, src_argb_plane_size);
int src_argb_plane_size = src_stride_argb * src_height;
align_buffer_64(src_argb, src_argb_plane_size);
for (int i = 0; i < src_argb_plane_size; ++i) {
src_argb[i] = fastrand() & 0xff;
src_argb[i] = random() & 0xff;
}
int dst_stride_argb = dst_width * kBpp;
int dst_argb_plane_size = dst_stride_argb * dst_height;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
align_buffer_64(dst_argb_c, dst_argb_plane_size);
align_buffer_64(dst_argb_opt, dst_argb_plane_size);
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
if (kBpp == 1) {
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
RotatePlane(src_argb, src_stride_argb,
dst_argb_c, dst_stride_argb,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
RotatePlane(src_argb, src_stride_argb,
dst_argb_opt, dst_stride_argb,
src_width, src_height, mode);
}
} else if (kBpp == 4) {
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
ARGBRotate(src_argb, src_stride_argb,
dst_argb_c, dst_stride_argb,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBRotate(src_argb, src_stride_argb,
dst_argb_opt, dst_stride_argb,
@ -80,117 +80,123 @@ void TestRotateBpp(int src_width, int src_height,
EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
}
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
free_aligned_buffer_page_end(src_argb);
free_aligned_buffer_64(dst_argb_c);
free_aligned_buffer_64(dst_argb_opt);
free_aligned_buffer_64(src_argb);
}
static void ARGBTestRotate(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
int benchmark_iterations) {
TestRotateBpp(src_width, src_height,
dst_width, dst_height,
mode, benchmark_iterations,
disable_cpu_flags, benchmark_cpu_info, 4);
mode, benchmark_iterations, 4);
}
TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) {
TEST_F(libyuvTest, ARGBRotate0) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) {
TEST_F(libyuvTest, ARGBRotate90) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) {
TEST_F(libyuvTest, ARGBRotate180) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) {
TEST_F(libyuvTest, ARGBRotate270) {
ARGBTestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
TEST_F(libyuvTest, ARGBRotate0_Odd) {
ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_);
}
TEST_F(libyuvTest, ARGBRotate90_Odd) {
ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_);
}
TEST_F(libyuvTest, ARGBRotate180_Odd) {
ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_);
}
TEST_F(libyuvTest, ARGBRotate270_Odd) {
ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_);
}
static void TestRotatePlane(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
int benchmark_iterations) {
TestRotateBpp(src_width, src_height,
dst_width, dst_height,
mode, benchmark_iterations,
disable_cpu_flags, benchmark_cpu_info, 1);
mode, benchmark_iterations, 1);
}
TEST_F(LibYUVRotateTest, RotatePlane0_Opt) {
TEST_F(libyuvTest, RotatePlane0) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, RotatePlane90_Opt) {
TEST_F(libyuvTest, RotatePlane90) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, RotatePlane180_Opt) {
TEST_F(libyuvTest, RotatePlane180) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
TEST_F(libyuvTest, RotatePlane270) {
TestRotatePlane(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
TEST_F(libyuvTest, RotatePlane0_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
TEST_F(libyuvTest, RotatePlane90_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
TEST_F(libyuvTest, RotatePlane180_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
TEST_F(libyuvTest, RotatePlane270_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
} // namespace libyuv

View File

@@ -9,9 +9,11 @@
*/
#include <stdlib.h>
#include <time.h>
#include "libyuv/cpu_id.h"
#include "libyuv/rotate.h"
#include "libyuv/row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
@@ -19,12 +21,11 @@ namespace libyuv {
static void I420TestRotate(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
int benchmark_iterations) {
if (src_width < 1) {
src_width = 1;
}
if (src_height == 0) {
if (src_height < 1) {
src_height = 1;
}
if (dst_width < 1) {
@@ -33,23 +34,23 @@ static void I420TestRotate(int src_width, int src_height,
if (dst_height < 1) {
dst_height = 1;
}
int src_i420_y_size = src_width * Abs(src_height);
int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
int src_i420_y_size = src_width * src_height;
int src_i420_uv_size = ((src_width + 1) / 2) * ((src_height + 1) / 2);
int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
align_buffer_page_end(src_i420, src_i420_size);
align_buffer_64(src_i420, src_i420_size);
for (int i = 0; i < src_i420_size; ++i) {
src_i420[i] = fastrand() & 0xff;
src_i420[i] = random() & 0xff;
}
int dst_i420_y_size = dst_width * dst_height;
int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
align_buffer_page_end(dst_i420_c, dst_i420_size);
align_buffer_page_end(dst_i420_opt, dst_i420_size);
align_buffer_64(dst_i420_c, dst_i420_size);
align_buffer_64(dst_i420_opt, dst_i420_size);
memset(dst_i420_c, 2, dst_i420_size);
memset(dst_i420_opt, 3, dst_i420_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
I420Rotate(src_i420, src_width,
src_i420 + src_i420_y_size, (src_width + 1) / 2,
src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
@@ -59,7 +60,7 @@ static void I420TestRotate(int src_width, int src_height,
(dst_width + 1) / 2,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
I420Rotate(src_i420, src_width,
src_i420 + src_i420_y_size, (src_width + 1) / 2,
@@ -77,79 +78,67 @@ static void I420TestRotate(int src_width, int src_height,
EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
}
free_aligned_buffer_page_end(dst_i420_c);
free_aligned_buffer_page_end(dst_i420_opt);
free_aligned_buffer_page_end(src_i420);
free_aligned_buffer_64(dst_i420_c);
free_aligned_buffer_64(dst_i420_opt);
free_aligned_buffer_64(src_i420);
}
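The size arithmetic above encodes the I420 memory layout: a full-resolution Y plane followed by U and V planes subsampled 2x2, with (x + 1) / 2 rounding odd dimensions up. A small sketch of the resulting offsets, under the same rounding rules:
// I420 plane sizes and offsets within one contiguous buffer (sketch).
void I420Offsets(int width, int height,
                 int* u_offset, int* v_offset, int* total) {
  int y_size = width * height;
  int uv_size = ((width + 1) / 2) * ((height + 1) / 2);  // Quarter size.
  *u_offset = y_size;            // U plane follows Y.
  *v_offset = y_size + uv_size;  // V plane follows U.
  *total = y_size + 2 * uv_size;
}
For a 3x3 image this gives 9 Y bytes plus two 2x2 chroma planes, 17 bytes total, matching the src_i420_size computation above.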
TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
TEST_F(libyuvTest, I420Rotate0) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
TEST_F(libyuvTest, I420Rotate90) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
TEST_F(libyuvTest, I420Rotate180) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
TEST_F(libyuvTest, I420Rotate270) {
I420TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
// TODO(fbarchard): Remove odd width tests.
// Odd width tests work but are disabled because they use C code, which can be
// tested by passing an odd width on the command line or via environment variable.
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
TEST_F(libyuvTest, I420Rotate0_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
TEST_F(libyuvTest, I420Rotate90_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
TEST_F(libyuvTest, I420Rotate180_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
TEST_F(libyuvTest, I420Rotate270_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
static void NV12TestRotate(int src_width, int src_height,
int dst_width, int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
int benchmark_iterations) {
if (src_width < 1) {
src_width = 1;
}
if (src_height == 0) { // allow negative for inversion test.
if (src_height < 1) {
src_height = 1;
}
if (dst_width < 1) {
@@ -158,24 +147,23 @@ static void NV12TestRotate(int src_width, int src_height,
if (dst_height < 1) {
dst_height = 1;
}
int src_nv12_y_size = src_width * Abs(src_height);
int src_nv12_uv_size =
((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2;
int src_nv12_y_size = src_width * src_height;
int src_nv12_uv_size = ((src_width + 1) / 2) * ((src_height + 1) / 2) * 2;
int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
align_buffer_page_end(src_nv12, src_nv12_size);
align_buffer_64(src_nv12, src_nv12_size);
for (int i = 0; i < src_nv12_size; ++i) {
src_nv12[i] = fastrand() & 0xff;
src_nv12[i] = random() & 0xff;
}
int dst_i420_y_size = dst_width * dst_height;
int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
align_buffer_page_end(dst_i420_c, dst_i420_size);
align_buffer_page_end(dst_i420_opt, dst_i420_size);
align_buffer_64(dst_i420_c, dst_i420_size);
align_buffer_64(dst_i420_opt, dst_i420_size);
memset(dst_i420_c, 2, dst_i420_size);
memset(dst_i420_opt, 3, dst_i420_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
NV12ToI420Rotate(src_nv12, src_width,
src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
dst_i420_c, dst_width,
@@ -184,7 +172,7 @@ static void NV12TestRotate(int src_width, int src_height,
(dst_width + 1) / 2,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
NV12ToI420Rotate(src_nv12, src_width,
src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
@@ -200,97 +188,57 @@ static void NV12TestRotate(int src_width, int src_height,
EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
}
free_aligned_buffer_page_end(dst_i420_c);
free_aligned_buffer_page_end(dst_i420_opt);
free_aligned_buffer_page_end(src_nv12);
free_aligned_buffer_64(dst_i420_c);
free_aligned_buffer_64(dst_i420_opt);
free_aligned_buffer_64(src_nv12);
}
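NV12 differs from I420 only in chroma storage: one interleaved UV plane rather than two planar ones, hence the factor of 2 in src_nv12_uv_size and the even UV stride (src_width + 1) & ~1 passed to NV12ToI420Rotate. A sizing sketch under those rules:
// NV12 buffer size (sketch): full Y plane plus one interleaved UV plane.
int NV12Size(int width, int height) {
  int uv_stride = (width + 1) & ~1;   // U,V byte pairs require an even stride.
  int uv_height = (height + 1) / 2;   // Chroma is vertically halved.
  return width * height + uv_stride * uv_height;
}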
TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
TEST_F(libyuvTest, NV12Rotate0) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
TEST_F(libyuvTest, NV12Rotate90) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
TEST_F(libyuvTest, NV12Rotate180) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
TEST_F(libyuvTest, NV12Rotate270) {
NV12TestRotate(benchmark_width_, benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
TEST_F(libyuvTest, NV12Rotate0_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate0, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
TEST_F(libyuvTest, NV12Rotate90_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate90, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
TEST_F(libyuvTest, NV12Rotate180_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_width_ - 3, benchmark_height_ - 1,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate180, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
TEST_F(libyuvTest, NV12Rotate270_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
benchmark_height_ - 1, benchmark_width_ - 3,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
kRotate270, benchmark_iterations_);
}
TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_width_, benchmark_height_,
kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
NV12TestRotate(benchmark_width_, -benchmark_height_,
benchmark_height_, benchmark_width_,
kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
} // namespace libyuv

View File

@@ -11,64 +11,48 @@
#include <stdlib.h>
#include <time.h>
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale_argb.h"
#include "libyuv/video_common.h"
#include "libyuv/row.h"
#include "../unit_test/unit_test.h"
namespace libyuv {
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int ARGBTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
FilterMode f, int benchmark_iterations) {
const int b = 128;
int i, j;
const int b = 0; // 128 to test for padding/stride.
int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2) * 4LL;
int src_argb_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2) * 4;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
align_buffer_page_end(src_argb, src_argb_plane_size);
if (!src_argb) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
align_buffer_64(src_argb, src_argb_plane_size);
srandom(time(NULL));
MemRandomize(src_argb, src_argb_plane_size);
int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
int dst_stride_argb = (b * 2 + dst_width) * 4;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
if (!dst_argb_c || !dst_argb_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
align_buffer_64(dst_argb_c, dst_argb_plane_size);
align_buffer_64(dst_argb_opt, dst_argb_plane_size);
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
// Warm up both versions for consistent benchmarks.
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
double c_time = get_time();
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
@@ -77,7 +61,7 @@ static int ARGBTestFilter(int src_width, int src_height,
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < benchmark_iterations; ++i) {
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
@@ -106,9 +90,9 @@ static int ARGBTestFilter(int src_width, int src_height,
}
}
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
free_aligned_buffer_page_end(src_argb);
free_aligned_buffer_64(dst_argb_c);
free_aligned_buffer_64(dst_argb_opt);
free_aligned_buffer_64(src_argb);
return max_diff;
}
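The constant b above gives every buffer a guard band: the scaler is handed a pointer b rows down and b pixels in, so any read or write past the requested region lands in the band and surfaces as a mismatch. A sketch of the interior-pointer arithmetic for 4-byte ARGB pixels:
// Interior pointer into a bordered ARGB buffer (sketch).
inline uint8* InteriorARGB(uint8* buf, int stride, int b) {
  return buf + (stride * b)   // Skip b border rows.
             + (b * 4);       // Skip b border pixels (4 bytes each).
}
Note the restored code sets b = 128 while the backed-out version used b = 0, trading guard-band coverage for speed.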
@@ -146,38 +130,28 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
static int ARGBClipTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
const int b = 128;
int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
int src_argb_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2) * 4;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
align_buffer_page_end(src_argb, src_argb_plane_size);
if (!src_argb) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
align_buffer_64(src_argb, src_argb_plane_size);
memset(src_argb, 1, src_argb_plane_size);
int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
int dst_stride_argb = (b * 2 + dst_width) * 4;
srandom(time(NULL));
int i, j;
for (i = b; i < (Abs(src_height) + b); ++i) {
for (j = b; j < (Abs(src_width) + b) * 4; ++j) {
src_argb[(i * src_stride_argb) + j] = (fastrand() & 0xff);
src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
}
}
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
if (!dst_argb_c || !dst_argb_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
align_buffer_64(dst_argb_c, dst_argb_plane_size);
align_buffer_64(dst_argb_opt, dst_argb_plane_size);
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
@@ -215,81 +189,66 @@ static int ARGBClipTestFilter(int src_width, int src_height,
}
}
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
free_aligned_buffer_page_end(src_argb);
free_aligned_buffer_64(dst_argb_c);
free_aligned_buffer_64(dst_argb_opt);
free_aligned_buffer_64(src_argb);
return max_diff;
}
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \
TEST_F(libyuvTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
Abs(benchmark_width_) * hfactor, \
Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_); \
TEST_F(libyuvTest, ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
Abs(benchmark_width_) * hfactor, \
Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// Test a scale factor with 2 filters. Expect unfiltered to be exact, but
// filtering uses different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, 3)
#define TEST_FACTOR(name, hfactor, vfactor) \
TEST_FACTOR1(name, None, hfactor, vfactor, 2) \
TEST_FACTOR1(name, Linear, hfactor, vfactor, 2) \
TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 2) \
TEST_FACTOR1(name, Box, hfactor, vfactor, 2)
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
TEST_FACTOR(8, 1, 8)
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
TEST_FACTOR(2, 1 / 2, 1 / 2)
TEST_FACTOR(4, 1 / 4, 1 / 4)
TEST_FACTOR(8, 1 / 8, 1 / 8)
TEST_FACTOR(3by4, 3 / 4, 3 / 4)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX
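Concretely, with the benchmark default of 1280 and the 3/4 factor (nom = 3, denom = 4), integer division gives 1280 / 3 = 426, so SX yields a source of 426 * 4 = 1704 and DX a destination of 426 * 3 = 1278; 1278 / 1704 is exactly 3/4, so the scaler is exercised at the precise factor. The same check in code:
// Worked example of the SX/DX dimension adjustment above (sketch).
int nom = 3, denom = 4, x = 1280;
int src = (x / nom) * denom;       // SX: 426 * 4 = 1704
int dst = (abs(x) / nom) * nom;    // DX: 426 * 3 = 1278
// dst : src == 1278 : 1704 == 3 : 4 exactly.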
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \
int diff = ARGBTestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##ClipTo##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##ClipFrom##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(width, height, \
Abs(benchmark_width_), \
Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
@@ -297,166 +256,15 @@ TEST_FACTOR(3, 1, 3)
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 3) \
TEST_SCALETO1(name, width, height, Bilinear, 3)
TEST_SCALETO1(name, width, height, Bilinear, 3) \
TEST_SCALETO1(name, width, height, Box, 3)
TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 352, 288)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
TEST_SCALETO(ARGBScale, 1280, 720)
#undef TEST_SCALETO1
#undef TEST_SCALETO
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleReference2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y,
int clip_width, int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = static_cast<uint8*>(malloc(src_width * src_height * 4));
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
free(argb_buffer);
return r;
}
static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) {
int rv = v;
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
*buf++ = v;
v += dx;
if (v < 0 || v > 255) {
dx = -dx;
v += dx;
}
}
v = rv + dy;
if (v < 0 || v > 255) {
dy = -dy;
v += dy;
}
rv = v;
}
}
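FillRamp advances v by dx across each row and by dy down the rows, reflecting at 0 and 255 so every byte stays in range while neighboring pixels differ by at most 1. For example, an 8-pixel row seeded at 253 with dx = 1 produces 253 254 255 255 254 253 252 251; the boundary value repeats once because the reflection happens after the store. A usage sketch:
// One-row ramp demonstrating the reflection at 255 (sketch).
uint8 row[8];
FillRamp(row, 8, 1, 253, 1, 1);   // row = {253,254,255,255,254,253,252,251}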
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int YUVToARGBTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
int64 src_y_plane_size = Abs(src_width) * Abs(src_height);
int64 src_uv_plane_size = ((Abs(src_width) + 1) / 2) *
((Abs(src_height) + 1) / 2);
int src_stride_y = Abs(src_width);
int src_stride_uv = (Abs(src_width) + 1) / 2;
align_buffer_page_end(src_y, src_y_plane_size);
align_buffer_page_end(src_u, src_uv_plane_size);
align_buffer_page_end(src_v, src_uv_plane_size);
int64 dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
int dst_stride_argb = (dst_width) * 4;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
// Fill the YUV image with a continuous ramp, which is less sensitive to
// subsampling and filtering differences for test purposes.
FillRamp(src_y, Abs(src_width), Abs(src_height), 128, 1, 1);
FillRamp(src_u, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 3, 1, 1);
FillRamp(src_v, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 4, 1, 1);
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
YUVToARGBScaleReference2(src_y, src_stride_y,
src_u, src_stride_uv,
src_v, src_stride_uv,
libyuv::FOURCC_I420,
src_width, src_height,
dst_argb_c, dst_stride_argb,
libyuv::FOURCC_I420,
dst_width, dst_height,
0, 0, dst_width, dst_height,
f);
for (int i = 0; i < benchmark_iterations; ++i) {
YUVToARGBScaleClip(src_y, src_stride_y,
src_u, src_stride_uv,
src_v, src_stride_uv,
libyuv::FOURCC_I420,
src_width, src_height,
dst_argb_opt, dst_stride_argb,
libyuv::FOURCC_I420,
dst_width, dst_height,
0, 0, dst_width, dst_height,
f);
}
int max_diff = 0;
for (int i = 0; i < dst_height; ++i) {
for (int j = 0; j < dst_width * 4; ++j) {
int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
dst_argb_opt[(i * dst_stride_argb) + j]);
if (abs_diff > max_diff) {
printf("error %d at %d,%d c %d opt %d",
abs_diff,
j, i,
dst_argb_c[(i * dst_stride_argb) + j],
dst_argb_opt[(i * dst_stride_argb) + j]);
EXPECT_LE(abs_diff, 40);
max_diff = abs_diff;
}
}
}
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
free_aligned_buffer_page_end(src_y);
free_aligned_buffer_page_end(src_u);
free_aligned_buffer_page_end(src_v);
return max_diff;
}
TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
int diff = YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
benchmark_width_ * 3 / 2,
benchmark_height_ * 3 / 2,
libyuv::kFilterBilinear,
benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
EXPECT_LE(diff, 10);
}
TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
int diff = YUVToARGBTestFilter(benchmark_width_ * 3 / 2,
benchmark_height_ * 3 / 2,
benchmark_width_, benchmark_height_,
libyuv::kFilterBilinear,
benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
EXPECT_LE(diff, 10);
}
} // namespace libyuv

View File

@@ -15,27 +15,19 @@
#include "libyuv/scale.h"
#include "../unit_test/unit_test.h"
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int TestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
FilterMode f, int benchmark_iterations) {
int i, j;
const int b = 0; // 128 to test for padding/stride.
const int b = 128;
int src_width_uv = (Abs(src_width) + 1) >> 1;
int src_height_uv = (Abs(src_height) + 1) >> 1;
int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
int src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
int src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
int src_stride_y = b * 2 + Abs(src_width);
int src_stride_uv = b * 2 + src_width_uv;
@@ -43,10 +35,7 @@ static int TestFilter(int src_width, int src_height,
align_buffer_page_end(src_y, src_y_plane_size)
align_buffer_page_end(src_u, src_uv_plane_size)
align_buffer_page_end(src_v, src_uv_plane_size)
if (!src_y || !src_u || !src_v) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
srandom(time(NULL));
MemRandomize(src_y, src_y_plane_size);
MemRandomize(src_u, src_uv_plane_size);
MemRandomize(src_v, src_uv_plane_size);
@@ -54,8 +43,8 @@ static int TestFilter(int src_width, int src_height,
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
int64 dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
int64 dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
int dst_stride_y = b * 2 + dst_width;
int dst_stride_uv = b * 2 + dst_width_uv;
@@ -66,13 +55,9 @@ static int TestFilter(int src_width, int src_height,
align_buffer_page_end(dst_y_opt, dst_y_plane_size)
align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
if (!dst_y_c || !dst_u_c || !dst_v_c ||
!dst_y_opt || !dst_u_opt || !dst_v_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
MaskCpuFlags(0); // Disable all CPU optimization.
double c_time = get_time();
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
@@ -84,7 +69,7 @@ static int TestFilter(int src_width, int src_height,
dst_width, dst_height, f);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
MaskCpuFlags(-1); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < benchmark_iterations; ++i) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
@@ -147,223 +132,54 @@ static int TestFilter(int src_width, int src_height,
return max_diff;
}
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
static int TestFilter_16(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f, int benchmark_iterations) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
int i, j;
const int b = 0; // 128 to test for padding/stride.
int src_width_uv = (Abs(src_width) + 1) >> 1;
int src_height_uv = (Abs(src_height) + 1) >> 1;
int64 src_y_plane_size = (Abs(src_width) + b * 2) *
(Abs(src_height) + b * 2);
int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
int src_stride_y = b * 2 + Abs(src_width);
int src_stride_uv = b * 2 + src_width_uv;
align_buffer_page_end(src_y, src_y_plane_size)
align_buffer_page_end(src_u, src_uv_plane_size)
align_buffer_page_end(src_v, src_uv_plane_size)
align_buffer_page_end(src_y_16, src_y_plane_size * 2)
align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16);
uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16);
MemRandomize(src_y, src_y_plane_size);
MemRandomize(src_u, src_uv_plane_size);
MemRandomize(src_v, src_uv_plane_size);
for (i = b; i < src_height + b; ++i) {
for (j = b; j < src_width + b; ++j) {
p_src_y_16[(i * src_stride_y) + j] = src_y[(i * src_stride_y) + j];
}
}
for (i = b; i < (src_height_uv + b); ++i) {
for (j = b; j < (src_width_uv + b); ++j) {
p_src_u_16[(i * src_stride_uv) + j] = src_u[(i * src_stride_uv) + j];
p_src_v_16[(i * src_stride_uv) + j] = src_v[(i * src_stride_uv) + j];
}
}
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
int dst_stride_y = b * 2 + dst_width;
int dst_stride_uv = b * 2 + dst_width_uv;
align_buffer_page_end(dst_y_8, dst_y_plane_size)
align_buffer_page_end(dst_u_8, dst_uv_plane_size)
align_buffer_page_end(dst_v_8, dst_uv_plane_size)
align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16);
uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16);
uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16);
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
for (i = 0; i < benchmark_iterations; ++i) {
I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y,
p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv,
p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv,
p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
}
// Expect an exact match
int max_diff = 0;
for (i = b; i < (dst_height + b); ++i) {
for (j = b; j < (dst_width + b); ++j) {
int abs_diff = Abs(dst_y_8[(i * dst_stride_y) + j] -
p_dst_y_16[(i * dst_stride_y) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
}
for (i = b; i < (dst_height_uv + b); ++i) {
for (j = b; j < (dst_width_uv + b); ++j) {
int abs_diff = Abs(dst_u_8[(i * dst_stride_uv) + j] -
p_dst_u_16[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
abs_diff = Abs(dst_v_8[(i * dst_stride_uv) + j] -
p_dst_v_16[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
}
free_aligned_buffer_page_end(dst_y_8)
free_aligned_buffer_page_end(dst_u_8)
free_aligned_buffer_page_end(dst_v_8)
free_aligned_buffer_page_end(dst_y_16)
free_aligned_buffer_page_end(dst_u_16)
free_aligned_buffer_page_end(dst_v_16)
free_aligned_buffer_page_end(src_y)
free_aligned_buffer_page_end(src_u)
free_aligned_buffer_page_end(src_v)
free_aligned_buffer_page_end(src_y_16)
free_aligned_buffer_page_end(src_u_16)
free_aligned_buffer_page_end(src_v_16)
return max_diff;
}
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
// The factor of 2 accounts for chroma subsampling.
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
int diff = TestFilter(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \
int diff = TestFilter_16(SX(benchmark_width_, nom, denom), \
SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), \
DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_); \
#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \
TEST_F(libyuvTest, ScaleDownBy##name##_##filter) { \
int diff = TestFilter(benchmark_width_, benchmark_height_, \
Abs(benchmark_width_) * hfactor, \
Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering uses different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom, boxdiff) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, boxdiff)
#define TEST_FACTOR(name, hfactor, vfactor) \
TEST_FACTOR1(name, None, hfactor, vfactor, 0) \
TEST_FACTOR1(name, Linear, hfactor, vfactor, 3) \
TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 3) \
TEST_FACTOR1(name, Box, hfactor, vfactor, 3) \
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
TEST_FACTOR(8, 1, 8, 0)
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
TEST_FACTOR(2, 1 / 2, 1 / 2)
TEST_FACTOR(4, 1 / 4, 1 / 4)
TEST_FACTOR(8, 1 / 8, 1 / 8)
TEST_FACTOR(3by4, 3 / 4, 3 / 4)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX
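The SX/DX pair above adds a halving-and-doubling step the ARGB version lacks: the working quotient is halved with rounding up and then doubled, so both adjusted dimensions come out even and the 2x2-subsampled chroma planes divide cleanly. For nom = 3, denom = 4, x = 1280: 1280 / 3 = 426, (426 + 1) / 2 = 213, giving src = 213 * 4 * 2 = 1704 and dst = 213 * 3 * 2 = 1278, again exactly 3/4 with even dimensions. In code:
// Worked example of the chroma-aware adjustment (sketch).
int nom = 3, denom = 4, x = 1280;
int m = (x / nom + 1) / 2;     // 426 -> 213
int src = m * denom * 2;       // SX: 1704 (even)
int dst = m * nom * 2;         // DX: 1278 (even)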
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \
int diff = TestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \
int diff = TestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
DISABLED_##name##To##width##x##height##_##filter##_16) { \
int diff = TestFilter_16(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
DISABLED_##name##From##width##x##height##_##filter##_16) { \
int diff = TestFilter_16(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 0) \
TEST_SCALETO1(name, width, height, Bilinear, 0) \
TEST_SCALETO1(name, width, height, Box, 0)
TEST_SCALETO1(name, width, height, Linear, 3) \
TEST_SCALETO1(name, width, height, Bilinear, 3) \
TEST_SCALETO1(name, width, height, Box, 3)
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 352, 288)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
TEST_SCALETO(Scale, 1280, 720)
#undef TEST_SCALETO1

View File

@@ -14,343 +14,42 @@
#include <cstring>
#include "gflags/gflags.h"
// Change this to 1000 for benchmarking.
// TODO(fbarchard): Add command line parsing to pass this as option.
#define BENCHMARK_ITERATIONS 1
unsigned int fastrand_seed = 0xfb;
DEFINE_int32(libyuv_width, 0, "width of test image.");
DEFINE_int32(libyuv_height, 0, "height of test image.");
DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
DEFINE_int32(libyuv_flags, 0,
"cpu flags for reference code. 1 = C, -1 = SIMD");
DEFINE_int32(libyuv_cpu_info, 0,
"cpu flags for benchmark code. 1 = C, -1 = SIMD");
// For quicker unittests, default is 128 x 72. But when benchmarking,
// default to 720p. Allow the size to be specified.
// Set flags to -1 for benchmarking to avoid slower C code.
LibYUVConvertTest::LibYUVConvertTest() :
libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
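The two trailing members precompute benchmark work counters: total pixels across all iterations divided by 256 and by 1280, with the + 255.0 and + 1279.0 terms supplying ceiling rounding. At, say, 1280 x 720 over 1000 iterations that is 921,600,000 pixels, giving 3,600,000 and 720,000 respectively. The same ceiling division in integer form, assuming 64-bit intermediates:
// Ceiling division behind benchmark_pixels_div256_/div1280_ (sketch).
int64 CeilDiv(int64 pixels, int64 unit) {
  return (pixels + unit - 1) / unit;
}
// CeilDiv(1280LL * 720 * 1000, 256)  == 3600000
// CeilDiv(1280LL * 720 * 1000, 1280) == 720000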
LibYUVColorTest::LibYUVColorTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
LibYUVScaleTest::LibYUVScaleTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
LibYUVRotateTest::LibYUVRotateTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
LibYUVPlanarTest::LibYUVPlanarTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
LibYUVBaseTest::LibYUVBaseTest() :
benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
if (FLAGS_libyuv_repeat) {
benchmark_iterations_ = FLAGS_libyuv_repeat;
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
if (FLAGS_libyuv_width) {
benchmark_width_ = FLAGS_libyuv_width;
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
if (FLAGS_libyuv_height) {
benchmark_height_ = FLAGS_libyuv_height;
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
if (FLAGS_libyuv_flags) {
disable_cpu_flags_ = FLAGS_libyuv_flags;
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
benchmark_height_(72) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
// For quicker unittests, default is 128 x 72. But when benchmarking,
// default to 720p. Allow the size to be specified.
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
benchmark_height_ = 720;
}
}
const char* width = getenv("LIBYUV_WIDTH");
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
benchmark_pixels_div256_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
benchmark_pixels_div1280_ = static_cast<int>((
static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
// AllowCommandLineReparsing allows us to ignore flags passed on to us by
// Chromium build bots without having to explicitly disable them.
google::AllowCommandLineReparsing();
google::ParseCommandLineFlags(&argc, &argv, true);
return RUN_ALL_TESTS();
}

View File

@@ -22,54 +22,15 @@
#include "libyuv/basic_types.h"
#ifndef SIMD_ALIGNED
#if defined(_MSC_VER) && !defined(__CLR_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#elif defined(__GNUC__) && !defined(__pnacl__)
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#else
#define SIMD_ALIGNED(var) var
#endif
#endif
static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
#define OFFBY 0
// Scaling uses 16.16 fixed point to step through the source image, so a
// maximum size of 32767.999 can be expressed. 32768 is valid because
// the step is 1 beyond the image but not used.
// Destination size is mainly constrained by a valid scale step, not the
// absolute size, so it may be possible to relax the destination size
// constraint.
// Source size is unconstrained for most specialized scalers, e.g. an
// image of 65536 scaled to half size would be valid. The test
// could be relaxed for special scale factors.
// If this test is removed, the scaling function should gracefully
// fail with a return code. The test could be changed to verify that
// libyuv failed in a controlled way.
static const int kMaxWidth = 32768;
static const int kMaxHeight = 32768;
static inline bool SizeValid(int src_width, int src_height,
int dst_width, int dst_height) {
if (src_width > kMaxWidth || src_height > kMaxHeight ||
dst_width > kMaxWidth || dst_height > kMaxHeight) {
printf("Warning - size too large to test. Skipping\n");
return false;
}
return true;
}
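The 32768 bound follows from the stepping scheme the comment describes: scalers walk the source with a 16.16 fixed-point coordinate, so only source positions below 32768 fit in the 16 integer bits. A nearest-neighbor row sketch of that stepping:
// 16.16 fixed-point source stepping, nearest-neighbor (sketch).
void ScaleRowNearest(const uint8* src, uint8* dst,
                     int src_width, int dst_width) {
  int dx = (src_width << 16) / dst_width;  // Source step per dest pixel.
  int x = dx >> 1;                         // Start half a step in.
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];                 // x >> 16 must stay below 32768.
    x += dx;
  }
}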
#define align_buffer_page_end(var, size) \
uint8* var; \
uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \
(size)) & ~63);
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095)); \
var = var##_mem + (-(size) & 4095);
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \
@@ -82,6 +43,9 @@ static inline double get_time() {
QueryPerformanceFrequency(&f);
return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart);
}
#define random rand
#define srandom srand
#else
static inline double get_time() {
struct timeval t;
@@ -91,109 +55,29 @@ static inline double get_time() {
}
#endif
#ifndef SIMD_ALIGNED
#if defined(_MSC_VER) && !defined(__CLR_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#elif defined(__GNUC__) && !defined(__pnacl__)
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#else
#define SIMD_ALIGNED(var) var
#endif
#endif
extern unsigned int fastrand_seed;
inline int fastrand() {
fastrand_seed = fastrand_seed * 214013u + 2531011u;
return static_cast<int>((fastrand_seed >> 16) & 0xffff);
}
static inline void MemRandomize(uint8* dst, int64 len) {
int64 i;
static inline void MemRandomize(uint8* dst, int len) {
int i;
for (i = 0; i < len - 1; i += 2) {
*reinterpret_cast<uint16*>(dst) = fastrand();
*reinterpret_cast<uint16*>(dst) = random();
dst += 2;
}
for (; i < len; ++i) {
*dst++ = fastrand();
*dst++ = random();
}
}
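The fastrand helper above is the MSVC-style LCG (seed * 214013 + 2531011, top 16 bits returned) with a fixed startup seed of 0xfb, so MemRandomize fills buffers identically on every run and platform, while the restored path uses random() seeded from time(NULL) (see the srandom calls earlier), so patterns vary per run. Since each call yields 16 bits, MemRandomize stores two bytes at a time and mops up a trailing odd byte. A usage sketch against the fastrand variant:
// Deterministic fill with the fastrand variant (sketch).
uint8 buf[17];                 // Odd length exercises the tail loop.
fastrand_seed = 0xfb;          // Reset to the fixed default seed.
MemRandomize(buf, sizeof(buf));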
class LibYUVColorTest : public ::testing::Test {
class libyuvTest : public ::testing::Test {
protected:
LibYUVColorTest();
libyuvTest();
const int rotate_max_w_;
const int rotate_max_h_;
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVConvertTest : public ::testing::Test {
protected:
LibYUVConvertTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVScaleTest : public ::testing::Test {
protected:
LibYUVScaleTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVRotateTest : public ::testing::Test {
protected:
LibYUVRotateTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVPlanarTest : public ::testing::Test {
protected:
LibYUVPlanarTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVBaseTest : public ::testing::Test {
protected:
LibYUVBaseTest();
int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
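All of the fixtures above expose the same benchmark knobs; a minimal sketch of how a test body consumes them (the test name and the work done are hypothetical):

  TEST_F(LibYUVBaseTest, SketchBenchmark) {
    for (int i = 0; i < benchmark_iterations_; ++i) {
      // Run the function under test on a benchmark_width_ x benchmark_height_ frame.
    }
  }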
#endif // UNIT_TEST_UNIT_TEST_H_ NOLINT

View File

@@ -41,9 +41,8 @@ static bool TestValidFourCC(uint32 fourcc, int bpp) {
return true;
}
TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
TEST_F(libyuvTest, TestCanonicalFourCC) {
EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_IYUV));
EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_YU12));
EXPECT_EQ(FOURCC_I422, CanonicalFourCC(FOURCC_YU16));
EXPECT_EQ(FOURCC_I444, CanonicalFourCC(FOURCC_YU24));
EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUYV));
@@ -52,6 +51,7 @@ TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_2VUY));
EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_JPEG));
EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_DMB1));
EXPECT_EQ(FOURCC_BGGR, CanonicalFourCC(FOURCC_BA81));
EXPECT_EQ(FOURCC_RAW, CanonicalFourCC(FOURCC_RGB3));
EXPECT_EQ(FOURCC_24BG, CanonicalFourCC(FOURCC_BGR3));
EXPECT_EQ(FOURCC_BGRA, CanonicalFourCC(FOURCC_CM32));
@@ -61,7 +61,7 @@ TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_5551));
}
TEST_F(LibYUVBaseTest, TestFourCC) {
TEST_F(libyuvTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
@@ -73,7 +73,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));
EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
@@ -83,6 +83,10 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGGB, FOURCC_BPP_RGGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGGR, FOURCC_BPP_BGGR));
EXPECT_TRUE(TestValidFourCC(FOURCC_GRBG, FOURCC_BPP_GRBG));
EXPECT_TRUE(TestValidFourCC(FOURCC_GBRG, FOURCC_BPP_GBRG));
EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));

View File

@@ -1,6 +1,6 @@
psnr: psnr.cc ssim.cc psnr_main.cc
ifeq ($(CXX),icl)
$(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
else
$(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
endif

View File

@@ -66,8 +66,10 @@ int main(int argc, const char* argv[]) {
printf("Has NEON %x\n", has_neon);
}
if (has_mips) {
int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
printf("Has DSPR2 %x\n", has_dspr2);
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
int has_mips_dspr2 = TestCpuFlag(kCpuHasMIPS_DSPR2);
printf("Has MIPS DSP %x\n", has_mips_dsp);
printf("Has MIPS DSPR2 %x\n", has_mips_dspr2);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@@ -76,7 +78,6 @@ int main(int argc, const char* argv[]) {
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
int has_avx = TestCpuFlag(kCpuHasAVX);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_avx3 = TestCpuFlag(kCpuHasAVX3);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
printf("Has SSE2 %x\n", has_sse2);
@@ -85,7 +86,6 @@ int main(int argc, const char* argv[]) {
printf("Has SSE4.2 %x\n", has_sse42);
printf("Has AVX %x\n", has_avx);
printf("Has AVX2 %x\n", has_avx2);
printf("Has AVX3 %x\n", has_avx3);
printf("Has ERMS %x\n", has_erms);
printf("Has FMA3 %x\n", has_fma3);
}
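These flags are typically consumed as runtime dispatch gates; a minimal sketch:

  if (TestCpuFlag(kCpuHasSSE2)) {
    // Take the SSE2-optimized code path.
  } else {
    // Fall back to the portable C path.
  }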

View File

@@ -10,6 +10,8 @@
#include "./psnr.h" // NOLINT
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
@@ -32,22 +34,26 @@ typedef unsigned long long uint64;  // NOLINT
#endif // __LP64__
#endif // _MSC_VER
// libyuv provides this function when the library is linked with jpeg support.
#if !defined(HAVE_JPEG)
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
double ComputePSNR(double sse, double size) {
const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.);
if (sse <= kMINSSE)
sse = kMINSSE; // Produces max PSNR of 128
return 10.0 * log10(65025.0 * size / sse);
}
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
asm volatile ( // NOLINT
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"1: \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
@@ -73,42 +79,6 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked)
@@ -206,8 +176,7 @@ static __inline void __cpuid(int cpu_info[4], int info_type) {
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
#elif defined(__i386__) || defined(__x86_64__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile ( // NOLINT
"cpuid \n"
@@ -272,16 +241,6 @@ double ComputeSumSquareError(const uint8* src_a,
}
return static_cast<double>(sse);
}
#endif
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size) {
const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
if (sse <= kMINSSE)
sse = kMINSSE; // Produces max PSNR of 128
return 10.0 * log10(255.0 * 255.0 * size / sse);
}
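A worked sketch combining the two entry points (buffer contents assumed): if org and rec are 100 bytes each and differ by 1 everywhere, sse = 100 and psnr = 10 * log10(65025 * 100 / 100) = 10 * log10(65025), roughly 48.13 dB; an sse of 0 clamps to kMINSSE and yields kMaxPSNR = 128.

  double sse = ComputeSumSquareError(org, rec, 100);
  double psnr = ComputePSNR(sse, 100.0);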
#ifdef __cplusplus
} // extern "C"

View File

@@ -13,8 +13,6 @@
#ifndef UTIL_PSNR_H_ // NOLINT
#define UTIL_PSNR_H_
#include <math.h> // For log10()
#ifdef __cplusplus
extern "C" {
#endif
@@ -26,17 +24,13 @@ typedef unsigned char uint8;
static const double kMaxPSNR = 128.0;
// libyuv provides this function when the library is linked with jpeg support.
// TODO(fbarchard): make psnr lib compatible subset of libyuv.
#if !defined(HAVE_JPEG)
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse).
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size);
// Compute the Sum of Squared Error (SSE).
// Pass the result to ComputePSNR for the final value.
double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
#endif
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size);
#ifdef __cplusplus
} // extern "C"

View File

@@ -32,10 +32,6 @@
#include "./psnr.h"
#include "./ssim.h"
#ifdef HAVE_JPEG
#include "libyuv/compare.h"
#include "libyuv/convert.h"
#endif
struct metric {
double y, u, v, all;
@@ -79,29 +75,6 @@ bool ExtractResolutionFromFilename(const char* name,
}
}
}
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
FILE* const file_org = fopen(name, "rb");
if (file_org == NULL) {
fprintf(stderr, "Cannot open %s\n", name);
return false;
}
fseek(file_org, 0, SEEK_END);
size_t total_size = ftell(file_org);
fseek(file_org, 0, SEEK_SET);
uint8* const ch_org = new uint8[total_size];
memset(ch_org, 0, total_size);
size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
fclose(file_org);
if (bytes_org == total_size) {
if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) {
delete[] ch_org;
return true;
}
}
delete[] ch_org;
#endif // HAVE_JPEG
return false;
}
@@ -121,9 +94,6 @@ double GetMSE(double sse, double size) {
void PrintHelp(const char * program) {
printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
#ifdef HAVE_JPEG
printf("jpeg or raw YUV 420 supported.\n");
#endif
printf("options:\n");
printf(" -s <width> <height> .... specify YUV size, mandatory if none of the "
"sequences have the\n");
@@ -245,18 +215,9 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset);
const uint8* const v_rec = ch_rec + y_size + uv_size;
if (do_psnr) {
#ifdef HAVE_JPEG
double y_err = static_cast<double>(
libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
double u_err = static_cast<double>(
libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
double v_err = static_cast<double>(
libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
#else
double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
double v_err = ComputeSumSquareError(v_org, v_rec, uv_size);
#endif
const double total_err = y_err + u_err + v_err;
cur_distortion_psnr->global_y += y_err;
cur_distortion_psnr->global_u += u_err;
@@ -269,10 +230,10 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
static_cast<double>(total_size));
} else {
distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2,
(image_height + 1) / 2);
distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2,
(image_height + 1) / 2);
distorted_frame->u = CalcSSIM(u_org, u_rec, image_width / 2,
image_height / 2);
distorted_frame->v = CalcSSIM(v_org, v_rec, image_width / 2,
image_height / 2);
distorted_frame->all =
(distorted_frame->y + distorted_frame->u + distorted_frame->v)
/ total_size;
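The chroma rounding difference above only matters for odd dimensions; a worked one-liner:

  int w = 7;
  int chroma_w = (w + 1) / 2;  // == 4, keeps the half-sampled last column; w / 2 == 3 drops it.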
@@ -425,62 +386,14 @@ int main(int argc, const char* argv[]) {
break;
size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
if (bytes_org < total_size) {
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
uint8* const ch_jpeg = new uint8[bytes_org];
memcpy(ch_jpeg, ch_org, bytes_org);
memset(ch_org, 0, total_size);
if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org,
ch_org,
image_width,
ch_org + y_size,
(image_width + 1) / 2,
ch_org + y_size + uv_size,
(image_width + 1) / 2,
image_width,
image_height,
image_width,
image_height)) {
delete[] ch_jpeg;
break;
}
delete[] ch_jpeg;
#else
if (bytes_org < total_size)
break;
#endif // HAVE_JPEG
}
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
size_t bytes_rec = fread(ch_rec, sizeof(uint8),
total_size, file_rec[cur_rec]);
if (bytes_rec < total_size) {
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
uint8* const ch_jpeg = new uint8[bytes_rec];
memcpy(ch_jpeg, ch_rec, bytes_rec);
memset(ch_rec, 0, total_size);
if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec,
ch_rec,
image_width,
ch_rec + y_size,
(image_width + 1) / 2,
ch_rec + y_size + uv_size,
(image_width + 1) / 2,
image_width,
image_height,
image_width,
image_height)) {
delete[] ch_jpeg;
break;
}
delete[] ch_jpeg;
#else
if (bytes_rec < total_size)
break;
#endif // HAVE_JPEG
}
if (verbose) {
printf("%5d", number_of_frames);

View File

@@ -10,6 +10,7 @@
#include "../util/ssim.h" // NOLINT
#include <math.h>
#include <string.h>
#ifdef __cplusplus

View File

@@ -13,8 +13,6 @@
#ifndef UTIL_SSIM_H_ // NOLINT
#define UTIL_SSIM_H_
#include <math.h> // For log10()
#ifdef __cplusplus
extern "C" {
#endif
@@ -27,6 +25,7 @@ typedef unsigned char uint8;
double CalcSSIM(const uint8* org, const uint8* rec,
const int image_width, const int image_height);
// Returns -10.0 * log10(1.0 - ssim).
double CalcLSSIM(double ssim);
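A worked usage sketch (input value assumed): an SSIM of 0.99 maps to -10.0 * log10(1.0 - 0.99) = -10.0 * log10(0.01) = 20.0.

  double lssim = CalcLSSIM(0.99);  // == 20.0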
#ifdef __cplusplus

View File

@@ -1,5 +1,4 @@
# This is a generic makefile for libyuv for Windows ARM.
# call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
# nmake /f winarm.mk
# make -f winarm.mk
# nmake /f winarm.mk clean
@@ -20,15 +19,13 @@ LOCAL_OBJ_FILES = \
source/convert_to_argb.o\
source/convert_to_i420.o\
source/cpu_id.o\
source/format_conversion.o\
source/planar_functions.o\
source/rotate.o\
source/rotate_any.o\
source/rotate_argb.o\
source/rotate_common.o\
source/row_any.o\
source/row_common.o\
source/scale.o\
source/scale_any.o\
source/scale_argb.o\
source/scale_common.o\
source/video_common.o