Bug 1873510 - Update libjxl to b26041c708d523ac53bb7d95d4f5c4a5d3b1ce30 r=saschanaz

Differential Revision: https://phabricator.services.mozilla.com/D197950
2025-02-24 03:19:06 +00:00 · 2024-01-09 08:26:00 +00:00 · 2024-01-09 08:26:00 +00:00 · c02f0ef1c5
commit c02f0ef1c5
parent a1a2edc0b9
61 changed files with 805 additions and 564 deletions
--- a/media/libjxl/moz.yaml
+++ b/media/libjxl/moz.yaml
@ -10,9 +10,9 @@ origin:

  url: https://github.com/libjxl/libjxl

-  release: 1d1006ce9dbe4224c1299992b57f28a4f62df9b7 (2023-12-29T13:36:27Z).
+  release: b26041c708d523ac53bb7d95d4f5c4a5d3b1ce30 (2024-01-08T13:55:50Z).

-  revision: 1d1006ce9dbe4224c1299992b57f28a4f62df9b7
+  revision: b26041c708d523ac53bb7d95d4f5c4a5d3b1ce30

  license: Apache-2.0

--- a/third_party/jpeg-xl/BUILD.bazel
+++ b/third_party/jpeg-xl/BUILD.bazel
@ -20,3 +20,4 @@ filegroup(
        "testdata/position_encoding/*.txt",
    ]),
 )
+
--- a/third_party/jpeg-xl/deps.sh
+++ b/third_party/jpeg-xl/deps.sh
@ -13,6 +13,7 @@ MYDIR=$(dirname $(realpath "$0"))

 # Git revisions we use for the given submodules. Update these whenever you
 # update a git submodule.
+TESTDATA="873045a9c42ed60721756e26e2a6b32e17415205"
 THIRD_PARTY_BROTLI="36533a866ed1ca4b75cf049f4521e4ec5fe24727"
 THIRD_PARTY_HIGHWAY="ba0900a4957b929390ab73827235557959234fea"
 THIRD_PARTY_SKCMS="42030a771244ba67f86b1c1c76a6493f873c5f91"
@ -79,6 +80,7 @@ EOF
  fi

  # Sources downloaded from a tarball.
+  download_github testdata libjxl/testdata
  download_github third_party/brotli google/brotli
  download_github third_party/highway google/highway
  download_github third_party/sjpeg webmproject/sjpeg
--- a/third_party/jpeg-xl/lib/extras/codec.cc
+++ b/third_party/jpeg-xl/lib/extras/codec.cc
@ -133,8 +133,8 @@ Status Encode(const CodecInOut& io, const ColorEncoding& c_desired,
              size_t bits_per_sample, const std::string& pathname,
              std::vector<uint8_t>* bytes, ThreadPool* pool) {
  std::string extension;
-  const extras::Codec codec = extras::CodecFromPath(
-      pathname, &bits_per_sample, /* filename */ nullptr, &extension);
+  const extras::Codec codec =
+      extras::CodecFromPath(pathname, &bits_per_sample, &extension);

  // Warn about incorrect usage of PGM/PGX/PPM - only the latter supports
  // color, but CodecFromPath lumps them all together.
--- a/third_party/jpeg-xl/lib/extras/dec/decode.cc
+++ b/third_party/jpeg-xl/lib/extras/dec/decode.cc
@ -22,41 +22,30 @@ namespace {
 // Any valid encoding is larger (ensures codecs can read the first few bytes)
 constexpr size_t kMinBytes = 9;

-void BasenameAndExtension(const std::string& path, std::string* filename,
-                          std::string* extension) {
-  // Pattern: "png:name" or "png:-"
-  size_t pos = path.find_first_of(':');
-  if (pos != std::string::npos) {
-    *extension = "." + path.substr(0, pos);
-    *filename = path.substr(pos + 1);
-    //+ ((path.length() == pos + 2 && path.substr(pos + 1, 1) == "-") ? "" :
-    //*extension);
-    return;
-  }
-
+std::string GetExtension(const std::string& path) {
  // Pattern: "name.png"
-  pos = path.find_last_of('.');
+  size_t pos = path.find_last_of('.');
  if (pos != std::string::npos) {
-    *extension = path.substr(pos);
-    *filename = path;
-    return;
+    return path.substr(pos);
  }

  // Extension not found
-  *filename = path;
-  *extension = "";
+  return "";
 }

 }  // namespace

 Codec CodecFromPath(std::string path, size_t* JXL_RESTRICT bits_per_sample,
-                    std::string* filename, std::string* extension) {
+                    std::string* extension) {
  std::string base;
-  std::string ext;
-  BasenameAndExtension(path, &base, &ext);
-  if (filename) *filename = base;
-  if (extension) *extension = ext;
-
+  std::string ext = GetExtension(path);
+  if (extension) {
+    if (extension->empty()) {
+      *extension = ext;
+    } else {
+      ext = *extension;
+    }
+  }
  std::transform(ext.begin(), ext.end(), ext.begin(), [](char c) {
    return std::tolower(c, std::locale::classic());
  });
--- a/third_party/jpeg-xl/lib/extras/dec/decode.h
+++ b/third_party/jpeg-xl/lib/extras/dec/decode.h
@ -42,7 +42,6 @@ bool CanDecode(Codec codec);
 // that Encode() would encode to PFM instead of PPM.
 Codec CodecFromPath(std::string path,
                    size_t* JXL_RESTRICT bits_per_sample = nullptr,
-                    std::string* filename = nullptr,
                    std::string* extension = nullptr);

 // Decodes "bytes" info *ppf.
--- a/third_party/jpeg-xl/lib/extras/dec/jxl.cc
+++ b/third_party/jpeg-xl/lib/extras/dec/jxl.cc
@ -10,10 +10,12 @@
 #include <jxl/decode_cxx.h>
 #include <jxl/types.h>

+#include <cinttypes>
+
 #include "lib/extras/common.h"
 #include "lib/extras/dec/color_description.h"
+#include "lib/jxl/base/exif.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/exif.h"

 namespace jxl {
 namespace extras {
--- a/third_party/jpeg-xl/lib/extras/enc/jxl.cc
+++ b/third_party/jpeg-xl/lib/extras/enc/jxl.cc
@ -8,7 +8,7 @@
 #include <jxl/encode.h>
 #include <jxl/encode_cxx.h>

-#include "lib/jxl/exif.h"
+#include "lib/jxl/base/exif.h"

 namespace jxl {
 namespace extras {
--- a/third_party/jpeg-xl/lib/include/jxl/encode.h
+++ b/third_party/jpeg-xl/lib/include/jxl/encode.h
@ -824,8 +824,8 @@ struct JxlChunkedFrameInputSource {

  /**
   * Callback to retrieve a rectangle of color channel data at a specific
-   * location. It is guaranteed that xpos and ypos are multiples of 128. xsize,
-   * ysize will be multiples of 128, unless the resulting rectangle would be out
+   * location. It is guaranteed that xpos and ypos are multiples of 8. xsize,
+   * ysize will be multiples of 8, unless the resulting rectangle would be out
   * of image bounds. Moreover, xsize and ysize will be at most 2048. The
   * returned data will be assumed to be in the format returned by the
   * (preceding) call to get_color_channels_pixel_format, except the `align`
@ -868,7 +868,7 @@ struct JxlChunkedFrameInputSource {
  /**
   * Callback to retrieve a rectangle of extra channel `ec_index` data at a
   * specific location. It is guaranteed that xpos and ypos are multiples of
-   * 128. xsize, ysize will be multiples of 128, unless the resulting rectangle
+   * 8. xsize, ysize will be multiples of 8, unless the resulting rectangle
   * would be out of image bounds. Moreover, xsize and ysize will be at most
   * 2048. The returned data will be assumed to be in the format returned by the
   * (preceding) call to get_extra_channels_pixel_format_at with the
--- a/third_party/jpeg-xl/lib/jxl/base/exif.h
+++ b/third_party/jpeg-xl/lib/jxl/base/exif.h
@ -11,7 +11,12 @@

 #include <jxl/codestream_header.h>

-#include "lib/jxl/image_metadata.h"
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"

 namespace jxl {

@ -19,7 +24,7 @@ constexpr uint16_t kExifOrientationTag = 274;

 // Checks if a blob looks like Exif, and if so, sets bigendian
 // according to the tiff endianness
-inline bool IsExif(const std::vector<uint8_t>& exif, bool* bigendian) {
+JXL_INLINE bool IsExif(const std::vector<uint8_t>& exif, bool* bigendian) {
  if (exif.size() < 12) return false;  // not enough bytes for a valid exif blob
  const uint8_t* t = exif.data();
  if (LoadLE32(t) == 0x2A004D4D) {
@ -33,8 +38,8 @@ inline bool IsExif(const std::vector<uint8_t>& exif, bool* bigendian) {
 }

 // Finds the position of an Exif tag, or 0 if it is not found
-inline size_t FindExifTagPosition(const std::vector<uint8_t>& exif,
-                                  uint16_t tagname) {
+JXL_INLINE size_t FindExifTagPosition(const std::vector<uint8_t>& exif,
+                                      uint16_t tagname) {
  bool bigendian;
  if (!IsExif(exif, &bigendian)) return 0;
  const uint8_t* t = exif.data() + 4;
@ -62,8 +67,8 @@ inline size_t FindExifTagPosition(const std::vector<uint8_t>& exif,
 // Parses the Exif data just enough to extract any render-impacting info.
 // If the Exif data is invalid or could not be parsed, then it is treated
 // as a no-op.
-inline void InterpretExif(const std::vector<uint8_t>& exif,
-                          JxlOrientation* orientation) {
+JXL_INLINE void InterpretExif(const std::vector<uint8_t>& exif,
+                              JxlOrientation* orientation) {
  bool bigendian;
  if (!IsExif(exif, &bigendian)) return;
  size_t o_pos = FindExifTagPosition(exif, kExifOrientationTag);
--- a/third_party/jpeg-xl/lib/jxl/cms/color_encoding_cms.h
+++ b/third_party/jpeg-xl/lib/jxl/cms/color_encoding_cms.h
@ -6,7 +6,6 @@
 #ifndef LIB_JXL_CMS_COLOR_ENCODING_CMS_H_
 #define LIB_JXL_CMS_COLOR_ENCODING_CMS_H_

-#include <jxl/cms.h>
 #include <jxl/cms_interface.h>
 #include <jxl/color_encoding.h>
 #include <jxl/types.h>
@ -20,7 +19,6 @@

 #include "lib/jxl/base/common.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/cms/jxl_cms_internal.h"

 namespace jxl {
 namespace cms {
@ -521,7 +519,6 @@ struct ColorEncoding {
                                                new_icc.data(), new_icc.size(),
                                                &external, &new_cmyk));
    cmyk = new_cmyk;
-    if (cmyk) return true;
    JXL_RETURN_IF_ERROR(FromExternal(external));
    icc = std::move(new_icc);
    return true;
--- a/third_party/jpeg-xl/lib/jxl/cms/jxl_cms_internal.h
+++ b/third_party/jpeg-xl/lib/jxl/cms/jxl_cms_internal.h
@ -18,6 +18,7 @@
 #include <string>
 #include <vector>

+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/matrix_ops.h"
 #include "lib/jxl/base/span.h"  // Bytes
@ -1065,14 +1066,15 @@ static Status MaybeCreateProfileImpl(const JxlColorEncoding& c,

 // Returns a representation of the ColorEncoding fields (not icc).
 // Example description: "RGB_D65_SRG_Rel_Lin"
-static std::string ColorEncodingDescription(const JxlColorEncoding& c) {
+static JXL_MAYBE_UNUSED std::string ColorEncodingDescription(
+    const JxlColorEncoding& c) {
  return detail::ColorEncodingDescriptionImpl(c);
 }

 // NOTE: for XYB colorspace, the created profile can be used to transform a
 // *scaled* XYB image (created by ScaleXYB()) to another colorspace.
-static Status MaybeCreateProfile(const JxlColorEncoding& c,
-                                 std::vector<uint8_t>* icc) {
+static JXL_MAYBE_UNUSED Status MaybeCreateProfile(const JxlColorEncoding& c,
+                                                  std::vector<uint8_t>* icc) {
  return detail::MaybeCreateProfileImpl(c, icc);
 }

--- a/third_party/jpeg-xl/lib/jxl/cms/tone_mapping.h
+++ b/third_party/jpeg-xl/lib/jxl/cms/tone_mapping.h
@ -8,8 +8,10 @@

 #include <algorithm>
 #include <cmath>
+#include <utility>

 #include "lib/jxl/base/common.h"
+#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/cms/transfer_functions.h"

 namespace jxl {
@ -122,9 +124,10 @@ class HlgOOTF_Base {
  const float blue_Y_;
 };

-static void GamutMapScalar(float* red, float* green, float* blue,
-                           const float primaries_luminances[3],
-                           float preserve_saturation = 0.1f) {
+static JXL_MAYBE_UNUSED void GamutMapScalar(float* red, float* green,
+                                            float* blue,
+                                            const float primaries_luminances[3],
+                                            float preserve_saturation = 0.1f) {
  const float luminance = primaries_luminances[0] * *red +
                          primaries_luminances[1] * *green +
                          primaries_luminances[2] * *blue;
--- a/third_party/jpeg-xl/lib/jxl/cms/transfer_functions.h
+++ b/third_party/jpeg-xl/lib/jxl/cms/transfer_functions.h
@ -11,7 +11,6 @@
 #include <algorithm>
 #include <cmath>

-#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"

 namespace jxl {
--- a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h
+++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h
@ -8,7 +8,6 @@

 // Metadata for color space conversions.

-#include <jxl/cms.h>
 #include <jxl/cms_interface.h>
 #include <jxl/color_encoding.h>
 #include <stddef.h>
--- a/third_party/jpeg-xl/lib/jxl/convolve.h
+++ b/third_party/jpeg-xl/lib/jxl/convolve.h
@ -76,18 +76,22 @@ void SlowSymmetric3(const ImageF& in, const Rect& rect,
                    const WeightsSymmetric3& weights, ThreadPool* pool,
                    ImageF* JXL_RESTRICT out);

-void SlowSeparable5(const ImageF& in, const Rect& rect,
+void SlowSeparable5(const ImageF& in, const Rect& in_rect,
                    const WeightsSeparable5& weights, ThreadPool* pool,
-                    ImageF* out);
+                    ImageF* out, const Rect& out_rect);

-void SlowSeparable7(const ImageF& in, const Rect& rect,
+void SlowSeparable7(const ImageF& in, const Rect& in_rect,
                    const WeightsSeparable7& weights, ThreadPool* pool,
-                    ImageF* out);
+                    ImageF* out, const Rect& out_rect);

 void Symmetric3(const ImageF& in, const Rect& rect,
                const WeightsSymmetric3& weights, ThreadPool* pool,
                ImageF* out);

+void Symmetric5(const ImageF& in, const Rect& in_rect,
+                const WeightsSymmetric5& weights, ThreadPool* pool,
+                ImageF* JXL_RESTRICT out, const Rect& out_rect);
+
 void Symmetric5(const ImageF& in, const Rect& rect,
                const WeightsSymmetric5& weights, ThreadPool* pool,
                ImageF* JXL_RESTRICT out);
--- a/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc
@ -239,7 +239,7 @@ void Separable5(const ImageF& in, const Rect& rect,
    return Conv::Run(in, rect, weights, pool, out);
  }

-  return SlowSeparable5(in, rect, weights, pool, out);
+  return SlowSeparable5(in, rect, weights, pool, out, Rect(*out));
 }

 // NOLINTNEXTLINE(google-readability-namespace-comments)
--- a/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
@ -263,7 +263,7 @@ void Separable7(const ImageF& in, const Rect& rect,
    return Conv::Run(in, rect, weights, pool, out);
  }

-  return SlowSeparable7(in, rect, weights, pool, out);
+  return SlowSeparable7(in, rect, weights, pool, out, Rect(*out));
 }

 // NOLINTNEXTLINE(google-readability-namespace-comments)
--- a/third_party/jpeg-xl/lib/jxl/convolve_slow.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_slow.cc
@ -141,19 +141,19 @@ float SlowSeparablePixel(const ImageF& in, const Rect& rect, const int64_t x,
                         const int64_t y, const int64_t radius,
                         const float* JXL_RESTRICT horz_weights,
                         const float* JXL_RESTRICT vert_weights) {
-  const size_t xsize = rect.xsize();
-  const size_t ysize = rect.ysize();
+  const size_t xsize = in.xsize();
+  const size_t ysize = in.ysize();
  const WrapMirror wrap;

  float mul = 0.0f;
  for (int dy = -radius; dy <= radius; ++dy) {
    const float wy = vert_weights[std::abs(dy) * 4];
-    const size_t sy = wrap(y + dy, ysize);
+    const size_t sy = wrap(rect.y0() + y + dy, ysize);
    JXL_CHECK(sy < ysize);
-    const float* const JXL_RESTRICT row = rect.ConstRow(in, sy);
+    const float* const JXL_RESTRICT row = in.ConstRow(sy);
    for (int dx = -radius; dx <= radius; ++dx) {
      const float wx = horz_weights[std::abs(dx) * 4];
-      const size_t sx = wrap(x + dx, xsize);
+      const size_t sx = wrap(rect.x0() + x + dx, xsize);
      JXL_CHECK(sx < xsize);
      mul += row[sx] * wx * wy;
    }
@ -161,48 +161,44 @@ float SlowSeparablePixel(const ImageF& in, const Rect& rect, const int64_t x,
  return mul;
 }

-}  // namespace
-
-void SlowSeparable5(const ImageF& in, const Rect& rect,
-                    const WeightsSeparable5& weights, ThreadPool* pool,
-                    ImageF* out) {
+template <int R, typename Weights>
+void SlowSeparable(const ImageF& in, const Rect& in_rect,
+                   const Weights& weights, ThreadPool* pool, ImageF* out,
+                   const Rect& out_rect) {
+  JXL_ASSERT(in_rect.xsize() == out_rect.xsize());
+  JXL_ASSERT(in_rect.ysize() == out_rect.ysize());
+  JXL_ASSERT(in_rect.IsInside(Rect(in)));
+  JXL_ASSERT(out_rect.IsInside(Rect(*out)));
  const float* horz_weights = &weights.horz[0];
  const float* vert_weights = &weights.vert[0];

-  const size_t ysize = rect.ysize();
+  const size_t ysize = in_rect.ysize();
  JXL_CHECK(RunOnPool(
      pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
      [&](const uint32_t task, size_t /*thread*/) {
        const int64_t y = task;

-        float* const JXL_RESTRICT row_out = out->Row(y);
-        for (size_t x = 0; x < rect.xsize(); ++x) {
-          row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/2,
+        float* const JXL_RESTRICT row_out = out_rect.Row(out, y);
+        for (size_t x = 0; x < in_rect.xsize(); ++x) {
+          row_out[x] = SlowSeparablePixel(in, in_rect, x, y, /*radius=*/R,
                                          horz_weights, vert_weights);
        }
      },
-      "SlowSeparable5"));
+      "SlowSeparable"));
 }

-void SlowSeparable7(const ImageF& in, const Rect& rect,
+}  // namespace
+
+void SlowSeparable5(const ImageF& in, const Rect& in_rect,
+                    const WeightsSeparable5& weights, ThreadPool* pool,
+                    ImageF* out, const Rect& out_rect) {
+  SlowSeparable<2>(in, in_rect, weights, pool, out, out_rect);
+}
+
+void SlowSeparable7(const ImageF& in, const Rect& in_rect,
                    const WeightsSeparable7& weights, ThreadPool* pool,
-                    ImageF* out) {
-  const float* horz_weights = &weights.horz[0];
-  const float* vert_weights = &weights.vert[0];
-
-  const size_t ysize = rect.ysize();
-  JXL_CHECK(RunOnPool(
-      pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
-      [&](const uint32_t task, size_t /*thread*/) {
-        const int64_t y = task;
-
-        float* const JXL_RESTRICT row_out = out->Row(y);
-        for (size_t x = 0; x < rect.xsize(); ++x) {
-          row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/3,
-                                          horz_weights, vert_weights);
-        }
-      },
-      "SlowSeparable7"));
+                    ImageF* out, const Rect& out_rect) {
+  SlowSeparable<3>(in, in_rect, weights, pool, out, out_rect);
 }

 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
@ -52,7 +52,7 @@ static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
  const auto in_p2 = LoadU(d, center + 2);
  const auto in_m1 = LoadU(d, center - 1);
  const auto in_p1 = LoadU(d, center + 1);
-  const auto in_00 = Load(d, center);
+  const auto in_00 = LoadU(d, center);
  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
  const auto sum_0 = Mul(wx0, in_00);
@ -61,8 +61,8 @@ static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,

 // Produces result for one pixel
 template <class WrapY>
-float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix,
-                       const int64_t iy, const WeightsSymmetric5& weights) {
+float Symmetric5Border(const ImageF& in, const int64_t ix, const int64_t iy,
+                       const WeightsSymmetric5& weights) {
  const float w0 = weights.c[0];
  const float w1 = weights.r[0];
  const float w2 = weights.R[0];
@ -70,8 +70,8 @@ float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix,
  const float w5 = weights.L[0];
  const float w8 = weights.D[0];

-  const size_t xsize = rect.xsize();
-  const size_t ysize = rect.ysize();
+  const size_t xsize = in.xsize();
+  const size_t ysize = in.ysize();
  const WrapY wrap_y;
  // Unrolled loop over all 5 rows of the kernel.
  float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
@ -88,8 +88,8 @@ float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix,

 // Produces result for one vector's worth of pixels
 template <class WrapY>
-static void Symmetric5Interior(const ImageF& in, const Rect& rect,
-                               const int64_t ix, const int64_t iy,
+static void Symmetric5Interior(const ImageF& in, const int64_t ix,
+                               const int64_t rix, const int64_t iy,
                               const WeightsSymmetric5& weights,
                               float* JXL_RESTRICT row_out) {
  const HWY_FULL(float) d;
@ -101,7 +101,7 @@ static void Symmetric5Interior(const ImageF& in, const Rect& rect,
  const auto w5 = LoadDup128(d, weights.L);
  const auto w8 = LoadDup128(d, weights.D);

-  const size_t ysize = rect.ysize();
+  const size_t ysize = in.ysize();
  const WrapY wrap_y;
  // Unrolled loop over all 5 rows of the kernel.
  auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
@ -112,7 +112,7 @@ static void Symmetric5Interior(const ImageF& in, const Rect& rect,
  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
  sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));

-  Store(Add(sum0, sum1), d, row_out + ix);
+  StoreU(Add(sum0, sum1), d, row_out + rix);
 }

 template <class WrapY>
@ -120,45 +120,44 @@ static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy,
                          const WeightsSymmetric5& weights,
                          float* JXL_RESTRICT row_out) {
  const int64_t kRadius = 2;
-  const size_t xsize = rect.xsize();
+  const size_t xend = rect.x1();

-  size_t ix = 0;
+  size_t rix = 0;
+  size_t ix = rect.x0();
  const HWY_FULL(float) d;
  const size_t N = Lanes(d);
  const size_t aligned_x = RoundUpTo(kRadius, N);
-  for (; ix < std::min(aligned_x, xsize); ++ix) {
-    row_out[ix] = Symmetric5Border<WrapY>(in, rect, ix, iy, weights);
+  for (; ix < std::min(aligned_x, xend); ++ix, ++rix) {
+    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
  }
-  for (; ix + N + kRadius <= xsize; ix += N) {
-    Symmetric5Interior<WrapY>(in, rect, ix, iy, weights, row_out);
+  for (; ix + N + kRadius <= xend; ix += N, rix += N) {
+    Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out);
  }
-  for (; ix < xsize; ++ix) {
-    row_out[ix] = Symmetric5Border<WrapY>(in, rect, ix, iy, weights);
+  for (; ix < xend; ++ix, ++rix) {
+    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
  }
 }

-static JXL_NOINLINE void Symmetric5BorderRow(const ImageF& in, const Rect& rect,
-                                             const int64_t iy,
-                                             const WeightsSymmetric5& weights,
-                                             float* JXL_RESTRICT row_out) {
-  return Symmetric5Row<WrapMirror>(in, rect, iy, weights, row_out);
-}
-
 // Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike
 // the fully vectorized strategies below.
-void Symmetric5(const ImageF& in, const Rect& rect,
+void Symmetric5(const ImageF& in, const Rect& in_rect,
                const WeightsSymmetric5& weights, ThreadPool* pool,
-                ImageF* JXL_RESTRICT out) {
-  const size_t ysize = rect.ysize();
+                ImageF* JXL_RESTRICT out, const Rect& out_rect) {
+  JXL_ASSERT(in_rect.xsize() == out_rect.xsize());
+  JXL_ASSERT(in_rect.ysize() == out_rect.ysize());
+  const size_t ysize = in_rect.ysize();
  JXL_CHECK(RunOnPool(
      pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
      [&](const uint32_t task, size_t /*thread*/) {
-        const int64_t iy = task;
+        const int64_t riy = task;
+        const int64_t iy = in_rect.y0() + riy;

-        if (iy < 2 || iy >= static_cast<ssize_t>(ysize) - 2) {
-          Symmetric5BorderRow(in, rect, iy, weights, out->Row(iy));
+        if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) {
+          Symmetric5Row<WrapMirror>(in, in_rect, iy, weights,
+                                    out_rect.Row(out, riy));
        } else {
-          Symmetric5Row<WrapUnchanged>(in, rect, iy, weights, out->Row(iy));
+          Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights,
+                                       out_rect.Row(out, riy));
        }
      },
      "Symmetric5x5Convolution"));
@ -173,10 +172,17 @@ HWY_AFTER_NAMESPACE();
 namespace jxl {

 HWY_EXPORT(Symmetric5);
+void Symmetric5(const ImageF& in, const Rect& in_rect,
+                const WeightsSymmetric5& weights, ThreadPool* pool,
+                ImageF* JXL_RESTRICT out, const Rect& out_rect) {
+  return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, in_rect, weights, pool, out,
+                                          out_rect);
+}
+
 void Symmetric5(const ImageF& in, const Rect& rect,
                const WeightsSymmetric5& weights, ThreadPool* pool,
                ImageF* JXL_RESTRICT out) {
-  return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, rect, weights, pool, out);
+  return Symmetric5(in, rect, weights, pool, out, Rect(*out));
 }

 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/convolve_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_test.cc
@ -81,21 +81,57 @@ void VerifySymmetric3(const size_t xsize, const size_t ysize, ThreadPool* pool,
  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
 }

+std::vector<Rect> GenerateTestRectangles(size_t xsize, size_t ysize) {
+  std::vector<Rect> out;
+  for (size_t tl : {0, 1, 13}) {
+    for (size_t br : {0, 1, 13}) {
+      if (xsize > tl + br && ysize > tl + br) {
+        out.push_back(Rect(tl, tl, xsize - tl - br, ysize - tl - br));
+      }
+    }
+  }
+  return out;
+}
+
 // Ensures Symmetric and Separable give the same result.
 void VerifySymmetric5(const size_t xsize, const size_t ysize, ThreadPool* pool,
                      Rng* rng) {
-  const Rect rect(0, 0, xsize, ysize);
-
  ImageF in(xsize, ysize);
  GenerateImage(*rng, &in, 0.0f, 1.0f);

-  ImageF out_expected(xsize, ysize);
-  ImageF out_actual(xsize, ysize);
+  for (const Rect& in_rect : GenerateTestRectangles(xsize, ysize)) {
+    JXL_DEBUG(JXL_DEBUG_CONVOLVE,
+              "in_rect: %" PRIuS "x%" PRIuS "+%" PRIuS ",%" PRIuS "",
+              in_rect.xsize(), in_rect.ysize(), in_rect.x0(), in_rect.y0());
+    {
+      Rect out_rect = in_rect;
+      ImageF out_expected(xsize, ysize);
+      ImageF out_actual(xsize, ysize);
+      FillImage(-1.0f, &out_expected);
+      FillImage(-1.0f, &out_actual);

-  Separable5(in, Rect(in), WeightsSeparable5Lowpass(), pool, &out_expected);
-  Symmetric5(in, rect, WeightsSymmetric5Lowpass(), pool, &out_actual);
+      SlowSeparable5(in, in_rect, WeightsSeparable5Lowpass(), pool,
+                     &out_expected, out_rect);
+      Symmetric5(in, in_rect, WeightsSymmetric5Lowpass(), pool, &out_actual,
+                 out_rect);

-  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+      JXL_ASSERT_OK(
+          VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+    }
+    {
+      Rect out_rect(0, 0, in_rect.xsize(), in_rect.ysize());
+      ImageF out_expected(out_rect.xsize(), out_rect.ysize());
+      ImageF out_actual(out_rect.xsize(), out_rect.ysize());
+
+      SlowSeparable5(in, in_rect, WeightsSeparable5Lowpass(), pool,
+                     &out_expected, out_rect);
+      Symmetric5(in, in_rect, WeightsSymmetric5Lowpass(), pool, &out_actual,
+                 out_rect);
+
+      JXL_ASSERT_OK(
+          VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+    }
+  }
 }

 void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool,
@ -109,8 +145,8 @@ void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool,
  ImageF out_actual(xsize, ysize);

  const WeightsSeparable5& weights = WeightsSeparable5Lowpass();
-  Separable5(in, Rect(in), weights, pool, &out_expected);
-  SlowSeparable5(in, rect, weights, pool, &out_actual);
+  SlowSeparable5(in, rect, weights, pool, &out_expected, rect);
+  Separable5(in, rect, weights, pool, &out_actual);

  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
 }
@ -131,8 +167,8 @@ void VerifySeparable7(const size_t xsize, const size_t ysize, ThreadPool* pool,
                                     {HWY_REP4(0.383103f), HWY_REP4(0.241843f),
                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)}};

-  SlowSeparable7(in, rect, weights, pool, &out_expected);
-  Separable7(in, Rect(in), weights, pool, &out_actual);
+  SlowSeparable7(in, rect, weights, pool, &out_expected, rect);
+  Separable7(in, rect, weights, pool, &out_actual);

  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
 }
--- a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
@ -790,8 +790,9 @@ void FindBestFirstLevelDivisionForSquare(
  }
 }

-void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
-                    const ACSConfig& config, const Rect& rect) {
+void ProcessRectACS(const CompressParams& cparams, const ACSConfig& config,
+                    const Rect& rect, const ColorCorrelationMap& cmap,
+                    AcStrategyImage* ac_strategy) {
  // Main philosophy here:
  // 1. First find best 8x8 transform for each area.
  // 2. Merging them into larger transforms where possibly, but
@ -802,9 +803,7 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
  // maps happen to be at that resolution, and having
  // integral transforms cross these boundaries leads to
  // additional complications.
-  const CompressParams& cparams = enc_state->cparams;
  const float butteraugli_target = cparams.butteraugli_distance;
-  AcStrategyImage* ac_strategy = &enc_state->shared.ac_strategy;
  const size_t dct_scratch_size =
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
  // TODO(veluca): reuse allocations
@ -821,11 +820,9 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
  size_t tx = bx / kColorTileDimInBlocks;
  size_t ty = by / kColorTileDimInBlocks;
  const float cmap_factors[3] = {
-      enc_state->shared.cmap.YtoXRatio(
-          enc_state->shared.cmap.ytox_map.ConstRow(ty)[tx]),
+      cmap.YtoXRatio(cmap.ytox_map.ConstRow(ty)[tx]),
      0.0f,
-      enc_state->shared.cmap.YtoBRatio(
-          enc_state->shared.cmap.ytob_map.ConstRow(ty)[tx]),
+      cmap.YtoBRatio(cmap.ytob_map.ConstRow(ty)[tx]),
  };
  if (cparams.speed_tier > SpeedTier::kHare) return;
  // First compute the best 8x8 transform for each square. Later, we do not
@ -1040,28 +1037,26 @@ HWY_AFTER_NAMESPACE();
 namespace jxl {
 HWY_EXPORT(ProcessRectACS);

-void AcStrategyHeuristics::Init(const Image3F& src,
-                                PassesEncoderState* enc_state) {
-  this->enc_state = enc_state;
-  config.dequant = &enc_state->shared.matrices;
-  const CompressParams& cparams = enc_state->cparams;
+void AcStrategyHeuristics::Init(const Image3F& src, const Rect& rect_in,
+                                const ImageF& quant_field, const ImageF& mask,
+                                const ImageF& mask1x1,
+                                DequantMatrices* matrices) {
+  config.dequant = matrices;

  if (cparams.speed_tier >= SpeedTier::kCheetah) {
-    JXL_CHECK(enc_state->shared.matrices.EnsureComputed(1));  // DCT8 only
+    JXL_CHECK(matrices->EnsureComputed(1));  // DCT8 only
  } else {
    uint32_t acs_mask = 0;
    // All transforms up to 64x64.
    for (size_t i = 0; i < AcStrategy::DCT128X128; i++) {
      acs_mask |= (1 << i);
    }
-    JXL_CHECK(enc_state->shared.matrices.EnsureComputed(acs_mask));
+    JXL_CHECK(matrices->EnsureComputed(acs_mask));
  }

  // Image row pointers and strides.
-  config.quant_field_row = enc_state->initial_quant_field.Row(0);
-  config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow();
-  auto& mask = enc_state->initial_quant_masking;
-  auto& mask1x1 = enc_state->initial_quant_masking1x1;
+  config.quant_field_row = quant_field.Row(0);
+  config.quant_field_stride = quant_field.PixelsPerRow();
  if (mask.xsize() > 0 && mask.ysize() > 0) {
    config.masking_field_row = mask.Row(0);
    config.masking_field_stride = mask.PixelsPerRow();
@ -1071,9 +1066,9 @@ void AcStrategyHeuristics::Init(const Image3F& src,
    config.masking1x1_field_stride = mask1x1.PixelsPerRow();
  }

-  config.src_rows[0] = src.ConstPlaneRow(0, 0);
-  config.src_rows[1] = src.ConstPlaneRow(1, 0);
-  config.src_rows[2] = src.ConstPlaneRow(2, 0);
+  config.src_rows[0] = rect_in.ConstPlaneRow(src, 0, 0);
+  config.src_rows[1] = rect_in.ConstPlaneRow(src, 1, 0);
+  config.src_rows[2] = rect_in.ConstPlaneRow(src, 2, 0);
  config.src_stride = src.PixelsPerRow();

  // Entropy estimate is composed of two factors:
@ -1093,25 +1088,23 @@ void AcStrategyHeuristics::Init(const Image3F& src,
  config.info_loss_multiplier *= pow(ratio, kPow1);
  config.zeros_mul *= pow(ratio, kPow2);
  config.cost_delta *= pow(ratio, kPow3);
-  JXL_ASSERT(enc_state->shared.ac_strategy.xsize() ==
-             enc_state->shared.frame_dim.xsize_blocks);
-  JXL_ASSERT(enc_state->shared.ac_strategy.ysize() ==
-             enc_state->shared.frame_dim.ysize_blocks);
 }

-void AcStrategyHeuristics::ProcessRect(const Rect& rect) {
-  const CompressParams& cparams = enc_state->cparams;
+void AcStrategyHeuristics::ProcessRect(const Rect& rect,
+                                       const ColorCorrelationMap& cmap,
+                                       AcStrategyImage* ac_strategy) {
  // In Falcon mode, use DCT8 everywhere and uniform quantization.
  if (cparams.speed_tier >= SpeedTier::kCheetah) {
-    enc_state->shared.ac_strategy.FillDCT8(rect);
+    ac_strategy->FillDCT8(rect);
    return;
  }
  HWY_DYNAMIC_DISPATCH(ProcessRectACS)
-  (enc_state, config, rect);
+  (cparams, config, rect, cmap, ac_strategy);
 }

-void AcStrategyHeuristics::Finalize(AuxOut* aux_out) {
-  const auto& ac_strategy = enc_state->shared.ac_strategy;
+void AcStrategyHeuristics::Finalize(const FrameDimensions& frame_dim,
+                                    const AcStrategyImage& ac_strategy,
+                                    AuxOut* aux_out) {
  // Accounting and debug output.
  if (aux_out != nullptr) {
    aux_out->num_small_blocks =
@ -1147,10 +1140,9 @@ void AcStrategyHeuristics::Finalize(AuxOut* aux_out) {
  }

  // if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(aux_out)) {
-  if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(enc_state->cparams)) {
-    DumpAcStrategy(ac_strategy, enc_state->shared.frame_dim.xsize,
-                   enc_state->shared.frame_dim.ysize, "ac_strategy", aux_out,
-                   enc_state->cparams);
+  if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(cparams)) {
+    DumpAcStrategy(ac_strategy, frame_dim.xsize, frame_dim.ysize, "ac_strategy",
+                   aux_out, cparams);
  }
 }

--- a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
@ -25,11 +25,11 @@ struct AuxOut;

 struct ACSConfig {
  const DequantMatrices* JXL_RESTRICT dequant;
-  float* JXL_RESTRICT quant_field_row;
+  const float* JXL_RESTRICT quant_field_row;
  size_t quant_field_stride;
-  float* JXL_RESTRICT masking_field_row;
+  const float* JXL_RESTRICT masking_field_row;
  size_t masking_field_stride;
-  float* JXL_RESTRICT masking1x1_field_row;
+  const float* JXL_RESTRICT masking1x1_field_row;
  size_t masking1x1_field_stride;
  const float* JXL_RESTRICT src_rows[3];
  size_t src_stride;
@ -43,7 +43,7 @@ struct ACSConfig {
    JXL_DASSERT(masking_field_row[by * masking_field_stride + bx] > 0);
    return masking_field_row[by * masking_field_stride + bx];
  }
-  float* MaskingPtr1x1(size_t bx, size_t by) const {
+  const float* MaskingPtr1x1(size_t bx, size_t by) const {
    JXL_DASSERT(masking1x1_field_row[by * masking1x1_field_stride + bx] > 0);
    return &masking1x1_field_row[by * masking1x1_field_stride + bx];
  }
@ -54,11 +54,16 @@ struct ACSConfig {
 };

 struct AcStrategyHeuristics {
-  void Init(const Image3F& src, PassesEncoderState* enc_state);
-  void ProcessRect(const Rect& rect);
-  void Finalize(AuxOut* aux_out);
+  AcStrategyHeuristics(const CompressParams& cparams) : cparams(cparams) {}
+  void Init(const Image3F& src, const Rect& rect_in, const ImageF& quant_field,
+            const ImageF& mask, const ImageF& mask1x1,
+            DequantMatrices* matrices);
+  void ProcessRect(const Rect& rect, const ColorCorrelationMap& cmap,
+                   AcStrategyImage* ac_strategy);
+  void Finalize(const FrameDimensions& frame_dim,
+                const AcStrategyImage& ac_strategy, AuxOut* aux_out);
+  const CompressParams& cparams;
  ACSConfig config;
-  PassesEncoderState* enc_state;
 };

 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
@ -164,7 +164,8 @@ V SimpleGamma(const D d, V v) {

 template <class D, class V>
 V GammaModulation(const D d, const size_t x, const size_t y,
-                  const ImageF& xyb_x, const ImageF& xyb_y, const V out_val) {
+                  const ImageF& xyb_x, const ImageF& xyb_y, const Rect& rect,
+                  const V out_val) {
  const float kBias = 0.16f;
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]);
  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]);
@ -173,8 +174,8 @@ V GammaModulation(const D d, const size_t x, const size_t y,
  auto bias = Set(d, kBias);
  auto half = Set(d, 0.5f);
  for (size_t dy = 0; dy < 8; ++dy) {
-    const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy);
-    const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy);
+    const float* const JXL_RESTRICT row_in_x = rect.ConstRow(xyb_x, y + dy);
+    const float* const JXL_RESTRICT row_in_y = rect.ConstRow(xyb_y, y + dy);
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
      const auto iny = Add(Load(d, row_in_y + x + dx), bias);
      const auto inx = Load(d, row_in_x + x + dx);
@ -201,7 +202,7 @@ V GammaModulation(const D d, const size_t x, const size_t y,
 // Change precision in 8x8 blocks that have high frequency content.
 template <class D, class V>
 V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
-               const V out_val) {
+               const Rect& rect, const V out_val) {
  // Zero out the invalid differences for the rightmost value per row.
  const Rebind<uint32_t, D> du;
  HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u,
@ -212,9 +213,9 @@ V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
  static const float valmin = 0.020602694503245016f;
  auto valminv = Set(d, valmin);
  for (size_t dy = 0; dy < 8; ++dy) {
-    const float* JXL_RESTRICT row_in = xyb.Row(y + dy) + x;
+    const float* JXL_RESTRICT row_in = rect.ConstRow(xyb, y + dy) + x;
    const float* JXL_RESTRICT row_in_next =
-        dy == 7 ? row_in : xyb.Row(y + dy + 1) + x;
+        dy == 7 ? row_in : rect.ConstRow(xyb, y + dy + 1) + x;

    // In SCALAR, there is no guarantee of having extra row padding.
    // Hence, we need to ensure we don't access pixels outside the row itself.
@ -252,11 +253,8 @@ V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,

 void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
                         const ImageF& xyb_y, const ImageF& xyb_b,
-                         const float scale, const Rect& rect, ImageF* out) {
-  JXL_ASSERT(SameSize(xyb_x, xyb_y));
-  JXL_ASSERT(DivCeil(xyb_x.xsize(), kBlockDim) == out->xsize());
-  JXL_ASSERT(DivCeil(xyb_x.ysize(), kBlockDim) == out->ysize());
-
+                         const Rect& rect_in, const float scale,
+                         const Rect& rect_out, ImageF* out) {
  float base_level = 0.48f * scale;
  float kDampenRampStart = 2.0f;
  float kDampenRampEnd = 14.0f;
@ -270,16 +268,16 @@ void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
  }
  const float mul = scale * dampen;
  const float add = (1.0f - dampen) * base_level;
-  for (size_t iy = rect.y0(); iy < rect.y0() + rect.ysize(); iy++) {
+  for (size_t iy = rect_out.y0(); iy < rect_out.y1(); iy++) {
    const size_t y = iy * 8;
    float* const JXL_RESTRICT row_out = out->Row(iy);
    const HWY_CAPPED(float, kBlockDim) df;
-    for (size_t ix = rect.x0(); ix < rect.x0() + rect.xsize(); ix++) {
+    for (size_t ix = rect_out.x0(); ix < rect_out.x1(); ix++) {
      size_t x = ix * 8;
      auto out_val = Set(df, row_out[ix]);
      out_val = ComputeMask(df, out_val);
-      out_val = HfModulation(df, x, y, xyb_y, out_val);
-      out_val = GammaModulation(df, x, y, xyb_x, xyb_y, out_val);
+      out_val = HfModulation(df, x, y, xyb_y, rect_in, out_val);
+      out_val = GammaModulation(df, x, y, xyb_x, xyb_y, rect_in, out_val);
      // We want multiplicative quantization field, so everything
      // until this point has been modulating the exponent.
      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
@ -399,13 +397,6 @@ void FuzzyErosion(const float butteraugli_target, const Rect& from_rect,
 }

 struct AdaptiveQuantizationImpl {
-  void Init(const Image3F& xyb) {
-    JXL_DASSERT(xyb.xsize() % kBlockDim == 0);
-    JXL_DASSERT(xyb.ysize() % kBlockDim == 0);
-    const size_t xsize = xyb.xsize();
-    const size_t ysize = xyb.ysize();
-    aq_map = ImageF(xsize / kBlockDim, ysize / kBlockDim);
-  }
  void PrepareBuffers(size_t num_threads) {
    diff_buffer = ImageF(kEncTileDim + 8, num_threads);
    for (size_t i = pre_erosion.size(); i < num_threads; i++) {
@ -415,10 +406,10 @@ struct AdaptiveQuantizationImpl {
  }

  void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb,
-                   const Rect& rect, const int thread, ImageF* mask,
-                   ImageF* mask1x1) {
-    const size_t xsize = xyb.xsize();
-    const size_t ysize = xyb.ysize();
+                   const Rect& rect_in, const Rect& rect_out, const int thread,
+                   ImageF* mask, ImageF* mask1x1) {
+    const size_t xsize = rect_in.xsize();
+    const size_t ysize = rect_in.ysize();

    // The XYB gamma is 3.0 to be able to decode faster with two muls.
    // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
@ -429,11 +420,11 @@ struct AdaptiveQuantizationImpl {

    const HWY_FULL(float) df;

-    size_t y_start = rect.y0() * 8;
-    size_t y_end = y_start + rect.ysize() * 8;
+    size_t y_start = rect_out.y0() * 8;
+    size_t y_end = y_start + rect_out.ysize() * 8;

-    size_t x_start = rect.x0() * 8;
-    size_t x_end = x_start + rect.xsize() * 8;
+    size_t x_start = rect_out.x0() * 8;
+    size_t x_end = x_start + rect_out.xsize() * 8;

    // Computes image (padded to multiple of 8x8) of local pixel differences.
    // Subsample both directions by 4.
@ -441,9 +432,9 @@ struct AdaptiveQuantizationImpl {
    for (size_t y = y_start; y < y_end; ++y) {
      const size_t y2 = y + 1 < ysize ? y + 1 : y;
      const size_t y1 = y > 0 ? y - 1 : y;
-      const float* row_in = xyb.PlaneRow(1, y);
-      const float* row_in1 = xyb.PlaneRow(1, y1);
-      const float* row_in2 = xyb.PlaneRow(1, y2);
+      const float* row_in = rect_in.ConstPlaneRow(xyb, 1, y);
+      const float* row_in1 = rect_in.ConstPlaneRow(xyb, 1, y1);
+      const float* row_in2 = rect_in.ConstPlaneRow(xyb, 1, y2);
      float* mask1x1_out = mask1x1->Row(y);
      auto scalar_pixel1x1 = [&](size_t x) {
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
@ -466,9 +457,9 @@ struct AdaptiveQuantizationImpl {
    }

    if (x_start != 0) x_start -= 4;
-    if (x_end != xyb.xsize()) x_end += 4;
+    if (x_end != rect_in.xsize()) x_end += 4;
    if (y_start != 0) y_start -= 4;
-    if (y_end != xyb.ysize()) y_end += 4;
+    if (y_end != rect_in.ysize()) y_end += 4;
    pre_erosion[thread].ShrinkTo((x_end - x_start) / 4, (y_end - y_start) / 4);

    static const float limit = 0.2f;
@ -476,9 +467,9 @@ struct AdaptiveQuantizationImpl {
      size_t y2 = y + 1 < ysize ? y + 1 : y;
      size_t y1 = y > 0 ? y - 1 : y;

-      const float* row_in = xyb.PlaneRow(1, y);
-      const float* row_in1 = xyb.PlaneRow(1, y1);
-      const float* row_in2 = xyb.PlaneRow(1, y2);
+      const float* row_in = rect_in.ConstPlaneRow(xyb, 1, y);
+      const float* row_in1 = rect_in.ConstPlaneRow(xyb, 1, y1);
+      const float* row_in2 = rect_in.ConstPlaneRow(xyb, 1, y2);
      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);

      auto scalar_pixel = [&](size_t x) {
@ -543,26 +534,25 @@ struct AdaptiveQuantizationImpl {
      }
    }
    Rect from_rect(x_start % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
-                   rect.xsize() * 2, rect.ysize() * 2);
-    FuzzyErosion(butteraugli_target, from_rect, pre_erosion[thread], rect,
+                   rect_out.xsize() * 2, rect_out.ysize() * 2);
+    FuzzyErosion(butteraugli_target, from_rect, pre_erosion[thread], rect_out,
                 &aq_map);
-    for (size_t y = 0; y < rect.ysize(); ++y) {
-      const float* aq_map_row = rect.ConstRow(aq_map, y);
-      float* mask_row = rect.Row(mask, y);
-      for (size_t x = 0; x < rect.xsize(); ++x) {
+    for (size_t y = 0; y < rect_out.ysize(); ++y) {
+      const float* aq_map_row = rect_out.ConstRow(aq_map, y);
+      float* mask_row = rect_out.Row(mask, y);
+      for (size_t x = 0; x < rect_out.xsize(); ++x) {
        mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]);
      }
    }
    PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1),
-                        xyb.Plane(2), scale, rect, &aq_map);
+                        xyb.Plane(2), rect_in, scale, rect_out, &aq_map);
  }
  std::vector<ImageF> pre_erosion;
  ImageF aq_map;
  ImageF diff_buffer;
 };

-static void Blur1x1Masking(const FrameDimensions& frame_dim, ThreadPool* pool,
-                           ImageF* mask1x1) {
+static void Blur1x1Masking(ThreadPool* pool, ImageF* mask1x1) {
  // Blur the mask1x1 to obtain the masking image.
  // Before blurring it contains an image of absolute value of the
  // Laplacian of the intensity channel.
@ -588,47 +578,47 @@ static void Blur1x1Masking(const FrameDimensions& frame_dim, ThreadPool* pool,
                        {HWY_REP4(normalize_mul * kFilterMask1x1[1])},
                        {HWY_REP4(normalize_mul * kFilterMask1x1[4])},
                        {HWY_REP4(normalize_mul * kFilterMask1x1[3])}};
-  Rect from_rect(0, 0, 8 * frame_dim.xsize_blocks, 8 * frame_dim.ysize_blocks);
+  Rect from_rect(0, 0, mask1x1->xsize(), mask1x1->ysize());
  ImageF temp(mask1x1->xsize(), mask1x1->ysize());
  Symmetric5(*mask1x1, from_rect, weights, pool, &temp);
  CopyImageTo(temp, mask1x1);  // TODO: make it a swap
 }

 ImageF AdaptiveQuantizationMap(const float butteraugli_target,
-                               const Image3F& xyb,
-                               const FrameDimensions& frame_dim, float scale,
-                               ThreadPool* pool, ImageF* mask,
+                               const Image3F& xyb, const Rect& rect,
+                               float scale, ThreadPool* pool, ImageF* mask,
                               ImageF* mask1x1) {
+  JXL_DASSERT(rect.xsize() % kBlockDim == 0);
+  JXL_DASSERT(rect.ysize() % kBlockDim == 0);
  AdaptiveQuantizationImpl impl;
-  impl.Init(xyb);
-  *mask = ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
-  *mask1x1 = ImageF(8 * frame_dim.xsize_blocks, 8 * frame_dim.ysize_blocks);
+  const size_t xsize_blocks = rect.xsize() / kBlockDim;
+  const size_t ysize_blocks = rect.ysize() / kBlockDim;
+  impl.aq_map = ImageF(xsize_blocks, ysize_blocks);
+  *mask = ImageF(xsize_blocks, ysize_blocks);
+  *mask1x1 = ImageF(rect.xsize(), rect.ysize());
  JXL_CHECK(RunOnPool(
      pool, 0,
-      DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks) *
-          DivCeil(frame_dim.ysize_blocks, kEncTileDimInBlocks),
+      DivCeil(xsize_blocks, kEncTileDimInBlocks) *
+          DivCeil(ysize_blocks, kEncTileDimInBlocks),
      [&](const size_t num_threads) {
        impl.PrepareBuffers(num_threads);
        return true;
      },
      [&](const uint32_t tid, const size_t thread) {
-        size_t n_enc_tiles =
-            DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks);
+        size_t n_enc_tiles = DivCeil(xsize_blocks, kEncTileDimInBlocks);
        size_t tx = tid % n_enc_tiles;
        size_t ty = tid / n_enc_tiles;
        size_t by0 = ty * kEncTileDimInBlocks;
-        size_t by1 =
-            std::min((ty + 1) * kEncTileDimInBlocks, frame_dim.ysize_blocks);
+        size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, ysize_blocks);
        size_t bx0 = tx * kEncTileDimInBlocks;
-        size_t bx1 =
-            std::min((tx + 1) * kEncTileDimInBlocks, frame_dim.xsize_blocks);
-        Rect r(bx0, by0, bx1 - bx0, by1 - by0);
-        impl.ComputeTile(butteraugli_target, scale, xyb, r, thread, mask,
-                         mask1x1);
+        size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, xsize_blocks);
+        Rect rect_out(bx0, by0, bx1 - bx0, by1 - by0);
+        impl.ComputeTile(butteraugli_target, scale, xyb, rect, rect_out, thread,
+                         mask, mask1x1);
      },
      "AQ DiffPrecompute"));

-  Blur1x1Masking(frame_dim, pool, mask1x1);
+  Blur1x1Masking(pool, mask1x1);
  return std::move(impl).aq_map;
 }

@ -770,8 +760,9 @@ ImageBundle RoundtripImage(const FrameHeader& frame_header,
  size_t num_special_frames = enc_state->special_frames.size();
  size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
  ModularFrameEncoder modular_frame_encoder(frame_header, enc_state->cparams);
-  JXL_CHECK(InitializePassesEncoder(frame_header, opsin, cms, pool, enc_state,
-                                    &modular_frame_encoder, nullptr));
+  JXL_CHECK(InitializePassesEncoder(frame_header, opsin, Rect(opsin), cms, pool,
+                                    enc_state, &modular_frame_encoder,
+                                    nullptr));
  JXL_CHECK(dec_state->Init(frame_header));
  JXL_CHECK(dec_state->InitForAC(num_passes, pool));

@ -830,7 +821,7 @@ constexpr int kMaxButteraugliIters = 4;

 void FindBestQuantization(const FrameHeader& frame_header,
                          const Image3F& linear, const Image3F& opsin,
-                          PassesEncoderState* enc_state,
+                          ImageF& quant_field, PassesEncoderState* enc_state,
                          const JxlCmsInterface& cms, ThreadPool* pool,
                          AuxOut* aux_out) {
  const CompressParams& cparams = enc_state->cparams;
@ -843,7 +834,6 @@ void FindBestQuantization(const FrameHeader& frame_header,
  }
  Quantizer& quantizer = enc_state->shared.quantizer;
  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
-  ImageF& quant_field = enc_state->initial_quant_field;

  const float butteraugli_target = cparams.butteraugli_distance;
  const float original_butteraugli = cparams.original_butteraugli_distance;
@ -998,7 +988,7 @@ void FindBestQuantization(const FrameHeader& frame_header,
 }

 void FindBestQuantizationMaxError(const FrameHeader& frame_header,
-                                  const Image3F& opsin,
+                                  const Image3F& opsin, ImageF& quant_field,
                                  PassesEncoderState* enc_state,
                                  const JxlCmsInterface& cms, ThreadPool* pool,
                                  AuxOut* aux_out) {
@ -1006,7 +996,6 @@ void FindBestQuantizationMaxError(const FrameHeader& frame_header,
  const CompressParams& cparams = enc_state->cparams;
  Quantizer& quantizer = enc_state->shared.quantizer;
  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
-  ImageF& quant_field = enc_state->initial_quant_field;

  // TODO(veluca): better choice of this value.
  const float initial_quant_dc =
@ -1138,26 +1127,26 @@ float InitialQuantDC(float butteraugli_target) {
 }

 ImageF InitialQuantField(const float butteraugli_target, const Image3F& opsin,
-                         const FrameDimensions& frame_dim, ThreadPool* pool,
-                         float rescale, ImageF* mask, ImageF* mask1x1) {
+                         const Rect& rect, ThreadPool* pool, float rescale,
+                         ImageF* mask, ImageF* mask1x1) {
  const float quant_ac = kAcQuant / butteraugli_target;
  return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)(
-      butteraugli_target, opsin, frame_dim, quant_ac * rescale, pool, mask,
-      mask1x1);
+      butteraugli_target, opsin, rect, quant_ac * rescale, pool, mask, mask1x1);
 }

 void FindBestQuantizer(const FrameHeader& frame_header, const Image3F* linear,
-                       const Image3F& opsin, PassesEncoderState* enc_state,
+                       const Image3F& opsin, ImageF& quant_field,
+                       PassesEncoderState* enc_state,
                       const JxlCmsInterface& cms, ThreadPool* pool,
                       AuxOut* aux_out, double rescale) {
  const CompressParams& cparams = enc_state->cparams;
  if (cparams.max_error_mode) {
-    FindBestQuantizationMaxError(frame_header, opsin, enc_state, cms, pool,
-                                 aux_out);
+    FindBestQuantizationMaxError(frame_header, opsin, quant_field, enc_state,
+                                 cms, pool, aux_out);
  } else if (linear && cparams.speed_tier <= SpeedTier::kKitten) {
    // Normal encoding to a butteraugli score.
-    FindBestQuantization(frame_header, *linear, opsin, enc_state, cms, pool,
-                         aux_out);
+    FindBestQuantization(frame_header, *linear, opsin, quant_field, enc_state,
+                         cms, pool, aux_out);
  }
 }

--- a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
@ -37,8 +37,8 @@ struct AuxOut;
 // fine-grained quantization should be enough. Returns a mask, too, which
 // can later be used to make better decisions about ac strategy.
 ImageF InitialQuantField(float butteraugli_target, const Image3F& opsin,
-                         const FrameDimensions& frame_dim, ThreadPool* pool,
-                         float rescale, ImageF* initial_quant_mask,
+                         const Rect& rect, ThreadPool* pool, float rescale,
+                         ImageF* initial_quant_mask,
                         ImageF* initial_quant_mask1x1);

 float InitialQuantDC(float butteraugli_target);
@ -51,7 +51,8 @@ void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
 // dequant_float_map and chosen quantization levels.
 // `linear` is only used in Kitten mode or slower.
 void FindBestQuantizer(const FrameHeader& frame_header, const Image3F* linear,
-                       const Image3F& opsin, PassesEncoderState* enc_state,
+                       const Image3F& opsin, ImageF& quant_field,
+                       PassesEncoderState* enc_state,
                       const JxlCmsInterface& cms, ThreadPool* pool,
                       AuxOut* aux_out, double rescale = 1.0);

--- a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
@ -40,18 +40,14 @@ using hwy::HWY_NAMESPACE::Mul;
 using hwy::HWY_NAMESPACE::MulAdd;
 using hwy::HWY_NAMESPACE::Sqrt;

-void ProcessTile(const FrameHeader& frame_header, const Image3F& opsin,
-                 PassesEncoderState* enc_state, const Rect& rect,
+void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
+                 const Image3F& opsin, const Rect& opsin_rect,
+                 const ImageF& quant_field, const AcStrategyImage& ac_strategy,
+                 ImageB* epf_sharpness, const Rect& rect,
                 ArControlFieldHeuristics::TempImages* temp_image) {
  constexpr size_t N = kBlockDim;
-  ImageB* JXL_RESTRICT epf_sharpness = &enc_state->shared.epf_sharpness;
-  ImageF* JXL_RESTRICT quant = &enc_state->initial_quant_field;
-  JXL_ASSERT(
-      epf_sharpness->xsize() == enc_state->shared.frame_dim.xsize_blocks &&
-      epf_sharpness->ysize() == enc_state->shared.frame_dim.ysize_blocks);
-
-  if (enc_state->cparams.butteraugli_distance < kMinButteraugliForDynamicAR ||
-      enc_state->cparams.speed_tier > SpeedTier::kWombat ||
+  if (cparams.butteraugli_distance < kMinButteraugliForDynamicAR ||
+      cparams.speed_tier > SpeedTier::kWombat ||
      frame_header.loop_filter.epf_iters == 0) {
    FillPlane(static_cast<uint8_t>(4), epf_sharpness, rect);
    return;
@ -76,11 +72,13 @@ void ProcessTile(const FrameHeader& frame_header, const Image3F& opsin,
  // (for example 32x32 dct). This relates to transforms ability
  // to propagate artefacts.
  size_t y0 = by0 == 0 ? 2 : 0;
-  size_t y1 = by1 * N + 4 <= opsin.ysize() + 2 ? (by1 - by0) * N + 4
-                                               : opsin.ysize() + 2 - by0 * N;
+  size_t y1 = by1 * N + 4 <= opsin_rect.ysize() + 2
+                  ? (by1 - by0) * N + 4
+                  : opsin_rect.ysize() + 2 - by0 * N;
  size_t x0 = bx0 == 0 ? 2 : 0;
-  size_t x1 = bx1 * N + 4 <= opsin.xsize() + 2 ? (bx1 - bx0) * N + 4
-                                               : opsin.xsize() + 2 - bx0 * N;
+  size_t x1 = bx1 * N + 4 <= opsin_rect.xsize() + 2
+                  ? (bx1 - bx0) * N + 4
+                  : opsin_rect.xsize() + 2 - bx0 * N;
  HWY_FULL(float) df;
  for (size_t y = y0; y < y1; y++) {
    float* JXL_RESTRICT laplacian_sqrsum_row = laplacian_sqrsum.Row(y);
@ -89,14 +87,15 @@ void ProcessTile(const FrameHeader& frame_header, const Image3F& opsin,
    const float* JXL_RESTRICT in_row[3];
    const float* JXL_RESTRICT in_row_b[3];
    for (size_t c = 0; c < 3; c++) {
-      in_row_t[c] = opsin.PlaneRow(c, cy > 0 ? cy - 1 : cy);
-      in_row[c] = opsin.PlaneRow(c, cy);
-      in_row_b[c] = opsin.PlaneRow(c, cy + 1 < opsin.ysize() ? cy + 1 : cy);
+      in_row_t[c] = opsin_rect.ConstPlaneRow(opsin, c, cy > 0 ? cy - 1 : cy);
+      in_row[c] = opsin_rect.ConstPlaneRow(opsin, c, cy);
+      in_row_b[c] = opsin_rect.ConstPlaneRow(
+          opsin, c, cy + 1 < opsin_rect.ysize() ? cy + 1 : cy);
    }
    auto compute_laplacian_scalar = [&](size_t x) {
      size_t cx = x + bx0 * N - 2;
      const size_t prevX = cx >= 1 ? cx - 1 : cx;
-      const size_t nextX = cx + 1 < opsin.xsize() ? cx + 1 : cx;
+      const size_t nextX = cx + 1 < opsin_rect.xsize() ? cx + 1 : cx;
      float sumsqr = 0;
      for (size_t c = 0; c < 3; c++) {
        float laplacian =
@ -114,7 +113,8 @@ void ProcessTile(const FrameHeader& frame_header, const Image3F& opsin,
      compute_laplacian_scalar(x);
    }
    // Interior. One extra pixel of border as the last pixel is special.
-    for (; x + Lanes(df) <= x1 && x + Lanes(df) + bx0 * N - 1 <= opsin.xsize();
+    for (; x + Lanes(df) <= x1 &&
+           x + Lanes(df) + bx0 * N - 1 <= opsin_rect.xsize();
         x += Lanes(df)) {
      size_t cx = x + bx0 * N - 2;
      auto sumsqr = Zero(df);
@ -182,16 +182,16 @@ void ProcessTile(const FrameHeader& frame_header, const Image3F& opsin,
    // ignore pixels outside the image.
    // Y coordinates are relative to by0*8+y*4.
    size_t sy = y * 4 + by0 * 8 > 0 ? 0 : 2;
-    size_t ey = y * 4 + by0 * 8 + 4 <= opsin.ysize() + 2
+    size_t ey = y * 4 + by0 * 8 + 4 <= opsin_rect.ysize() + 2
                    ? 4
-                    : opsin.ysize() - y * 4 - by0 * 8 + 2;
+                    : opsin_rect.ysize() - y * 4 - by0 * 8 + 2;
    for (size_t x = 0; x < (bx1 - bx0) * 2 + 1; x++) {
      // ignore pixels outside the image.
      // X coordinates are relative to bx0*8.
      size_t sx = x * 4 + bx0 * 8 > 0 ? x * 4 : x * 4 + 2;
-      size_t ex = x * 4 + bx0 * 8 + 4 <= opsin.xsize() + 2
+      size_t ex = x * 4 + bx0 * 8 + 4 <= opsin_rect.xsize() + 2
                      ? x * 4 + 4
-                      : opsin.xsize() - bx0 * 8 + 2;
+                      : opsin_rect.xsize() - bx0 * 8 + 2;
      if (ex - sx == 4 && ey - sy == 4) {
        auto sum = Zero(df4);
        for (size_t iy = 0; iy < 4; iy++) {
@ -212,9 +212,9 @@ void ProcessTile(const FrameHeader& frame_header, const Image3F& opsin,
    }
  }
  for (size_t by = by0; by < by1; by++) {
-    AcStrategyRow acs_row = enc_state->shared.ac_strategy.ConstRow(by);
+    AcStrategyRow acs_row = ac_strategy.ConstRow(by);
    uint8_t* JXL_RESTRICT out_row = epf_sharpness->Row(by);
-    float* JXL_RESTRICT quant_row = quant->Row(by);
+    const float* JXL_RESTRICT quant_row = quant_field.Row(by);
    for (size_t bx = bx0; bx < bx1; bx++) {
      AcStrategy acs = acs_row[bx];
      if (!acs.IsFirstBlock()) continue;
@ -311,13 +311,14 @@ HWY_AFTER_NAMESPACE();
 namespace jxl {
 HWY_EXPORT(ProcessTile);

-void ArControlFieldHeuristics::RunRect(const FrameHeader& frame_header,
-                                       const Rect& block_rect,
-                                       const Image3F& opsin,
-                                       PassesEncoderState* enc_state,
-                                       size_t thread) {
+void ArControlFieldHeuristics::RunRect(
+    const CompressParams& cparams, const FrameHeader& frame_header,
+    const Rect& block_rect, const Image3F& opsin, const Rect& opsin_rect,
+    const ImageF& quant_field, const AcStrategyImage& ac_strategy,
+    ImageB* epf_sharpness, size_t thread) {
  HWY_DYNAMIC_DISPATCH(ProcessTile)
-  (frame_header, opsin, enc_state, block_rect, &temp_images[thread]);
+  (cparams, frame_header, opsin, opsin_rect, quant_field, ac_strategy,
+   epf_sharpness, block_rect, &temp_images[thread]);
 }

 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
@ -36,14 +36,13 @@ struct ArControlFieldHeuristics {
    temp_images.resize(num_threads);
  }

-  void RunRect(const FrameHeader& frame_header, const Rect& block_rect,
-               const Image3F& opsin, PassesEncoderState* enc_state,
+  void RunRect(const CompressParams& cparams, const FrameHeader& frame_header,
+               const Rect& block_rect, const Image3F& opsin,
+               const Rect& opsin_rect, const ImageF& quant_field,
+               const AcStrategyImage& ac_strategy, ImageB* epf_sharpness,
               size_t thread);

  std::vector<TempImages> temp_images;
-  ImageB* epf_sharpness;
-  ImageF* quant;
-  bool all_default;
 };

 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc
@ -5,6 +5,10 @@

 #include "lib/jxl/enc_aux_out.h"

+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <stddef.h>
 #include <stdint.h>
--- a/third_party/jpeg-xl/lib/jxl/enc_cache.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_cache.cc
@ -35,8 +35,9 @@
 namespace jxl {

 Status InitializePassesEncoder(const FrameHeader& frame_header,
-                               const Image3F& opsin, const JxlCmsInterface& cms,
-                               ThreadPool* pool, PassesEncoderState* enc_state,
+                               const Image3F& opsin, const Rect& rect,
+                               const JxlCmsInterface& cms, ThreadPool* pool,
+                               PassesEncoderState* enc_state,
                               ModularFrameEncoder* modular_frame_encoder,
                               AuxOut* aux_out) {
  PassesSharedState& JXL_RESTRICT shared = enc_state->shared;
@ -68,7 +69,7 @@ Status InitializePassesEncoder(const FrameHeader& frame_header,
  JXL_RETURN_IF_ERROR(RunOnPool(
      pool, 0, shared.frame_dim.num_groups, ThreadPool::NoInit,
      [&](size_t group_idx, size_t _) {
-        ComputeCoefficients(group_idx, enc_state, opsin, &dc);
+        ComputeCoefficients(group_idx, enc_state, opsin, rect, &dc);
      },
      "Compute coeffs"));

--- a/third_party/jpeg-xl/lib/jxl/enc_cache.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_cache.h
@ -40,10 +40,6 @@ struct PassesEncoderState {
  bool initialize_global_state = true;
  size_t dc_group_index = 0;

-  ImageF initial_quant_field;    // Invalid in Falcon mode.
-  ImageF initial_quant_masking;  // Invalid in Falcon mode.
-  ImageF initial_quant_masking1x1;  // Invalid in Falcon mode.
-
  // Per-pass DCT coefficients for the image. One row per group.
  std::vector<std::unique_ptr<ACImage>> coeffs;

@ -77,8 +73,8 @@ struct PassesEncoderState {
 // Initialize per-frame information.
 class ModularFrameEncoder;
 Status InitializePassesEncoder(const FrameHeader& frame_header,
-                               const Image3F& opsin, const JxlCmsInterface& cms,
-                               ThreadPool* pool,
+                               const Image3F& opsin, const Rect& rect,
+                               const JxlCmsInterface& cms, ThreadPool* pool,
                               PassesEncoderState* passes_enc_state,
                               ModularFrameEncoder* modular_frame_encoder,
                               AuxOut* aux_out);
--- a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
@ -205,22 +205,23 @@ void ComputeDC(const ImageF& dc_values, bool fast, int32_t* dc_x,
                             jxl::cms::kYToBRatio, kDistanceMultiplierDC, fast);
 }

-void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
+void ComputeTile(const Image3F& opsin, const Rect& opsin_rect,
+                 const DequantMatrices& dequant,
                 const AcStrategyImage* ac_strategy,
                 const ImageI* raw_quant_field, const Quantizer* quantizer,
-                 const Rect& r, bool fast, bool use_dct8, ImageSB* map_x,
+                 const Rect& rect, bool fast, bool use_dct8, ImageSB* map_x,
                 ImageSB* map_b, ImageF* dc_values, float* mem) {
  static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
                "Invalid color tile dim");
-  size_t xsize_blocks = opsin.xsize() / kBlockDim;
+  size_t xsize_blocks = opsin_rect.xsize() / kBlockDim;
  constexpr float kDistanceMultiplierAC = 1e-9f;
  const size_t dct_scratch_size =
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;

-  const size_t y0 = r.y0();
-  const size_t x0 = r.x0();
-  const size_t x1 = r.x0() + r.xsize();
-  const size_t y1 = r.y0() + r.ysize();
+  const size_t y0 = rect.y0();
+  const size_t x0 = rect.x0();
+  const size_t x1 = rect.x0() + rect.xsize();
+  const size_t y1 = rect.y0() + rect.ysize();

  int ty = y0 / kColorTileDimInBlocks;
  int tx = x0 / kColorTileDimInBlocks;
@ -257,9 +258,12 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
  size_t num_ac = 0;

  for (size_t y = y0; y < y1; ++y) {
-    const float* JXL_RESTRICT row_y = opsin.ConstPlaneRow(1, y * kBlockDim);
-    const float* JXL_RESTRICT row_x = opsin.ConstPlaneRow(0, y * kBlockDim);
-    const float* JXL_RESTRICT row_b = opsin.ConstPlaneRow(2, y * kBlockDim);
+    const float* JXL_RESTRICT row_y =
+        opsin_rect.ConstPlaneRow(opsin, 1, y * kBlockDim);
+    const float* JXL_RESTRICT row_x =
+        opsin_rect.ConstPlaneRow(opsin, 0, y * kBlockDim);
+    const float* JXL_RESTRICT row_b =
+        opsin_rect.ConstPlaneRow(opsin, 2, y * kBlockDim);
    size_t stride = opsin.PixelsPerRow();

    for (size_t x = x0; x < x1; x++) {
@ -362,14 +366,15 @@ HWY_EXPORT(InitDCStorage);
 HWY_EXPORT(ComputeDC);
 HWY_EXPORT(ComputeTile);

-void CfLHeuristics::Init(const Image3F& opsin) {
-  size_t xsize_blocks = opsin.xsize() / kBlockDim;
-  size_t ysize_blocks = opsin.ysize() / kBlockDim;
+void CfLHeuristics::Init(const Rect& rect) {
+  size_t xsize_blocks = rect.xsize() / kBlockDim;
+  size_t ysize_blocks = rect.ysize() / kBlockDim;
  HWY_DYNAMIC_DISPATCH(InitDCStorage)
  (xsize_blocks * ysize_blocks, &dc_values);
 }

 void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
+                                const Rect& opsin_rect,
                                const DequantMatrices& dequant,
                                const AcStrategyImage* ac_strategy,
                                const ImageI* raw_quant_field,
@ -377,8 +382,8 @@ void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
                                size_t thread, ColorCorrelationMap* cmap) {
  bool use_dct8 = ac_strategy == nullptr;
  HWY_DYNAMIC_DISPATCH(ComputeTile)
-  (opsin, dequant, ac_strategy, raw_quant_field, quantizer, r, fast, use_dct8,
-   &cmap->ytox_map, &cmap->ytob_map, &dc_values,
+  (opsin, opsin_rect, dequant, ac_strategy, raw_quant_field, quantizer, r, fast,
+   use_dct8, &cmap->ytox_map, &cmap->ytob_map, &dc_values,
   mem.get() + thread * ItemsPerThread());
 }

--- a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
@ -29,13 +29,13 @@ void ColorCorrelationMapEncodeDC(const ColorCorrelationMap& map,
                                 AuxOut* aux_out);

 struct CfLHeuristics {
-  void Init(const Image3F& opsin);
+  void Init(const Rect& rect);

  void PrepareForThreads(size_t num_threads) {
    mem = hwy::AllocateAligned<float>(num_threads * ItemsPerThread());
  }

-  void ComputeTile(const Rect& r, const Image3F& opsin,
+  void ComputeTile(const Rect& r, const Image3F& opsin, const Rect& opsin_rect,
                   const DequantMatrices& dequant,
                   const AcStrategyImage* ac_strategy,
                   const ImageI* raw_quant_field, const Quantizer* quantizer,
--- a/third_party/jpeg-xl/lib/jxl/enc_cluster.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_cluster.cc
@ -82,13 +82,33 @@ float HistogramDistance(const Histogram& a, const Histogram& b) {
  return total_distance - a.entropy_ - b.entropy_;
 }

-bool HasNewSymbol(const Histogram& a, const Histogram& b) {
-  for (size_t i = 0; i < a.data_.size(); ++i) {
-    if (a.data_[i] > 0 && (i >= b.data_.size() || b.data_[i] == 0)) {
-      return true;
-    }
+constexpr const float kInfinity = std::numeric_limits<float>::infinity();
+
+float HistogramKLDivergence(const Histogram& actual, const Histogram& coding) {
+  if (actual.total_count_ == 0) return 0;
+  if (coding.total_count_ == 0) return kInfinity;
+
+  const HWY_CAPPED(float, Histogram::kRounding) df;
+  const HWY_CAPPED(int32_t, Histogram::kRounding) di;
+
+  const auto coding_inv = Set(df, 1.0f / coding.total_count_);
+  auto cost_lanes = Zero(df);
+
+  for (size_t i = 0; i < actual.data_.size(); i += Lanes(di)) {
+    const auto counts = LoadU(di, &actual.data_[i]);
+    const auto coding_counts =
+        coding.data_.size() > i ? LoadU(di, &coding.data_[i]) : Zero(di);
+    const auto coding_probs = Mul(ConvertTo(df, coding_counts), coding_inv);
+    const auto neg_coding_cost = BitCast(
+        df,
+        IfThenZeroElse(Eq(counts, Zero(di)),
+                       IfThenElse(Eq(coding_counts, Zero(di)),
+                                  BitCast(di, Set(df, -kInfinity)),
+                                  BitCast(di, FastLog2f(df, coding_probs)))));
+    cost_lanes = NegMulAdd(ConvertTo(df, counts), neg_coding_cost, cost_lanes);
  }
-  return false;
+  const float total_cost = GetLane(SumOfLanes(df, cost_lanes));
+  return total_cost - actual.entropy_;
 }

 // First step of a k-means clustering with a fancy distance metric.
@ -121,7 +141,7 @@ void FastClusterHistograms(const std::vector<Histogram>& in,
    for (size_t i = 0; i < in.size(); i++) {
      if (dists[i] == 0.0f) continue;
      for (size_t j = 0; j < prev_histograms; ++j) {
-        dists[i] = std::min(HistogramDistance(in[i], (*out)[j]), dists[i]);
+        dists[i] = std::min(HistogramKLDivergence(in[i], (*out)[j]), dists[i]);
      }
    }
    auto max_dist = std::max_element(dists.begin(), dists.end());
@ -149,10 +169,8 @@ void FastClusterHistograms(const std::vector<Histogram>& in,
    size_t best = 0;
    float best_dist = std::numeric_limits<float>::max();
    for (size_t j = 0; j < out->size(); j++) {
-      // TODO(szabadka) Choose a different distance metric for previous
-      // histograms (i.e. the cost of the new histogram with the old one).
-      if (j < prev_histograms && HasNewSymbol(in[i], (*out)[j])) continue;
-      float dist = HistogramDistance(in[i], (*out)[j]);
+      float dist = j < prev_histograms ? HistogramKLDivergence(in[i], (*out)[j])
+                                       : HistogramDistance(in[i], (*out)[j]);
      if (dist < best_dist) {
        best = j;
        best_dist = dist;
--- a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
@ -706,12 +706,12 @@ void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
  if (!is_last) {
    output->Write(2, 0b00);  // can not be saved as reference
  }
-  output->Write(2, 0b00);     // a frame has no name
-  output->Write(1, 0);        // loop filter is not all_default
-  output->Write(1, 0);        // no gaborish
-  output->Write(2, 0);        // 0 EPF iters
-  output->Write(2, 0b00);     // No LF extensions
-  output->Write(2, 0b00);     // No FH extensions
+  output->Write(2, 0b00);  // a frame has no name
+  output->Write(1, 0);     // loop filter is not all_default
+  output->Write(1, 0);     // no gaborish
+  output->Write(2, 0);     // 0 EPF iters
+  output->Write(2, 0b00);  // No LF extensions
+  output->Write(2, 0b00);  // No FH extensions

  output->Write(1, 0);      // No TOC permutation
  output->ZeroPadToByte();  // TOC is byte-aligned.
@ -3632,13 +3632,11 @@ bool detect_palette(const unsigned char* r, size_t width,
  return collided;
 }

-// TODO(szabadka): Add some parameter to indicate whether the input is
-// truly streaming.
 template <typename BitDepth>
 JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,
                                     size_t width, size_t height,
                                     BitDepth bitdepth, size_t nb_chans,
-                                     bool big_endian, int effort) {
+                                     bool big_endian, int effort, int oneshot) {
  assert(width != 0);
  assert(height != 0);

@ -3647,7 +3645,7 @@ JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,
  std::vector<int16_t> lookup(kHashSize);
  lookup[0] = 0;
  int pcolors = 0;
-  bool collided = effort < 2 || bitdepth.bitdepth != 8;
+  bool collided = effort < 2 || bitdepth.bitdepth != 8 || !oneshot;
  for (size_t y0 = 0; y0 < height && !collided; y0 += 256) {
    size_t ys = std::min<size_t>(height - y0, 256);
    for (size_t x0 = 0; x0 < width && !collided; x0 += 256) {
@ -3728,11 +3726,7 @@ JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,

  bool onegroup = num_groups_x == 1 && num_groups_y == 1;

-  // sample the middle (effort * 2) rows of every group
-  // TODO(szabadka): Optimize sampling rule for low-memory code-path.
-  for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
-    size_t xg = g % num_groups_x;
-    size_t yg = g / num_groups_x;
+  auto sample_rows = [&](size_t xg, size_t yg, size_t num_rows) {
    size_t y0 = yg * 256;
    size_t x0 = xg * 256;
    size_t ys = std::min<size_t>(height - y0, 256);
@ -3742,12 +3736,31 @@ JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,
        input.get_color_channel_data_at(input.opaque, x0, y0, xs, ys, &stride);
    auto rgba = reinterpret_cast<const unsigned char*>(buffer);
    int y_begin = std::max<int>(0, ys - 2 * effort) / 2;
-    int y_count = std::min<int>(2 * effort * ys / 256, y0 + ys - y_begin - 1);
+    int y_count = std::min<int>(num_rows, y0 + ys - y_begin - 1);
    int x_max = xs / kChunkSize * kChunkSize;
    CollectSamples(rgba, 0, y_begin, x_max, stride, y_count, raw_counts,
                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
                   big_endian, lookup.data());
    input.release_buffer(input.opaque, buffer);
+  };
+
+  // TODO(veluca): that `64` is an arbitrary constant, meant to correspond to
+  // the point where the number of processed rows is large enough that loading
+  // the entire image is cost-effective.
+  if (oneshot || effort >= 64) {
+    for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
+      size_t xg = g % num_groups_x;
+      size_t yg = g / num_groups_x;
+      size_t y0 = yg * 256;
+      size_t ys = std::min<size_t>(height - y0, 256);
+      size_t num_rows = 2 * effort * ys / 256;
+      sample_rows(xg, yg, num_rows);
+    }
+  } else {
+    // sample the middle (effort * 2 * num_groups) rows of the center group
+    // (possibly all of them).
+    sample_rows((num_groups_x - 1) / 2, (num_groups_y - 1) / 2,
+                2 * effort * num_groups_x * num_groups_y);
  }

  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
@ -3946,24 +3959,25 @@ void LLProcess(JxlFastLosslessFrameState* frame_state, bool is_last,

 JxlFastLosslessFrameState* JxlFastLosslessPrepareImpl(
    JxlChunkedFrameInputSource input, size_t width, size_t height,
-    size_t nb_chans, size_t bitdepth, bool big_endian, int effort) {
+    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
+    int oneshot) {
  assert(bitdepth > 0);
  assert(nb_chans <= 4);
  assert(nb_chans != 0);
  if (bitdepth <= 8) {
    return LLPrepare(input, width, height, UpTo8Bits(bitdepth), nb_chans,
-                     big_endian, effort);
+                     big_endian, effort, oneshot);
  }
  if (bitdepth <= 13) {
    return LLPrepare(input, width, height, From9To13Bits(bitdepth), nb_chans,
-                     big_endian, effort);
+                     big_endian, effort, oneshot);
  }
  if (bitdepth == 14) {
    return LLPrepare(input, width, height, Exactly14Bits(bitdepth), nb_chans,
-                     big_endian, effort);
+                     big_endian, effort, oneshot);
  }
  return LLPrepare(input, width, height, MoreThan14Bits(bitdepth), nb_chans,
-                   big_endian, effort);
+                   big_endian, effort, oneshot);
 }

 void JxlFastLosslessProcessFrameImpl(
@ -4074,10 +4088,8 @@ class FJxlFrameInput {
        bytes_per_pixel_(bitdepth <= 8 ? nb_chans : 2 * nb_chans) {}

  JxlChunkedFrameInputSource GetInputSource() {
-    return JxlChunkedFrameInputSource{
-        this,
-        GetDataAt,
-        [](void*, const void*) {}};
+    return JxlChunkedFrameInputSource{this, GetDataAt,
+                                      [](void*, const void*) {}};
  }

 private:
@ -4099,9 +4111,9 @@ size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
                             unsigned char** output, void* runner_opaque,
                             FJxlParallelRunner runner) {
  FJxlFrameInput input(rgba, row_stride, nb_chans, bitdepth);
-  auto frame_state =
-      JxlFastLosslessPrepareFrame(input.GetInputSource(), width, height,
-                                  nb_chans, bitdepth, big_endian, effort);
+  auto frame_state = JxlFastLosslessPrepareFrame(
+      input.GetInputSource(), width, height, nb_chans, bitdepth, big_endian,
+      effort, /*oneshot=*/true);
  JxlFastLosslessProcessFrame(frame_state, /*is_last=*/true, runner_opaque,
                              runner, nullptr);
  JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
@ -4121,25 +4133,25 @@ size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,

 JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
    JxlChunkedFrameInputSource input, size_t width, size_t height,
-    size_t nb_chans, size_t bitdepth, int big_endian, int effort) {
+    size_t nb_chans, size_t bitdepth, int big_endian, int effort, int oneshot) {
 #if FJXL_ENABLE_AVX512
  if (__builtin_cpu_supports("avx512cd") &&
      __builtin_cpu_supports("avx512vbmi") &&
      __builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512f") &&
      __builtin_cpu_supports("avx512vl")) {
-    return AVX512::JxlFastLosslessPrepareImpl(input, width, height, nb_chans,
-                                              bitdepth, big_endian, effort);
+    return AVX512::JxlFastLosslessPrepareImpl(
+        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
  }
 #endif
 #if FJXL_ENABLE_AVX2
  if (__builtin_cpu_supports("avx2")) {
-    return AVX2::JxlFastLosslessPrepareImpl(input, width, height, nb_chans,
-                                            bitdepth, big_endian, effort);
+    return AVX2::JxlFastLosslessPrepareImpl(
+        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
  }
 #endif

  return default_implementation::JxlFastLosslessPrepareImpl(
-      input, width, height, nb_chans, bitdepth, big_endian, effort);
+      input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
 }

 void JxlFastLosslessProcessFrame(
--- a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
@ -69,7 +69,7 @@ struct JxlFastLosslessFrameState;
 // JxlFastLosslessFreeFrameState.
 JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
    JxlChunkedFrameInputSource input, size_t width, size_t height,
-    size_t nb_chans, size_t bitdepth, int big_endian, int effort);
+    size_t nb_chans, size_t bitdepth, int big_endian, int effort, int oneshot);

 #if !FJXL_STANDALONE
 class JxlEncoderOutputProcessorWrapper;
--- a/third_party/jpeg-xl/lib/jxl/enc_fields.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_fields.cc
@ -5,6 +5,8 @@

 #include "lib/jxl/enc_fields.h"

+#include <cinttypes>
+
 #include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/fields.h"

--- a/third_party/jpeg-xl/lib/jxl/enc_frame.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_frame.cc
@ -121,8 +121,7 @@ uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
  }
 }

-Status CopyColorChannels(JxlChunkedFrameInputSource input, size_t x0, size_t y0,
-                         size_t xsize, size_t ysize,
+Status CopyColorChannels(JxlChunkedFrameInputSource input, Rect rect,
                         const FrameInfo& frame_info,
                         const ImageMetadata& metadata, ThreadPool* pool,
                         Image3F* color, ImageF* alpha,
@ -133,7 +132,8 @@ Status CopyColorChannels(JxlChunkedFrameInputSource input, size_t x0, size_t y0,
  size_t bits_per_sample =
      GetBitDepth(frame_info.image_bit_depth, metadata, format);
  size_t row_offset;
-  auto buffer = GetColorBuffer(input, x0, y0, xsize, ysize, &row_offset);
+  auto buffer = GetColorBuffer(input, rect.x0(), rect.y0(), rect.xsize(),
+                               rect.ysize(), &row_offset);
  if (!buffer) {
    return JXL_FAILURE("no buffer for color channels given");
  }
@ -145,12 +145,12 @@ Status CopyColorChannels(JxlChunkedFrameInputSource input, size_t x0, size_t y0,
                       " color channels, received only %u channels",
                       color_channels, format.num_channels);
  }
-  *color = Image3F(xsize, ysize);
+  *color = Image3F(rect.xsize(), rect.ysize());
  const uint8_t* data = reinterpret_cast<const uint8_t*>(buffer.get());
  for (size_t c = 0; c < color_channels; ++c) {
    JXL_RETURN_IF_ERROR(ConvertFromExternalNoSizeCheck(
-        data, xsize, ysize, row_offset, bits_per_sample, format, c, pool,
-        &color->Plane(c)));
+        data, rect.xsize(), rect.ysize(), row_offset, bits_per_sample, format,
+        c, pool, &color->Plane(c)));
  }
  if (color_channels == 1) {
    CopyImageTo(color->Plane(0), &color->Plane(1));
@ -159,7 +159,7 @@ Status CopyColorChannels(JxlChunkedFrameInputSource input, size_t x0, size_t y0,
  if (alpha) {
    if (*has_interleaved_alpha) {
      JXL_RETURN_IF_ERROR(ConvertFromExternalNoSizeCheck(
-          data, xsize, ysize, row_offset, bits_per_sample, format,
+          data, rect.xsize(), rect.ysize(), row_offset, bits_per_sample, format,
          format.num_channels - 1, pool, alpha));
    } else {
      // if alpha is not passed, but it is expected, then assume
@ -170,8 +170,7 @@ Status CopyColorChannels(JxlChunkedFrameInputSource input, size_t x0, size_t y0,
  return true;
 }

-Status CopyExtraChannels(JxlChunkedFrameInputSource input, size_t x0, size_t y0,
-                         size_t xsize, size_t ysize,
+Status CopyExtraChannels(JxlChunkedFrameInputSource input, Rect rect,
                         const FrameInfo& frame_info,
                         const ImageMetadata& metadata,
                         bool has_interleaved_alpha, ThreadPool* pool,
@ -189,15 +188,16 @@ Status CopyExtraChannels(JxlChunkedFrameInputSource input, size_t x0, size_t y0,
    ec_format.num_channels = 1;
    size_t row_offset;
    auto buffer =
-        GetExtraChannelBuffer(input, ec, x0, y0, xsize, ysize, &row_offset);
+        GetExtraChannelBuffer(input, ec, rect.x0(), rect.y0(), rect.xsize(),
+                              rect.ysize(), &row_offset);
    if (!buffer) {
      return JXL_FAILURE("no buffer for extra channel given");
    }
    size_t bits_per_sample = GetBitDepth(
        frame_info.image_bit_depth, metadata.extra_channel_info[ec], ec_format);
    if (!ConvertFromExternalNoSizeCheck(
-            reinterpret_cast<const uint8_t*>(buffer.get()), xsize, ysize,
-            row_offset, bits_per_sample, ec_format, 0, pool,
+            reinterpret_cast<const uint8_t*>(buffer.get()), rect.xsize(),
+            rect.ysize(), row_offset, bits_per_sample, ec_format, 0, pool,
            &(*extra_channels)[ec])) {
      return JXL_FAILURE("Failed to set buffer for extra channel");
    }
@ -258,12 +258,10 @@ Status LoopFilterFromParams(const CompressParams& cparams, bool streaming_mode,
  LoopFilter* loop_filter = &frame_header->loop_filter;

  // Gaborish defaults to enabled in Hare or slower.
-  // TODO(szabadka) Re-enable Gaborish for streaming mode.
-  loop_filter->gab =
-      ApplyOverride(cparams.gaborish,
-                    !streaming_mode && cparams.speed_tier <= SpeedTier::kHare &&
-                        frame_header->encoding == FrameEncoding::kVarDCT &&
-                        cparams.decoding_speed_tier < 4);
+  loop_filter->gab = ApplyOverride(
+      cparams.gaborish, cparams.speed_tier <= SpeedTier::kHare &&
+                            frame_header->encoding == FrameEncoding::kVarDCT &&
+                            cparams.decoding_speed_tier < 4);

  if (cparams.epf != -1) {
    loop_filter->epf_iters = cparams.epf;
@ -540,14 +538,14 @@ struct PixelStatsForChromacityAdjustment {
  float dx = 0;
  float db = 0;
  float exposed_blue = 0;
-  float CalcPlane(const ImageF* JXL_RESTRICT plane) const {
+  float CalcPlane(const ImageF* JXL_RESTRICT plane, const Rect& rect) const {
    float xmax = 0;
    float ymax = 0;
-    for (size_t ty = 1; ty < plane->ysize(); ++ty) {
-      for (size_t tx = 1; tx < plane->xsize(); ++tx) {
-        float cur = plane->Row(ty)[tx];
-        float prev_row = plane->Row(ty - 1)[tx];
-        float prev = plane->Row(ty)[tx - 1];
+    for (size_t ty = 1; ty < rect.ysize(); ++ty) {
+      for (size_t tx = 1; tx < rect.xsize(); ++tx) {
+        float cur = rect.Row(plane, ty)[tx];
+        float prev_row = rect.Row(plane, ty - 1)[tx];
+        float prev = rect.Row(plane, ty)[tx - 1];
        xmax = std::max(xmax, std::abs(cur - prev));
        ymax = std::max(ymax, std::abs(cur - prev_row));
      }
@ -555,20 +553,20 @@ struct PixelStatsForChromacityAdjustment {
    return std::max(xmax, ymax);
  }
  void CalcExposedBlue(const ImageF* JXL_RESTRICT plane_y,
-                       const ImageF* JXL_RESTRICT plane_b) {
+                       const ImageF* JXL_RESTRICT plane_b, const Rect& rect) {
    float eb = 0;
    float xmax = 0;
    float ymax = 0;
-    for (size_t ty = 1; ty < plane_y->ysize(); ++ty) {
-      for (size_t tx = 1; tx < plane_y->xsize(); ++tx) {
-        float cur_y = plane_y->Row(ty)[tx];
-        float cur_b = plane_b->Row(ty)[tx];
+    for (size_t ty = 1; ty < rect.ysize(); ++ty) {
+      for (size_t tx = 1; tx < rect.xsize(); ++tx) {
+        float cur_y = rect.Row(plane_y, ty)[tx];
+        float cur_b = rect.Row(plane_b, ty)[tx];
        float exposed_b = cur_b - cur_y * 1.2;
        float diff_b = cur_b - cur_y;
-        float prev_row = plane_b->Row(ty - 1)[tx];
-        float prev = plane_b->Row(ty)[tx - 1];
-        float diff_prev_row = prev_row - plane_y->Row(ty - 1)[tx];
-        float diff_prev = prev - plane_y->Row(ty)[tx - 1];
+        float prev_row = rect.Row(plane_b, ty - 1)[tx];
+        float prev = rect.Row(plane_b, ty)[tx - 1];
+        float diff_prev_row = prev_row - rect.Row(plane_y, ty - 1)[tx];
+        float diff_prev = prev - rect.Row(plane_y, ty)[tx - 1];
        xmax = std::max(xmax, std::abs(diff_b - diff_prev));
        ymax = std::max(ymax, std::abs(diff_b - diff_prev_row));
        if (exposed_b >= 0) {
@ -580,9 +578,9 @@ struct PixelStatsForChromacityAdjustment {
    exposed_blue = eb;
    db = std::max(xmax, ymax);
  }
-  void Calc(const Image3F* JXL_RESTRICT opsin) {
-    dx = CalcPlane(&opsin->Plane(0));
-    CalcExposedBlue(&opsin->Plane(1), &opsin->Plane(2));
+  void Calc(const Image3F* JXL_RESTRICT opsin, const Rect& rect) {
+    dx = CalcPlane(&opsin->Plane(0), rect);
+    CalcExposedBlue(&opsin->Plane(1), &opsin->Plane(2), rect);
  }
  int HowMuchIsXChannelPixelized() {
    if (dx >= 0.03) {
@ -609,7 +607,7 @@ struct PixelStatsForChromacityAdjustment {
 };

 void ComputeChromacityAdjustments(const CompressParams& cparams,
-                                  const Image3F& opsin,
+                                  const Image3F& opsin, const Rect& rect,
                                  FrameHeader* frame_header) {
  if (frame_header->encoding != FrameEncoding::kVarDCT ||
      cparams.max_error_mode) {
@ -633,7 +631,7 @@ void ComputeChromacityAdjustments(const CompressParams& cparams,
  // the image would be based on the worst case pixel.
  PixelStatsForChromacityAdjustment pixel_stats;
  if (cparams.speed_tier <= SpeedTier::kWombat) {
-    pixel_stats.Calc(&opsin);
+    pixel_stats.Calc(&opsin, rect);
  }
  // For X take the most severe adjustment.
  frame_header->x_qm_scale = std::max<int>(
@ -1041,18 +1039,19 @@ Status ComputeJPEGTranscodingData(const jpeg::JPEGData& jpeg_data,

 Status ComputeVarDCTEncodingData(const FrameHeader& frame_header,
                                 const Image3F* linear,
-                                 Image3F* JXL_RESTRICT opsin,
+                                 Image3F* JXL_RESTRICT opsin, const Rect& rect,
                                 const JxlCmsInterface& cms, ThreadPool* pool,
                                 ModularFrameEncoder* enc_modular,
                                 PassesEncoderState* enc_state,
                                 AuxOut* aux_out) {
-  JXL_ASSERT((opsin->xsize() % kBlockDim) == 0 &&
-             (opsin->ysize() % kBlockDim) == 0);
+  JXL_ASSERT((rect.xsize() % kBlockDim) == 0 &&
+             (rect.ysize() % kBlockDim) == 0);
  JXL_RETURN_IF_ERROR(LossyFrameHeuristics(frame_header, enc_state, enc_modular,
-                                           linear, opsin, cms, pool, aux_out));
+                                           linear, opsin, rect, cms, pool,
+                                           aux_out));

-  JXL_RETURN_IF_ERROR(InitializePassesEncoder(frame_header, *opsin, cms, pool,
-                                              enc_state, enc_modular, aux_out));
+  JXL_RETURN_IF_ERROR(InitializePassesEncoder(
+      frame_header, *opsin, rect, cms, pool, enc_state, enc_modular, aux_out));
  return true;
 }

@ -1364,6 +1363,7 @@ Status ComputeEncodingData(
  } else {
    shared.frame_dim = frame_header.ToFrameDimensions();
  }
+
  shared.image_features.patches.SetPassesSharedState(&shared);
  const FrameDimensions& frame_dim = shared.frame_dim;
  shared.ac_strategy =
@ -1389,6 +1389,14 @@ Status ComputeEncodingData(
  const size_t black_idx = black_eci - metadata->m.extra_channel_info.data();
  const ColorEncoding c_enc = metadata->m.color_encoding;

+  // Make the image patch bigger than the currently processed group in streaming
+  // mode so that we can take into account border pixels around the group when
+  // computing inverse Gaborish and adaptive quantization map.
+  int max_border = enc_state.streaming_mode ? kBlockDim : 0;
+  Rect frame_rect(0, 0, frame_data.xsize, frame_data.ysize);
+  Rect patch_rect = Rect(x0, y0, xsize, ysize).Extend(max_border, frame_rect);
+  JXL_ASSERT(patch_rect.IsInside(frame_rect));
+
  Image3F color;
  std::vector<ImageF> extra_channels(num_extra_channels);
  for (auto& extra_channel : extra_channels) {
@ -1399,11 +1407,11 @@ Status ComputeEncodingData(
  bool has_interleaved_alpha = false;
  JxlChunkedFrameInputSource input = frame_data.GetInputSource();
  if (!frame_data.IsJPEG()) {
-    JXL_RETURN_IF_ERROR(CopyColorChannels(input, x0, y0, xsize, ysize,
-                                          frame_info, metadata->m, pool, &color,
-                                          alpha, &has_interleaved_alpha));
+    JXL_RETURN_IF_ERROR(CopyColorChannels(input, patch_rect, frame_info,
+                                          metadata->m, pool, &color, alpha,
+                                          &has_interleaved_alpha));
  }
-  JXL_RETURN_IF_ERROR(CopyExtraChannels(input, x0, y0, xsize, ysize, frame_info,
+  JXL_RETURN_IF_ERROR(CopyExtraChannels(input, patch_rect, frame_info,
                                        metadata->m, has_interleaved_alpha,
                                        pool, &extra_channels));

@ -1416,13 +1424,14 @@ Status ComputeEncodingData(
  Image3F opsin;
  if (!jpeg_data) {
    // Allocating a large enough image avoids a copy when padding.
-    opsin = Image3F(RoundUpToBlockDim(xsize), RoundUpToBlockDim(ysize));
-    opsin.ShrinkTo(xsize, ysize);
+    opsin = Image3F(RoundUpToBlockDim(color.xsize()),
+                    RoundUpToBlockDim(color.ysize()));
+    opsin.ShrinkTo(color.xsize(), color.ysize());
    if (frame_header.color_transform == ColorTransform::kXYB &&
        frame_info.ib_needs_color_transform) {
      if (frame_header.encoding == FrameEncoding::kVarDCT &&
          cparams.speed_tier <= SpeedTier::kKitten) {
-        linear_storage = Image3F(xsize, ysize);
+        linear_storage = Image3F(color.xsize(), color.ysize());
        linear = &linear_storage;
      }
      ToXYB(color, c_enc, metadata->m.IntensityTarget(), black, pool, &opsin,
@ -1447,9 +1456,16 @@ Status ComputeEncodingData(
    }
    PadImageToBlockMultipleInPlace(&opsin);
  }
+  color = Image3F();
+
+  // Rectangle within opsin that corresponds to the currently processed group in
+  // streaming mode.
+  Rect opsin_rect(x0 - patch_rect.x0(), y0 - patch_rect.y0(),
+                  RoundUpToBlockDim(xsize), RoundUpToBlockDim(ysize));

  if (enc_state.initialize_global_state && !jpeg_data) {
-    ComputeChromacityAdjustments(cparams, opsin, &mutable_frame_header);
+    ComputeChromacityAdjustments(cparams, opsin, opsin_rect,
+                                 &mutable_frame_header);
  }

  if (!enc_state.streaming_mode) {
@ -1466,6 +1482,10 @@ Status ComputeEncodingData(
    }
  }

+  if (!enc_state.streaming_mode) {
+    opsin_rect = Rect(opsin);
+  }
+
  if (frame_header.encoding == FrameEncoding::kVarDCT) {
    enc_state.passes.resize(enc_state.progressive_splitter.GetNumPasses());
    for (PassesEncoderState::PassData& pass : enc_state.passes) {
@ -1475,9 +1495,9 @@ Status ComputeEncodingData(
      JXL_RETURN_IF_ERROR(ComputeJPEGTranscodingData(
          *jpeg_data, frame_header, pool, &enc_modular, &enc_state));
    } else {
-      JXL_RETURN_IF_ERROR(
-          ComputeVarDCTEncodingData(frame_header, linear, &opsin, cms, pool,
-                                    &enc_modular, &enc_state, aux_out));
+      JXL_RETURN_IF_ERROR(ComputeVarDCTEncodingData(
+          frame_header, linear, &opsin, opsin_rect, cms, pool, &enc_modular,
+          &enc_state, aux_out));
    }
    ComputeAllCoeffOrders(enc_state, frame_dim);
    if (!enc_state.streaming_mode) {
--- a/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
@ -15,7 +15,8 @@

 namespace jxl {

-void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool) {
+void GaborishInverse(Image3F* in_out, const Rect& rect, float mul[3],
+                     ThreadPool* pool) {
  WeightsSymmetric5 weights[3];
  // Only an approximation. One or even two 3x3, and rank-1 (separable) 5x5
  // are insufficient. The numbers here have been obtained by butteraugli
@ -48,11 +49,9 @@ void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool) {
  // image and reuse the existing planes of the in/out image.
  ImageF temp(in_out->Plane(2).xsize(), in_out->Plane(2).ysize());
  CopyImageTo(in_out->Plane(2), &temp);
-  Symmetric5(in_out->Plane(0), Rect(*in_out), weights[0], pool,
-             &in_out->Plane(2));
-  Symmetric5(in_out->Plane(1), Rect(*in_out), weights[1], pool,
-             &in_out->Plane(0));
-  Symmetric5(temp, Rect(*in_out), weights[2], pool, &in_out->Plane(1));
+  Symmetric5(in_out->Plane(0), rect, weights[0], pool, &in_out->Plane(2), rect);
+  Symmetric5(in_out->Plane(1), rect, weights[1], pool, &in_out->Plane(0), rect);
+  Symmetric5(temp, rect, weights[2], pool, &in_out->Plane(1), rect);
  // Now planes are 1, 2, 0.
  in_out->Plane(0).Swap(in_out->Plane(1));
  // 2 1 0
--- a/third_party/jpeg-xl/lib/jxl/enc_gaborish.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.h
@ -19,7 +19,8 @@ namespace jxl {
 // Used in encoder to reduce the impact of the decoder's smoothing.
 // This is not exact. Works in-place to reduce memory use.
 // The input is typically in XYB space.
-void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool);
+void GaborishInverse(Image3F* in_out, const Rect& rect, float mul[3],
+                     ThreadPool* pool);

 }  // namespace jxl

--- a/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
@ -47,7 +47,7 @@ void TestRoundTrip(const Image3F& in, float max_l1) {
      w,
      w,
  };
-  GaborishInverse(&fwd, weights, null_pool);
+  GaborishInverse(&fwd, Rect(fwd), weights, null_pool);
  JXL_ASSERT_OK(VerifyRelativeError(in, fwd, max_l1, 1E-4f, _));
 }

--- a/third_party/jpeg-xl/lib/jxl/enc_group.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_group.cc
@ -357,15 +357,17 @@ void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size,
 }

 void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
-                         const Image3F& opsin, Image3F* dc) {
+                         const Image3F& opsin, const Rect& rect, Image3F* dc) {
  const Rect block_group_rect =
      enc_state->shared.frame_dim.BlockGroupRect(group_idx);
-  const Rect group_rect = enc_state->shared.frame_dim.GroupRect(group_idx);
  const Rect cmap_rect(
      block_group_rect.x0() / kColorTileDimInBlocks,
      block_group_rect.y0() / kColorTileDimInBlocks,
      DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks),
      DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks));
+  const Rect group_rect =
+      enc_state->shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(),
+                                                                 rect.y0());

  const size_t xsize_blocks = block_group_rect.xsize();
  const size_t ysize_blocks = block_group_rect.ysize();
@ -504,9 +506,9 @@ HWY_AFTER_NAMESPACE();
 namespace jxl {
 HWY_EXPORT(ComputeCoefficients);
 void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
-                         const Image3F& opsin, Image3F* dc) {
+                         const Image3F& opsin, const Rect& rect, Image3F* dc) {
  return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin,
-                                                   dc);
+                                                   rect, dc);
 }

 Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
--- a/third_party/jpeg-xl/lib/jxl/enc_group.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_group.h
@ -20,7 +20,7 @@ struct PassesEncoderState;

 // Fills DC
 void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
-                         const Image3F& opsin, Image3F* dc);
+                         const Image3F& opsin, const Rect& rect, Image3F* dc);

 Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
                                        size_t histogram_idx,
--- a/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
@ -30,8 +30,10 @@ namespace jxl {

 struct AuxOut;

-void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
-  if (enc_state.cparams.decoding_speed_tier >= 1) {
+void FindBestBlockEntropyModel(const CompressParams& cparams, const ImageI& rqf,
+                               const AcStrategyImage& ac_strategy,
+                               BlockCtxMap* block_ctx_map) {
+  if (cparams.decoding_speed_tier >= 1) {
    static constexpr uint8_t kSimpleCtxMap[] = {
        // Cluster all blocks together
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  //
@ -42,20 +44,18 @@ void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
        3 * kNumOrders == sizeof(kSimpleCtxMap) / sizeof *kSimpleCtxMap,
        "Update simple context map");

-    auto bcm = enc_state.shared.block_ctx_map;
+    auto bcm = *block_ctx_map;
    bcm.ctx_map.assign(std::begin(kSimpleCtxMap), std::end(kSimpleCtxMap));
    bcm.num_ctxs = 2;
    bcm.num_dc_ctxs = 1;
    return;
  }
-  if (enc_state.cparams.speed_tier >= SpeedTier::kFalcon) {
+  if (cparams.speed_tier >= SpeedTier::kFalcon) {
    return;
  }
-  const ImageI& rqf = enc_state.shared.raw_quant_field;
  // No need to change context modeling for small images.
  size_t tot = rqf.xsize() * rqf.ysize();
-  size_t size_for_ctx_model =
-      (1 << 10) * enc_state.cparams.butteraugli_distance;
+  size_t size_for_ctx_model = (1 << 10) * cparams.butteraugli_distance;
  if (tot < size_for_ctx_model) return;

  struct OccCounters {
@ -79,14 +79,13 @@ void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
    size_t ord_counts[kNumOrders] = {};
  };
  // The OccCounters struct is too big to allocate on the stack.
-  std::unique_ptr<OccCounters> counters(
-      new OccCounters(rqf, enc_state.shared.ac_strategy));
+  std::unique_ptr<OccCounters> counters(new OccCounters(rqf, ac_strategy));

  // Splitting the context model according to the quantization field seems to
  // mostly benefit only large images.
-  size_t size_for_qf_split = (1 << 13) * enc_state.cparams.butteraugli_distance;
+  size_t size_for_qf_split = (1 << 13) * cparams.butteraugli_distance;
  size_t num_qf_segments = tot < size_for_qf_split ? 1 : 2;
-  std::vector<uint32_t>& qft = enc_state.shared.block_ctx_map.qf_thresholds;
+  std::vector<uint32_t>& qft = block_ctx_map->qf_thresholds;
  qft.clear();
  // Divide the quant field in up to num_qf_segments segments.
  size_t cumsum = 0;
@ -153,7 +152,7 @@ void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
    remap[i] = remap_remap[remap[i]];
  }
  // Write the block context map.
-  auto& ctx_map = enc_state.shared.block_ctx_map.ctx_map;
+  auto& ctx_map = block_ctx_map->ctx_map;
  ctx_map = remap;
  ctx_map.resize(remap.size() * 3);
  // for chroma, only use up to nb_clusters_chroma separate block contexts
@ -162,14 +161,13 @@ void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
    ctx_map[i] = num + Clamp1((int)remap[i % remap.size()], 0,
                              (int)nb_clusters_chroma - 1);
  }
-  enc_state.shared.block_ctx_map.num_ctxs =
+  block_ctx_map->num_ctxs =
      *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
 }

 namespace {

 void FindBestDequantMatrices(const CompressParams& cparams,
-                             const Image3F& opsin,
                             ModularFrameEncoder* modular_frame_encoder,
                             DequantMatrices* dequant_matrices) {
  // TODO(veluca): quant matrices for no-gaborish.
@ -701,38 +699,47 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
                            PassesEncoderState* enc_state,
                            ModularFrameEncoder* modular_frame_encoder,
                            const Image3F* original_pixels, Image3F* opsin,
-                            const JxlCmsInterface& cms, ThreadPool* pool,
-                            AuxOut* aux_out) {
-  CompressParams& cparams = enc_state->cparams;
+                            const Rect& rect, const JxlCmsInterface& cms,
+                            ThreadPool* pool, AuxOut* aux_out) {
+  const CompressParams& cparams = enc_state->cparams;
+  const bool streaming_mode = enc_state->streaming_mode;
+  const bool initialize_global_state = enc_state->initialize_global_state;
  PassesSharedState& shared = enc_state->shared;
+  const FrameDimensions& frame_dim = shared.frame_dim;
+  ImageFeatures& image_features = shared.image_features;
+  DequantMatrices& matrices = shared.matrices;
+  Quantizer& quantizer = shared.quantizer;
+  ImageI& raw_quant_field = shared.raw_quant_field;
+  ColorCorrelationMap& cmap = shared.cmap;
+  AcStrategyImage& ac_strategy = shared.ac_strategy;
+  ImageB& epf_sharpness = shared.epf_sharpness;
+  BlockCtxMap& block_ctx_map = shared.block_ctx_map;

  // Find and subtract splines.
-  if (!enc_state->streaming_mode &&
-      cparams.speed_tier <= SpeedTier::kSquirrel) {
+  if (!streaming_mode && cparams.speed_tier <= SpeedTier::kSquirrel) {
    if (cparams.custom_splines.HasAny()) {
-      shared.image_features.splines = cparams.custom_splines;
+      image_features.splines = cparams.custom_splines;
    } else {
-      shared.image_features.splines = FindSplines(*opsin);
+      image_features.splines = FindSplines(*opsin);
    }
-    JXL_RETURN_IF_ERROR(shared.image_features.splines.InitializeDrawCache(
-        opsin->xsize(), opsin->ysize(), shared.cmap));
-    shared.image_features.splines.SubtractFrom(opsin);
+    JXL_RETURN_IF_ERROR(image_features.splines.InitializeDrawCache(
+        opsin->xsize(), opsin->ysize(), cmap));
+    image_features.splines.SubtractFrom(opsin);
  }

  // Find and subtract patches/dots.
-  if (!enc_state->streaming_mode &&
+  if (!streaming_mode &&
      ApplyOverride(cparams.patches,
                    cparams.speed_tier <= SpeedTier::kSquirrel)) {
    FindBestPatchDictionary(*opsin, enc_state, cms, pool, aux_out);
-    PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin);
+    PatchDictionaryEncoder::SubtractFrom(image_features.patches, opsin);
  }

  static const float kAcQuant = 0.79f;
  const float quant_dc = InitialQuantDC(cparams.butteraugli_distance);
-  Quantizer& quantizer = enc_state->shared.quantizer;
  // We don't know the quant field yet, but for computing the global scale
  // assuming that it will be the same as for Falcon mode is good enough.
-  if (enc_state->initialize_global_state) {
+  if (initialize_global_state) {
    quantizer.ComputeGlobalScaleAndQuant(
        quant_dc, kAcQuant / cparams.butteraugli_distance, 0);
  }
@ -757,34 +764,35 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
  // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field.

  ArControlFieldHeuristics ar_heuristics;
-  AcStrategyHeuristics acs_heuristics;
+  AcStrategyHeuristics acs_heuristics(cparams);
  CfLHeuristics cfl_heuristics;
+  ImageF initial_quant_field;
+  ImageF initial_quant_masking;
+  ImageF initial_quant_masking1x1;

  // Compute an initial estimate of the quantization field.
  // Call InitialQuantField only in Hare mode or slower. Otherwise, rely
  // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon
  // mode.
  if (cparams.speed_tier > SpeedTier::kHare) {
-    enc_state->initial_quant_field =
-        ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
-    enc_state->initial_quant_masking =
-        ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+    initial_quant_field =
+        ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+    initial_quant_masking =
+        ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
    float q = kAcQuant / cparams.butteraugli_distance;
-    FillImage(q, &enc_state->initial_quant_field);
-    FillImage(1.0f / (q + 0.001f), &enc_state->initial_quant_masking);
+    FillImage(q, &initial_quant_field);
+    FillImage(1.0f / (q + 0.001f), &initial_quant_masking);
  } else {
    // Call this here, as it relies on pre-gaborish values.
    float butteraugli_distance_for_iqf = cparams.butteraugli_distance;
    if (!frame_header.loop_filter.gab) {
      butteraugli_distance_for_iqf *= 0.73f;
    }
-    enc_state->initial_quant_field = InitialQuantField(
-        butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, 1.0f,
-        &enc_state->initial_quant_masking,
-        &enc_state->initial_quant_masking1x1);
-    if (enc_state->initialize_global_state) {
-      quantizer.SetQuantField(quant_dc, enc_state->initial_quant_field,
-                              nullptr);
+    initial_quant_field = InitialQuantField(
+        butteraugli_distance_for_iqf, *opsin, rect, pool, 1.0f,
+        &initial_quant_masking, &initial_quant_masking1x1);
+    if (initialize_global_state) {
+      quantizer.SetQuantField(quant_dc, initial_quant_field, nullptr);
    }
  }

@ -798,71 +806,67 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
        0.99406123118127299f,
        0.99719338015886894f,
    };
-    GaborishInverse(opsin, weight, pool);
+    GaborishInverse(opsin, rect, weight, pool);
  }

-  if (enc_state->initialize_global_state) {
-    FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder,
-                            &enc_state->shared.matrices);
+  if (initialize_global_state) {
+    FindBestDequantMatrices(cparams, modular_frame_encoder, &matrices);
  }

-  cfl_heuristics.Init(*opsin);
-  acs_heuristics.Init(*opsin, enc_state);
+  cfl_heuristics.Init(rect);
+  acs_heuristics.Init(*opsin, rect, initial_quant_field, initial_quant_masking,
+                      initial_quant_masking1x1, &matrices);

  auto process_tile = [&](const uint32_t tid, const size_t thread) {
-    size_t n_enc_tiles =
-        DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks);
+    size_t n_enc_tiles = DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks);
    size_t tx = tid % n_enc_tiles;
    size_t ty = tid / n_enc_tiles;
    size_t by0 = ty * kEncTileDimInBlocks;
-    size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks,
-                          enc_state->shared.frame_dim.ysize_blocks);
+    size_t by1 =
+        std::min((ty + 1) * kEncTileDimInBlocks, frame_dim.ysize_blocks);
    size_t bx0 = tx * kEncTileDimInBlocks;
-    size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks,
-                          enc_state->shared.frame_dim.xsize_blocks);
+    size_t bx1 =
+        std::min((tx + 1) * kEncTileDimInBlocks, frame_dim.xsize_blocks);
    Rect r(bx0, by0, bx1 - bx0, by1 - by0);

    // For speeds up to Wombat, we only compute the color correlation map
    // once we know the transform type and the quantization map.
    if (cparams.speed_tier <= SpeedTier::kSquirrel) {
-      cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices,
+      cfl_heuristics.ComputeTile(r, *opsin, rect, matrices,
                                 /*ac_strategy=*/nullptr,
                                 /*raw_quant_field=*/nullptr,
                                 /*quantizer=*/nullptr, /*fast=*/false, thread,
-                                 &enc_state->shared.cmap);
+                                 &cmap);
    }

    // Choose block sizes.
-    acs_heuristics.ProcessRect(r);
+    acs_heuristics.ProcessRect(r, cmap, &ac_strategy);

    // Choose amount of post-processing smoothing.
    // TODO(veluca): should this go *after* AdjustQuantField?
-    ar_heuristics.RunRect(frame_header, r, *opsin, enc_state, thread);
+    ar_heuristics.RunRect(cparams, frame_header, r, *opsin, rect,
+                          initial_quant_field, ac_strategy, &epf_sharpness,
+                          thread);

    // Always set the initial quant field, so we can compute the CfL map with
    // more accuracy. The initial quant field might change in slower modes, but
    // adjusting the quant field with butteraugli when all the other encoding
    // parameters are fixed is likely a more reliable choice anyway.
-    AdjustQuantField(enc_state->shared.ac_strategy, r,
-                     cparams.butteraugli_distance,
-                     &enc_state->initial_quant_field);
-    quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r,
-                                &enc_state->shared.raw_quant_field);
+    AdjustQuantField(ac_strategy, r, cparams.butteraugli_distance,
+                     &initial_quant_field);
+    quantizer.SetQuantFieldRect(initial_quant_field, r, &raw_quant_field);

    // Compute a non-default CfL map if we are at Hare speed, or slower.
    if (cparams.speed_tier <= SpeedTier::kHare) {
      cfl_heuristics.ComputeTile(
-          r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy,
-          &enc_state->shared.raw_quant_field, &enc_state->shared.quantizer,
-          /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread,
-          &enc_state->shared.cmap);
+          r, *opsin, rect, matrices, &ac_strategy, &raw_quant_field, &quantizer,
+          /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread, &cmap);
    }
  };
  JXL_RETURN_IF_ERROR(RunOnPool(
      pool, 0,
-      DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) *
-          DivCeil(enc_state->shared.frame_dim.ysize_blocks,
-                  kEncTileDimInBlocks),
+      DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks) *
+          DivCeil(frame_dim.ysize_blocks, kEncTileDimInBlocks),
      [&](const size_t num_threads) {
        ar_heuristics.PrepareForThreads(num_threads);
        cfl_heuristics.PrepareForThreads(num_threads);
@ -870,23 +874,22 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
      },
      process_tile, "Enc Heuristics"));

-  acs_heuristics.Finalize(aux_out);
-  if (cparams.speed_tier <= SpeedTier::kHare &&
-      enc_state->initialize_global_state) {
+  acs_heuristics.Finalize(frame_dim, ac_strategy, aux_out);
+  if (cparams.speed_tier <= SpeedTier::kHare && initialize_global_state) {
    cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat,
-                             &enc_state->shared.cmap);
+                             &cmap);
  }

  // Refine quantization levels.
-  if (!enc_state->streaming_mode) {
-    FindBestQuantizer(frame_header, original_pixels, *opsin, enc_state, cms,
-                      pool, aux_out);
+  if (!streaming_mode) {
+    FindBestQuantizer(frame_header, original_pixels, *opsin,
+                      initial_quant_field, enc_state, cms, pool, aux_out);
  }

  // Choose a context model that depends on the amount of quantization for AC.
-  if (cparams.speed_tier < SpeedTier::kFalcon &&
-      enc_state->initialize_global_state) {
-    FindBestBlockEntropyModel(*enc_state);
+  if (cparams.speed_tier < SpeedTier::kFalcon && initialize_global_state) {
+    FindBestBlockEntropyModel(cparams, raw_quant_field, ac_strategy,
+                              &block_ctx_map);
  }
  return true;
 }
--- a/third_party/jpeg-xl/lib/jxl/enc_heuristics.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.h
@ -37,8 +37,8 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
                            PassesEncoderState* enc_state,
                            ModularFrameEncoder* modular_frame_encoder,
                            const Image3F* original_pixels, Image3F* opsin,
-                            const JxlCmsInterface& cms, ThreadPool* pool,
-                            AuxOut* aux_out);
+                            const Rect& rect, const JxlCmsInterface& cms,
+                            ThreadPool* pool, AuxOut* aux_out);

 void FindBestBlockEntropyModel(PassesEncoderState& enc_state);

--- a/third_party/jpeg-xl/lib/jxl/enc_modular.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_modular.cc
@ -490,7 +490,7 @@ Status ModularFrameEncoder::ComputeEncodingData(
  if (do_color && frame_header.loop_filter.gab) {
    float w = 0.9908511000000001f;
    float weights[3] = {w, w, w};
-    GaborishInverse(color, weights, pool);
+    GaborishInverse(color, Rect(*color), weights, pool);
  }

  if (do_color && metadata.bit_depth.bits_per_sample <= 16 &&
--- a/third_party/jpeg-xl/lib/jxl/encode.cc
+++ b/third_party/jpeg-xl/lib/jxl/encode.cc
@ -17,6 +17,7 @@
 #include "lib/jxl/base/byte_order.h"
 #include "lib/jxl/base/common.h"
 #include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/exif.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
@ -31,7 +32,6 @@
 #include "lib/jxl/enc_icc_codec.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/encode_internal.h"
-#include "lib/jxl/exif.h"
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
 #include "lib/jxl/luminance.h"
 #include "lib/jxl/memory_manager_internal.h"
@ -2252,7 +2252,7 @@ JxlEncoderStatus JxlEncoderAddImageFrameInternal(
    auto frame_state = JxlFastLosslessPrepareFrame(
        frame_data.GetInputSource(), xsize, ysize, num_channels,
        frame_settings->enc->metadata.m.bit_depth.bits_per_sample, big_endian,
-        /*effort=*/2);
+        /*effort=*/2, /*oneshot=*/!frame_data.StreamingInput());
    if (!streaming) {
      JxlFastLosslessProcessFrame(frame_state, /*is_last=*/false,
                                  frame_settings->enc->thread_pool.get(),
--- a/third_party/jpeg-xl/lib/jxl/encode_internal.h
+++ b/third_party/jpeg-xl/lib/jxl/encode_internal.h
@ -267,6 +267,8 @@ class JxlEncoderChunkedFrameAdapter {
    return true;
  }

+  bool StreamingInput() const { return has_input_source_; }
+
  const size_t xsize;
  const size_t ysize;

--- a/third_party/jpeg-xl/lib/jxl/encode_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/encode_test.cc
@ -2009,3 +2009,64 @@ TEST_P(EncoderStreamingTest, ChunkedAndOutputCallback) {
 JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
    EncoderStreamingTest, EncoderStreamingTest,
    testing::ValuesIn(StreamingTestParam::All()));
+
+TEST(EncoderTest, CMYK) {
+  size_t xsize = 257;
+  size_t ysize = 259;
+  jxl::test::TestImage image;
+  image.SetDimensions(xsize, ysize)
+      .SetDataType(JXL_TYPE_UINT8)
+      .SetChannels(3)
+      .SetAllBitDepths(8);
+  image.AddFrame().RandomFill();
+  jxl::test::TestImage ec_image;
+  ec_image.SetDataType(JXL_TYPE_UINT8)
+      .SetDimensions(xsize, ysize)
+      .SetChannels(1)
+      .SetAllBitDepths(8);
+  ec_image.AddFrame().RandomFill();
+  const auto& frame = image.ppf().frames[0].color;
+  const auto& ec_frame = ec_image.ppf().frames[0].color;
+  JxlBasicInfo basic_info = image.ppf().info;
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.num_extra_channels = 1;
+  basic_info.uses_original_profile = JXL_TRUE;
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  JxlEncoderPtr enc_ptr = JxlEncoderMake(nullptr);
+  JxlEncoderStruct* enc = enc_ptr.get();
+  ASSERT_NE(nullptr, enc);
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc, NULL);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  JxlExtraChannelInfo channel_info;
+  JxlExtraChannelType channel_type = JXL_CHANNEL_BLACK;
+  JxlEncoderInitExtraChannelInfo(channel_type, &channel_info);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetExtraChannelInfo(enc, 0, &channel_info));
+  const std::vector<uint8_t> icc = jxl::test::ReadTestData(
+      "external/Compact-ICC-Profiles/profiles/"
+      "CGATS001Compat-v2-micro.icc");
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetICCProfile(enc, icc.data(), icc.size()));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &frame.format,
+                                    frame.pixels(), frame.pixels_size));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetExtraChannelBuffer(
+                                 frame_settings, &ec_frame.format,
+                                 ec_frame.pixels(), ec_frame.pixels_size, 0));
+  JxlEncoderCloseInput(frame_settings->enc);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size();
+  ProcessEncoder(enc, compressed, next_out, avail_out);
+
+  jxl::extras::JXLDecompressParams dparams;
+  dparams.accepted_formats = {
+      {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0},
+  };
+  jxl::extras::PackedPixelFile ppf;
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, &ppf, nullptr));
+}
--- a/third_party/jpeg-xl/lib/jxl/fields.cc
+++ b/third_party/jpeg-xl/lib/jxl/fields.cc
@ -5,10 +5,10 @@

 #include "lib/jxl/fields.h"

-#include <stddef.h>
-
 #include <algorithm>
+#include <cinttypes>
 #include <cmath>
+#include <cstddef>
 #include <hwy/base.h>

 #include "lib/jxl/base/bits.h"
--- a/third_party/jpeg-xl/lib/jxl/fields.h
+++ b/third_party/jpeg-xl/lib/jxl/fields.h
@ -8,15 +8,12 @@

 // Forward/backward-compatible 'bundles' with auto-serialized 'fields'.

-#include <inttypes.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <cinttypes>
 #include <cmath>  // abs
 #include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>

 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
--- a/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc
@ -81,8 +81,8 @@ void AppendKeyword(const Tag& keyword, PaddedBytes* data) {
 }

 // Checks if a + b > size, taking possible integer overflow into account.
-Status CheckOutOfBounds(size_t a, size_t b, size_t size) {
-  size_t pos = a + b;
+Status CheckOutOfBounds(uint64_t a, uint64_t b, uint64_t size) {
+  uint64_t pos = a + b;
  if (pos > size) return JXL_FAILURE("Out of bounds");
  if (pos < a) return JXL_FAILURE("Out of bounds");  // overflow happened
  return true;
--- a/third_party/jpeg-xl/lib/jxl/icc_codec_common.h
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec_common.h
@ -92,7 +92,7 @@ void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos);
 void AppendKeyword(const Tag& keyword, PaddedBytes* data);

 // Checks if a + b > size, taking possible integer overflow into account.
-Status CheckOutOfBounds(size_t a, size_t b, size_t size);
+Status CheckOutOfBounds(uint64_t a, uint64_t b, uint64_t size);
 Status CheckIs32Bit(uint64_t v);

 const Span<const uint8_t> ICCInitialHeaderPrediction();
--- a/third_party/jpeg-xl/lib/jxl/image.h
+++ b/third_party/jpeg-xl/lib/jxl/image.h
@ -8,7 +8,11 @@

 // SIMD/multicore-friendly planar image representation with row accessors.

+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
 #include <inttypes.h>
+#endif
+
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
@ -324,6 +328,14 @@ class RectT {
    return CeilShiftRight(shift, shift);
  }

+  RectT<T> Extend(T border, RectT<T> parent) {
+    T new_x0 = x0() > parent.x0() + border ? x0() - border : parent.x0();
+    T new_y0 = y0() > parent.y0() + border ? y0() - border : parent.y0();
+    T new_x1 = x1() + border > parent.x1() ? parent.x1() : x1() + border;
+    T new_y1 = y1() + border > parent.y1() ? parent.y1() : y1() + border;
+    return RectT<T>(new_x0, new_y0, new_x1 - new_x0, new_y1 - new_y0);
+  }
+
  template <typename U>
  RectT<U> As() const {
    return RectT<U>(U(x0_), U(y0_), U(xsize_), U(ysize_));
--- a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc
@ -5,8 +5,9 @@

 #include "lib/jxl/modular/encoding/enc_debug_tree.h"

-#include <stdint.h>
-#include <stdlib.h>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdlib>

 #include "lib/jxl/base/os_macros.h"
 #include "lib/jxl/base/printf_macros.h"
--- a/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc
@ -5,6 +5,8 @@

 #include "lib/jxl/modular/transform/transform.h"

+#include <cinttypes>
+
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/modular/modular_image.h"
--- a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
@ -5,6 +5,8 @@

 #include "lib/jxl/render_pipeline/stage_write.h"

+#include <type_traits>
+
 #include "lib/jxl/alpha.h"
 #include "lib/jxl/base/common.h"
 #include "lib/jxl/dec_cache.h"
@ -21,6 +23,7 @@ namespace jxl {
 namespace HWY_NAMESPACE {

 // These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
 using hwy::HWY_NAMESPACE::Clamp;
 using hwy::HWY_NAMESPACE::Div;
 using hwy::HWY_NAMESPACE::Max;
@ -30,6 +33,78 @@ using hwy::HWY_NAMESPACE::Or;
 using hwy::HWY_NAMESPACE::Rebind;
 using hwy::HWY_NAMESPACE::ShiftLeftSame;
 using hwy::HWY_NAMESPACE::ShiftRightSame;
+using hwy::HWY_NAMESPACE::VFromD;
+
+// 8x8 ordered dithering pattern from
+// https://en.wikipedia.org/wiki/Ordered_dithering
+// scaled to have an average of 0 and be fully contained in (-0.5, 0.5).
+// Matrix is duplicated in width to avoid inconsistencies or out-of-bound-reads
+// if doing unaligned operations.
+const float kDither[(2 * 8) * 8] = {
+    -0.4921875, 0.0078125, -0.3671875, 0.1328125,  //
+    -0.4609375, 0.0390625, -0.3359375, 0.1640625,  //
+    -0.4921875, 0.0078125, -0.3671875, 0.1328125,  //
+    -0.4609375, 0.0390625, -0.3359375, 0.1640625,  //
+                                                   //
+    0.2578125, -0.2421875, 0.3828125, -0.1171875,  //
+    0.2890625, -0.2109375, 0.4140625, -0.0859375,  //
+    0.2578125, -0.2421875, 0.3828125, -0.1171875,  //
+    0.2890625, -0.2109375, 0.4140625, -0.0859375,  //
+                                                   //
+    -0.3046875, 0.1953125, -0.4296875, 0.0703125,  //
+    -0.2734375, 0.2265625, -0.3984375, 0.1015625,  //
+    -0.3046875, 0.1953125, -0.4296875, 0.0703125,  //
+    -0.2734375, 0.2265625, -0.3984375, 0.1015625,  //
+                                                   //
+    0.4453125, -0.0546875, 0.3203125, -0.1796875,  //
+    0.4765625, -0.0234375, 0.3515625, -0.1484375,  //
+    0.4453125, -0.0546875, 0.3203125, -0.1796875,  //
+    0.4765625, -0.0234375, 0.3515625, -0.1484375,  //
+                                                   //
+    -0.4453125, 0.0546875, -0.3203125, 0.1796875,  //
+    -0.4765625, 0.0234375, -0.3515625, 0.1484375,  //
+    -0.4453125, 0.0546875, -0.3203125, 0.1796875,  //
+    -0.4765625, 0.0234375, -0.3515625, 0.1484375,  //
+                                                   //
+    0.3046875, -0.1953125, 0.4296875, -0.0703125,  //
+    0.2734375, -0.2265625, 0.3984375, -0.1015625,  //
+    0.3046875, -0.1953125, 0.4296875, -0.0703125,  //
+    0.2734375, -0.2265625, 0.3984375, -0.1015625,  //
+                                                   //
+    -0.2578125, 0.2421875, -0.3828125, 0.1171875,  //
+    -0.2890625, 0.2109375, -0.4140625, 0.0859375,  //
+    -0.2578125, 0.2421875, -0.3828125, 0.1171875,  //
+    -0.2890625, 0.2109375, -0.4140625, 0.0859375,  //
+                                                   //
+    0.4921875, -0.0078125, 0.3671875, -0.1328125,  //
+    0.4609375, -0.0390625, 0.3359375, -0.1640625,  //
+    0.4921875, -0.0078125, 0.3671875, -0.1328125,  //
+    0.4609375, -0.0390625, 0.3359375, -0.1640625,  //
+};
+
+using DF = HWY_FULL(float);
+
+// Converts `v` to an appropriate value for the given unsigned type.
+// If the unsigned type is an 8-bit type, performs ordered dithering.
+template <typename T>
+VFromD<Rebind<T, DF>> MakeUnsigned(VFromD<DF> v, size_t x0, size_t y0,
+                                   VFromD<DF> mul) {
+  static_assert(std::is_unsigned<T>::value, "T must be an unsigned type");
+  using DU = Rebind<T, DF>;
+  v = Mul(v, mul);
+  // TODO(veluca): if constexpr with C++17
+  if (sizeof(T) == 1) {
+    size_t pos = (y0 % 8) * (2 * 8) + (x0 % 8);
+#if HWY_TARGET != HWY_SCALAR
+    auto dither = LoadDup128(DF(), kDither + pos);
+#else
+    auto dither = LoadU(DF(), kDither + pos);
+#endif
+    v = Add(v, dither);
+  }
+  v = Clamp(Zero(DF()), v, mul);
+  return DemoteTo(DU(), NearestInt(v));
+}

 class WriteToOutputStage : public RenderPipelineStage {
 public:
@ -229,14 +304,14 @@ class WriteToOutputStage : public RenderPipelineStage {
    if (out.data_type_ == JXL_TYPE_UINT8) {
      uint8_t* JXL_RESTRICT temp =
          reinterpret_cast<uint8_t*>(temp_out_[thread_id].get());
-      StoreUnsignedRow(out, input, len, temp);
+      StoreUnsignedRow(out, input, len, temp, xstart, ypos);
      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
    } else if (out.data_type_ == JXL_TYPE_UINT16 ||
               out.data_type_ == JXL_TYPE_FLOAT16) {
      uint16_t* JXL_RESTRICT temp =
          reinterpret_cast<uint16_t*>(temp_out_[thread_id].get());
      if (out.data_type_ == JXL_TYPE_UINT16) {
-        StoreUnsignedRow(out, input, len, temp);
+        StoreUnsignedRow(out, input, len, temp, xstart, ypos);
      } else {
        StoreFloat16Row(out, input, len, temp);
      }
@ -289,10 +364,8 @@ class WriteToOutputStage : public RenderPipelineStage {

  template <typename T>
  void StoreUnsignedRow(const Output& out, const float* input[4], size_t len,
-                        T* output) const {
+                        T* output, size_t xstart, size_t ypos) const {
    const HWY_FULL(float) d;
-    auto zero = Zero(d);
-    auto one = Set(d, 1.0f);
    auto mul = Set(d, (1u << (out.bits_per_sample_)) - 1);
    const Rebind<T, decltype(d)> du;
    const size_t padding = RoundUpTo(len, Lanes(d)) - len;
@ -301,35 +374,32 @@ class WriteToOutputStage : public RenderPipelineStage {
    }
    if (out.num_channels_ == 1) {
      for (size_t i = 0; i < len; i += Lanes(d)) {
-        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
-        StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
+        StoreU(MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul),
+               du, &output[i]);
      }
    } else if (out.num_channels_ == 2) {
      for (size_t i = 0; i < len; i += Lanes(d)) {
-        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
-        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
-        StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
-                          DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
+        StoreInterleaved2(
+            MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul),
+            MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul), du,
+            &output[2 * i]);
      }
    } else if (out.num_channels_ == 3) {
      for (size_t i = 0; i < len; i += Lanes(d)) {
-        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
-        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
-        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
-        StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
-                          DemoteTo(du, NearestInt(v1)),
-                          DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
+        StoreInterleaved3(
+            MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul),
+            MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul),
+            MakeUnsigned<T>(LoadU(d, &input[2][i]), xstart + i, ypos, mul), du,
+            &output[3 * i]);
      }
    } else if (out.num_channels_ == 4) {
      for (size_t i = 0; i < len; i += Lanes(d)) {
-        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
-        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
-        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
-        auto v3 = Mul(Clamp(zero, LoadU(d, &input[3][i]), one), mul);
-        StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
-                          DemoteTo(du, NearestInt(v1)),
-                          DemoteTo(du, NearestInt(v2)),
-                          DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
+        StoreInterleaved4(
+            MakeUnsigned<T>(LoadU(d, &input[0][i]), xstart + i, ypos, mul),
+            MakeUnsigned<T>(LoadU(d, &input[1][i]), xstart + i, ypos, mul),
+            MakeUnsigned<T>(LoadU(d, &input[2][i]), xstart + i, ypos, mul),
+            MakeUnsigned<T>(LoadU(d, &input[3][i]), xstart + i, ypos, mul), du,
+            &output[4 * i]);
      }
    }
    msan::PoisonMemory(output + out.num_channels_ * len,
--- a/third_party/jpeg-xl/lib/jxl/sanitizers.h
+++ b/third_party/jpeg-xl/lib/jxl/sanitizers.h
@ -6,7 +6,6 @@
 #ifndef LIB_JXL_SANITIZERS_H_
 #define LIB_JXL_SANITIZERS_H_

-#include <inttypes.h>
 #include <stddef.h>

 #include "lib/jxl/base/compiler_specific.h"
@ -14,6 +13,7 @@
 #include "lib/jxl/image.h"

 #if JXL_MEMORY_SANITIZER
+#include <inttypes.h>
 #include <stdio.h>

 #include <algorithm>
--- a/third_party/jpeg-xl/lib/jxl/splines.cc
+++ b/third_party/jpeg-xl/lib/jxl/splines.cc
@ -6,6 +6,7 @@
 #include "lib/jxl/splines.h"

 #include <algorithm>
+#include <cinttypes>
 #include <cmath>
 #include <limits>

--- a/third_party/jpeg-xl/lib/jxl/test_utils.cc
+++ b/third_party/jpeg-xl/lib/jxl/test_utils.cc
@ -7,6 +7,7 @@

 #include <jxl/cms.h>
 #include <jxl/cms_interface.h>
+#include <jxl/types.h>

 #include <cstddef>
 #include <fstream>
@ -487,6 +488,10 @@ size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize,
    // TODO(lode): Set the required precision back to 11 bits when possible.
    precision = 0.5 * threshold_multiplier / ((1ull << (bits - 1)) - 1ull);
  }
+  if (format_b.data_type == JXL_TYPE_UINT8) {
+    // Increase the threshold by the maximum difference introduced by dithering.
+    precision += 63.0 / 128.0;
+  }
  size_t numdiff = 0;
  for (size_t y = 0; y < ysize; y++) {
    for (size_t x = 0; x < xsize; x++) {
--- a/third_party/jpeg-xl/lib/jxl_lists.bzl
+++ b/third_party/jpeg-xl/lib/jxl_lists.bzl
@ -14,6 +14,7 @@ libjxl_base_sources = [
    "jxl/base/common.h",
    "jxl/base/compiler_specific.h",
    "jxl/base/data_parallel.h",
+    "jxl/base/exif.h",
    "jxl/base/fast_math-inl.h",
    "jxl/base/float.h",
    "jxl/base/iaca.h",
@ -185,7 +186,6 @@ libjxl_dec_sources = [
    "jxl/entropy_coder.h",
    "jxl/epf.cc",
    "jxl/epf.h",
-    "jxl/exif.h",
    "jxl/fast_dct-inl.h",
    "jxl/fast_dct.cc",
    "jxl/fast_dct.h",
--- a/third_party/jpeg-xl/lib/jxl_lists.cmake
+++ b/third_party/jpeg-xl/lib/jxl_lists.cmake
@ -14,6 +14,7 @@ set(JPEGXL_INTERNAL_BASE_SOURCES
  jxl/base/common.h
  jxl/base/compiler_specific.h
  jxl/base/data_parallel.h
+  jxl/base/exif.h
  jxl/base/fast_math-inl.h
  jxl/base/float.h
  jxl/base/iaca.h
@ -185,7 +186,6 @@ set(JPEGXL_INTERNAL_DEC_SOURCES
  jxl/entropy_coder.h
  jxl/epf.cc
  jxl/epf.h
-  jxl/exif.h
  jxl/fast_dct-inl.h
  jxl/fast_dct.cc
  jxl/fast_dct.h