Merge pull request #8338 from unknownbrackets/xbrz

Update xBRZ to 1.4 and improve scaling/caching limits
2024-12-04 07:00:51 +00:00 · 2015-12-30 23:11:30 +01:00 · 2015-12-30 23:11:30 +01:00 · 37cfea0fdc
commit 37cfea0fdc
parent 716d7a38f5 f32e4bc3c9
5 changed files with 376 additions and 423 deletions
--- a/GPU/Common/TextureScalerCommon.cpp
+++ b/GPU/Common/TextureScalerCommon.cpp
@ -490,7 +490,6 @@ TextureScaler::TextureScaler() {
 }

 TextureScaler::~TextureScaler() {
-	xbrz::shutdown();
 }

 bool TextureScaler::IsEmptyOrFlat(u32* data, int pixels, int fmt) {
@ -564,7 +563,6 @@ void TextureScaler::Scale(u32* &data, u32 &dstFmt, int &width, int &height, int

 void TextureScaler::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
 	xbrz::ScalerCfg cfg;
-	xbrz::init();
 	GlobalThreadPool::Loop(std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, placeholder::_1, placeholder::_2), 0, height);
 }

--- a/GPU/GLES/TextureCache.cpp
+++ b/GPU/GLES/TextureCache.cpp
@ -19,10 +19,12 @@
 #include <cstring>

 #include "ext/xxhash.h"
+#include "i18n/i18n.h"
 #include "math/math_util.h"
 #include "profiler/profiler.h"

 #include "Common/ColorConv.h"
+#include "Core/Config.h"
 #include "Core/Host.h"
 #include "Core/MemMap.h"
 #include "Core/Reporting.h"
@ -36,8 +38,7 @@
 #include "GPU/GLES/ShaderManager.h"
 #include "GPU/GLES/TransformPipeline.h"
 #include "GPU/Common/TextureDecoder.h"
-#include "Core/Config.h"
-#include "Core/Host.h"
+#include "UI/OnScreenDisplay.h"

 #ifdef _M_SSE
 #include <xmmintrin.h>
@ -54,6 +55,8 @@

 // Changes more frequent than this will be considered "frequent" and prevent texture scaling.
 #define TEXCACHE_FRAME_CHANGE_FREQUENT 6
+// Note: only used when hash backoff is disabled.
+#define TEXCACHE_FRAME_CHANGE_FREQUENT_REGAIN_TRUST 33

 #define TEXCACHE_NAME_CACHE_SIZE 16

@ -241,6 +244,13 @@ void TextureCache::Invalidate(u32 addr, int size, GPUInvalidationType type) {
 				gpuStats.numTextureInvalidations++;
 				// Start it over from 0 (unless it's safe.)
 				iter->second.numFrames = type == GPU_INVALIDATE_SAFE ? 256 : 0;
+				if (type == GPU_INVALIDATE_SAFE) {
+					u32 diff = gpuStats.numFlips - iter->second.lastFrame;
+					// We still need to mark if the texture is frequently changing, even if it's safely changing.
+					if (diff < TEXCACHE_FRAME_CHANGE_FREQUENT) {
+						iter->second.status |= TexCacheEntry::STATUS_CHANGE_FREQUENT;
+					}
+				}
 				iter->second.framesUntilNextFullHash = 0;
 			} else if (!iter->second.framebuffer) {
 				iter->second.invalidHint++;
@ -1333,12 +1343,16 @@ void TextureCache::SetTexture(bool force) {
 				fullhash = QuickTexHash(texaddr, bufw, w, h, format, entry);
 				if (fullhash != entry->fullhash) {
 					hashFail = true;
-				} else if (entry->GetHashStatus() != TexCacheEntry::STATUS_HASHING && entry->numFrames > TexCacheEntry::FRAMES_REGAIN_TRUST) {
-					// Reset to STATUS_HASHING.
+				} else {
 					if (g_Config.bTextureBackoffCache) {
-						entry->SetHashStatus(TexCacheEntry::STATUS_HASHING);
+						if (entry->GetHashStatus() != TexCacheEntry::STATUS_HASHING && entry->numFrames > TexCacheEntry::FRAMES_REGAIN_TRUST) {
+							// Reset to STATUS_HASHING.
+							entry->SetHashStatus(TexCacheEntry::STATUS_HASHING);
+							entry->status &= ~TexCacheEntry::STATUS_CHANGE_FREQUENT;
+						}
+					} else if (entry->numFrames > TEXCACHE_FRAME_CHANGE_FREQUENT_REGAIN_TRUST) {
+						entry->status &= ~TexCacheEntry::STATUS_CHANGE_FREQUENT;
 					}
-					entry->status &= ~TexCacheEntry::STATUS_CHANGE_FREQUENT;
 				}
 			}

@ -1550,15 +1564,25 @@ void TextureCache::SetTexture(bool force) {
 		scaleFactor = g_Config.iTexScalingLevel;
 	}

+	// Ratchet down scale factor in low-memory mode.
+	if (lowMemoryMode_) {
+		// Keep it even, though, just in case of npot troubles.
+		scaleFactor = scaleFactor > 4 ? 4 : (scaleFactor > 2 ? 2 : 1);
+	}
+
 	// Don't scale the PPGe texture.
 	if (entry->addr > 0x05000000 && entry->addr < 0x08800000)
 		scaleFactor = 1;
+	// Frequently changing textures are not scaled; only textures we actually meant to scale get flagged.
+	if ((entry->status & TexCacheEntry::STATUS_CHANGE_FREQUENT) != 0 && scaleFactor != 1) {
+		// Remember for later that we /wanted/ to scale this texture.
+		entry->status |= TexCacheEntry::STATUS_TO_SCALE;
+		scaleFactor = 1;
+	}

-	if (scaleFactor != 1 && (entry->status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) {
+	if (scaleFactor != 1) {
 		if (texelsScaledThisFrame_ >= TEXCACHE_MAX_TEXELS_SCALED) {
 			entry->status |= TexCacheEntry::STATUS_TO_SCALE;
 			scaleFactor = 1;
-			// INFO_LOG(G3D, "Skipped scaling for now..");
 		} else {
 			entry->status &= ~TexCacheEntry::STATUS_TO_SCALE;
 			texelsScaledThisFrame_ += w * h;
@ -1957,7 +1981,7 @@ void TextureCache::LoadTextureLevel(TexCacheEntry &entry, int level, bool replac
 	gpuStats.numTexturesDecoded++;

 	// Can restore these and remove the fixup at the end of DecodeTextureLevel on desktop GL and GLES 3.
-	if ((g_Config.iTexScalingLevel == 1 && gstate_c.Supports(GPU_SUPPORTS_UNPACK_SUBIMAGE)) && w != bufw) {
+	if (scaleFactor == 1 && gstate_c.Supports(GPU_SUPPORTS_UNPACK_SUBIMAGE) && w != bufw) {
 		glPixelStorei(GL_UNPACK_ROW_LENGTH, bufw);
 		useUnpack = true;
 	}
@ -1967,7 +1991,7 @@ void TextureCache::LoadTextureLevel(TexCacheEntry &entry, int level, bool replac
 	useBGRA = UseBGRA8888() && dstFmt == GL_UNSIGNED_BYTE;

 	pixelData = (u32 *)finalBuf;
-	if (scaleFactor > 1 && (entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0)
+	if (scaleFactor > 1)
 		scaler.Scale(pixelData, dstFmt, w, h, scaleFactor);

 	if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) {
@ -2000,6 +2024,16 @@ void TextureCache::LoadTextureLevel(TexCacheEntry &entry, int level, bool replac
 				Decimate();
 				// Try again, now that we've cleared out textures in lowMemoryMode_.
 				glTexImage2D(GL_TEXTURE_2D, level, components, w, h, 0, components2, dstFmt, pixelData);
+
+				I18NCategory *err = GetI18NCategory("Error");
+				if (scaleFactor > 1) {
+					osm.Show(err->T("Warning: Video memory FULL, reducing upscaling and switching to slow caching mode"), 2.0f);
+				} else {
+					osm.Show(err->T("Warning: Video memory FULL, switching to slow caching mode"), 2.0f);
+				}
+			} else if (err != GL_NO_ERROR) {
+				// We checked the err anyway, might as well log if there is one.
+				WARN_LOG(G3D, "Got an error in texture upload: %08x", err);
 			}
 		}
 	}
--- a/ext/xbrz/config.h
+++ b/ext/xbrz/config.h
@ -23,17 +23,17 @@ namespace xbrz
 struct ScalerCfg
 {
    ScalerCfg() :
-        luminanceWeight_(1),
-        equalColorTolerance_(30),
+        luminanceWeight(1),
+        equalColorTolerance(30),
        dominantDirectionThreshold(3.6),
        steepDirectionThreshold(2.2),
-        newTestAttribute_(0) {}
+        newTestAttribute(0) {}

-    double luminanceWeight_;
-    double equalColorTolerance_;
+    double luminanceWeight;
+    double equalColorTolerance;
    double dominantDirectionThreshold;
    double steepDirectionThreshold;
-    double newTestAttribute_; //unused; test new parameters
+    double newTestAttribute; //unused; test new parameters
 };
 }

--- a/ext/xbrz/xbrz.cpp
+++ b/ext/xbrz/xbrz.cpp
@ -27,47 +27,65 @@ unsigned char getByte(uint32_t val) { return static_cast<unsigned char>((val >>

 // adjusted for RGBA
 // - Durante
-inline unsigned char getRed  (uint32_t val) { return getByte<0>(val); }
-inline unsigned char getGreen(uint32_t val) { return getByte<1>(val); }
-inline unsigned char getBlue (uint32_t val) { return getByte<2>(val); }
-inline unsigned char getAlpha(uint32_t val) { return getByte<3>(val); }
+inline unsigned char getRed  (uint32_t pix) { return getByte<0>(pix); }
+inline unsigned char getGreen(uint32_t pix) { return getByte<1>(pix); }
+inline unsigned char getBlue (uint32_t pix) { return getByte<2>(pix); }
+inline unsigned char getAlpha(uint32_t pix) { return getByte<3>(pix); }
+
+//assemble a pixel from its channels: r in bits 0-7, g in bits 8-15, b in bits 16-23 (bits 24-31 stay zero)
+inline uint32_t makePixel(unsigned char r, unsigned char g, unsigned char b)
+{
+	return (static_cast<uint32_t>(b) << 16) | (static_cast<uint32_t>(g) << 8) | static_cast<uint32_t>(r);
+}
+
+//same layout with a in bits 24-31; channels are widened to uint32_t first, because "a << 24" on the int-promoted value overflows a signed int for a >= 128
+inline uint32_t makePixel(unsigned char a, unsigned char r, unsigned char g, unsigned char b)
+{
+	return (static_cast<uint32_t>(a) << 24) | (static_cast<uint32_t>(b) << 16) | (static_cast<uint32_t>(g) << 8) | static_cast<uint32_t>(r);
+}


-template <class T> inline
-T abs(T value)
+template <unsigned int M, unsigned int N> inline
+uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
 {
-	static_assert(std::numeric_limits<T>::is_signed, "abs() requires signed types");
-	return value < 0 ? -value : value;
-}
+	static_assert(0 < M && M < N && N <= 1000, "");

-const uint32_t redMask   = 0x00ff0000;
-const uint32_t greenMask = 0x0000ff00;
-const uint32_t blueMask  = 0x000000ff;
+	auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };

-template <unsigned int N, unsigned int M> inline
-void alphaBlend(uint32_t& dst, uint32_t col) //blend color over destination with opacity N / M
-{
-	static_assert(N < 256, "possible overflow of (col & redMask) * N");
-	static_assert(M < 256, "possible overflow of (col & redMask  ) * N + (dst & redMask  ) * (M - N)");
-	static_assert(0 < N && N < M, "");
-	//dst = (redMask   & ((col & redMask  ) * N + (dst & redMask  ) * (M - N)) / M) | //this works because 8 upper bits are free
-	//      (greenMask & ((col & greenMask) * N + (dst & greenMask) * (M - N)) / M) |
-	//      (blueMask  & ((col & blueMask ) * N + (dst & blueMask ) * (M - N)) / M);
-
-	// the upper 8 bits are not free in our case, so we need to do this differently
-	// could probably be MUCH faster
-	// - Durante
-	uint8_t a = (((col	          ) >> 24) * N + ((dst	          ) >> 24) * (M - N) ) / M;
-	uint8_t r = (((col &   redMask) >> 16) * N + ((dst &   redMask) >> 16) * (M - N) ) / M;
-	uint8_t g = (((col & greenMask) >>  8) * N + ((dst & greenMask) >>  8) * (M - N) ) / M;
-	uint8_t b = (((col &  blueMask)      ) * N + ((dst &  blueMask)      ) * (M - N) ) / M;
-
-	dst = (a << 24) | (r << 16) | (g << 8) | (b << 0); 
+	return makePixel(calcColor(getRed  (pixFront), getRed  (pixBack)),
+					 calcColor(getGreen(pixFront), getGreen(pixBack)),
+					 calcColor(getBlue (pixFront), getBlue (pixBack)));
 }


-uint32_t*       byteAdvance(      uint32_t* ptr, int bytes) {  return reinterpret_cast<	     uint32_t*>(reinterpret_cast<      char*>(ptr) + bytes); }
-const uint32_t* byteAdvance(const uint32_t* ptr, int bytes) {  return reinterpret_cast<const uint32_t*>(reinterpret_cast<const char*>(ptr) + bytes); }
+//find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
+//front is weighted by alpha(front) * M, back by alpha(back) * (N - M); returns 0 if both weights are zero
+template <unsigned int M, unsigned int N> inline
+uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack)
+{
+	static_assert(0 < M && M < N && N <= 1000, "");
+
+	const unsigned int frontShare = getAlpha(pixFront) * M;
+	const unsigned int backShare  = getAlpha(pixBack) * (N - M);
+	const unsigned int totalShare = frontShare + backShare;
+
+	if (totalShare != 0)
+	{
+		//weighted average of a single color channel
+		auto mixChannel = [=](unsigned char chFront, unsigned char chBack) -> unsigned char
+		{
+			return static_cast<unsigned char>((chFront * frontShare + chBack * backShare) / totalShare);
+		};
+
+		const unsigned char alpha = static_cast<unsigned char>(totalShare / N);
+		const unsigned char red   = mixChannel(getRed  (pixFront), getRed  (pixBack));
+		const unsigned char green = mixChannel(getGreen(pixFront), getGreen(pixBack));
+		const unsigned char blue  = mixChannel(getBlue (pixFront), getBlue (pixBack));
+
+		return makePixel(alpha, red, green, blue);
+	}
+
+	return 0; //both weights are zero: nothing to average
+}
+
+
+//inline
+//double fastSqrt(double n)
+//{
+//    __asm //speeds up xBRZ by about 9% compared to std::sqrt which internally uses the same assembler instructions but adds some "fluff"
+//    {
+//        fld n
+//        fsqrt
+//    }
+//}
+//
+
+
+//move a pixel pointer forward by a byte count (not an element count)
+uint32_t* byteAdvance(uint32_t* ptr, int bytes)
+{
+	char* const rawPos = reinterpret_cast<char*>(ptr);
+	return reinterpret_cast<uint32_t*>(rawPos + bytes);
+}
+
+//const overload of the above
+const uint32_t* byteAdvance(const uint32_t* ptr, int bytes)
+{
+	const char* const rawPos = reinterpret_cast<const char*>(ptr);
+	return reinterpret_cast<const uint32_t*>(rawPos + bytes);
+}


 //fill block  with the given color
@ -148,199 +166,6 @@ template <class T> inline
 T square(T value) { return value * value; }


-/*
-inline
-void rgbtoLuv(uint32_t c, double& L, double& u, double& v)
-{
-	//http://www.easyrgb.com/index.php?X=MATH&H=02#text2
-	double r = getRed  (c) / 255.0;
-	double g = getGreen(c) / 255.0;
-	double b = getBlue (c) / 255.0;
-
-	if ( r > 0.04045 )
-		r = std::pow(( ( r + 0.055 ) / 1.055 ) , 2.4);
-	else
-		r /= 12.92;
-	if ( g > 0.04045 )
-		g = std::pow(( ( g + 0.055 ) / 1.055 ) , 2.4);
-	else
-		g /=  12.92;
-	if ( b > 0.04045 )
-		b  = std::pow(( ( b + 0.055 ) / 1.055 ) , 2.4);
-	else
-		b /=  12.92;
-
-	r *= 100;
-	g *= 100;
-	b *= 100;
-
-	double x = 0.4124564 * r + 0.3575761 * g + 0.1804375 * b;
-	double y = 0.2126729 * r + 0.7151522 * g + 0.0721750 * b;
-	double z = 0.0193339 * r + 0.1191920 * g + 0.9503041 * b;
-	//---------------------
-	double var_U =  4 * x  / ( x +  15 * y  +  3 * z  );
-	double var_V =  9 * y  / ( x +  15 * y  +  3 * z  );
-	double var_Y = y / 100;
-
-	if ( var_Y > 0.008856 ) var_Y = std::pow(var_Y , 1.0/3 );
-	else					var_Y =  7.787 * var_Y  +  16.0 / 116;
-
-	const double ref_X =  95.047;		//Observer= 2°, Illuminant= D65
-	const double ref_Y = 100.000;
-	const double ref_Z = 108.883;
-
-	const double ref_U = ( 4 * ref_X ) / ( ref_X + ( 15 * ref_Y ) + ( 3 * ref_Z ) );
-	const double ref_V = ( 9 * ref_Y ) / ( ref_X + ( 15 * ref_Y ) + ( 3 * ref_Z ) );
-
-	L = ( 116 * var_Y ) - 16;
-	u = 13 * L * ( var_U - ref_U );
-	v = 13 * L * ( var_V - ref_V );
-}
-*/
-
-inline
-void rgbtoLab(uint32_t c, unsigned char& L, signed char& A, signed char& B)
-{
-	//code: http://www.easyrgb.com/index.php?X=MATH
-	//test: http://www.workwithcolor.com/color-converter-01.htm
-	//------RGB to XYZ------
-	double r = getRed  (c) / 255.0;
-	double g = getGreen(c) / 255.0;
-	double b = getBlue (c) / 255.0;
-
-	r = r > 0.04045 ? std::pow(( r + 0.055 ) / 1.055, 2.4) : r / 12.92;
-	r = g > 0.04045 ? std::pow(( g + 0.055 ) / 1.055, 2.4) : g / 12.92;
-	r = b > 0.04045 ? std::pow(( b + 0.055 ) / 1.055, 2.4) : b / 12.92;
-
-	r *= 100;
-	g *= 100;
-	b *= 100;
-
-	double x = 0.4124564 * r + 0.3575761 * g + 0.1804375 * b;
-	double y = 0.2126729 * r + 0.7151522 * g + 0.0721750 * b;
-	double z = 0.0193339 * r + 0.1191920 * g + 0.9503041 * b;
-	//------XYZ to Lab------
-	const double refX = 95.047;  //
-	const double refY = 100.000; //Observer= 2°, Illuminant= D65
-	const double refZ = 108.883; //
-	double var_X = x / refX;
-	double var_Y = y / refY;
-	double var_Z = z / refZ;
-
-	var_X = var_X > 0.008856 ? std::pow(var_X, 1.0 / 3) : 7.787 * var_X + 4.0 / 29;
-	var_Y = var_Y > 0.008856 ? std::pow(var_Y, 1.0 / 3) : 7.787 * var_Y + 4.0 / 29;
-	var_Z = var_Z > 0.008856 ? std::pow(var_Z, 1.0 / 3) : 7.787 * var_Z + 4.0 / 29;
-
-	L = static_cast<unsigned char>(116 * var_Y  - 16);
-	A = static_cast<  signed char>(500 * (var_X - var_Y));
-	B = static_cast<  signed char>(200 * (var_Y - var_Z));
-};
-
-
-inline
-double distLAB(uint32_t pix1, uint32_t pix2)
-{
-	unsigned char L1 = 0; //[0, 100]
-	signed   char a1 = 0; //[-128, 127]
-	signed   char b1 = 0; //[-128, 127]
-	rgbtoLab(pix1, L1, a1, b1);
-
-	unsigned char L2 = 0;
-	signed   char a2 = 0;
-	signed   char b2 = 0;
-	rgbtoLab(pix2, L2, a2, b2);
-
-	//-----------------------------
-	//http://www.easyrgb.com/index.php?X=DELT
-
-	//Delta E/CIE76
-	return std::sqrt(square(1.0 * L1 - L2) +
-					 square(1.0 * a1 - a2) +
-					 square(1.0 * b1 - b2));
-}
-
-
-/*
-inline
-void rgbtoHsl(uint32_t c, double& h, double& s, double& l)
-{
-	//http://www.easyrgb.com/index.php?X=MATH&H=18#text18
-	const int r = getRed  (c);
-	const int g = getGreen(c);
-	const int b = getBlue (c);
-
-	const int varMin = numeric::min(r, g, b);
-	const int varMax = numeric::max(r, g, b);
-	const int delMax = varMax - varMin;
-
-	l = (varMax + varMin) / 2.0 / 255.0;
-
-	if (delMax == 0) //gray, no chroma...
-	{
-		h = 0;
-		s = 0;
-	}
-	else
-	{
-		s = l < 0.5 ?
-			delMax / (1.0 * varMax + varMin) :
-			delMax / (2.0 * 255 - varMax - varMin);
-
-		double delR = ((varMax - r) / 6.0 + delMax / 2.0) / delMax;
-		double delG = ((varMax - g) / 6.0 + delMax / 2.0) / delMax;
-		double delB = ((varMax - b) / 6.0 + delMax / 2.0) / delMax;
-
-		if (r == varMax)
-			h = delB - delG;
-		else if (g == varMax)
-			h = 1 / 3.0 + delR - delB;
-		else if (b == varMax)
-			h = 2 / 3.0 + delG - delR;
-
-		if (h < 0)
-			h += 1;
-		if (h > 1)
-			h -= 1;
-	}
-}
-
-inline
-double distHSL(uint32_t pix1, uint32_t pix2, double lightningWeight)
-{
-	double h1 = 0;
-	double s1 = 0;
-	double l1 = 0;
-	rgbtoHsl(pix1, h1, s1, l1);
-	double h2 = 0;
-	double s2 = 0;
-	double l2 = 0;
-	rgbtoHsl(pix2, h2, s2, l2);
-
-	//HSL is in cylindric coordinatates where L represents height, S radius, H angle,
-	//however we interpret the cylinder as a bi-conic solid with top/bottom radius 0, middle radius 1
-	assert(0 <= h1 && h1 <= 1);
-	assert(0 <= h2 && h2 <= 1);
-
-	double r1 = l1 < 0.5 ?
-				l1 * 2 :
-				2 - l1 * 2;
-
-	double x1 = r1 * s1 * std::cos(h1 * 2 * numeric::pi);
-	double y1 = r1 * s1 * std::sin(h1 * 2 * numeric::pi);
-	double z1 = l1;
-
-	double r2 = l2 < 0.5 ?
-				l2 * 2 :
-				2 - l2 * 2;
-
-	double x2 = r2 * s2 * std::cos(h2 * 2 * numeric::pi);
-	double y2 = r2 * s2 * std::sin(h2 * 2 * numeric::pi);
-	double z2 = l2;
-
-	return 255 * std::sqrt(square(x1 - x2) + square(y1 - y2) +  square(lightningWeight * (z1 - z2)));
-}
-*/
-

 inline
 double distRGB(uint32_t pix1, uint32_t pix2)
@ -354,19 +179,6 @@ double distRGB(uint32_t pix1, uint32_t pix2)
 }


-inline
-double distNonLinearRGB(uint32_t pix1, uint32_t pix2)
-{
-	//non-linear rgb: http://www.compuphase.com/cmetric.htm
-	const double r_diff = static_cast<int>(getRed  (pix1)) - getRed  (pix2);
-	const double g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
-	const double b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);
-
-	const double r_avg = (static_cast<double>(getRed(pix1)) + getRed(pix2)) / 2;
-	return std::sqrt((2 + r_avg / 255) * square(r_diff) + 4 * square(g_diff) + (2 + (255 - r_avg) / 255) * square(b_diff));
-}
-
-
 inline
 double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
 {
@ -390,13 +202,23 @@ double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
 	const double c_r = scale_r * (r_diff - y);

 	//we skip division by 255 to have similar range like other distance functions
-	return std::sqrt(square(lumaWeight * y) + square(c_b) +  square(c_r));
+	return std::sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
 }


 struct DistYCbCrBuffer //30% perf boost compared to distYCbCr()!
 {
 public:
+	static double dist(uint32_t pix1, uint32_t pix2) //public entry point: forwards to distImpl() on a single shared instance
+	{
+#if defined _MSC_VER && _MSC_VER < 1900
+#error function scope static initialization is not yet thread-safe!
+#endif
+		static const DistYCbCrBuffer inst; //function-local static: constructed on the first call; the #error above refuses MSVC versions (_MSC_VER < 1900) where that initialization is not thread-safe
+		return inst.distImpl(pix1, pix2);
+	}
+
+private:
 	DistYCbCrBuffer() : buffer(256 * 256 * 256)
 	{
 		for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
@ -420,7 +242,7 @@ public:
 		}
 	}

-	double dist(uint32_t pix1, uint32_t pix2) const
+	double distImpl(uint32_t pix1, uint32_t pix2) const
 	{
 		//if (pix1 == pix2) -> 8% perf degradation!
 		//	return 0;
@ -436,45 +258,8 @@ public:
 					  (( b_diff + 255) / 2)];
 	}

-private:
-	std::vector<float> buffer; //consumes 64 MB memory; using double is 2% faster, but takes 128 MB
+	std::vector<float> buffer; //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
 };
-DistYCbCrBuffer *distYCbCrBuffer = nullptr;
-
-
-inline
-double distYUV(uint32_t pix1, uint32_t pix2, double luminanceWeight)
-{
-	//perf: it's not worthwhile to buffer the YUV-conversion, the direct code is faster by ~ 6%
-	//since RGB -> YUV conversion is essentially a matrix multiplication, we can calculate the RGB diff before the conversion (distributive property)
-	const double r_diff = static_cast<int>(getRed  (pix1)) - getRed  (pix2);
-	const double g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
-	const double b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);
-
-	//http://en.wikipedia.org/wiki/YUV#Conversion_to.2Ffrom_RGB
-	const double w_b = 0.114;
-	const double w_r = 0.299;
-	const double w_g = 1 - w_r - w_b;
-
-	const double u_max = 0.436;
-	const double v_max = 0.615;
-
-	const double scale_u = u_max / (1 - w_b);
-	const double scale_v = v_max / (1 - w_r);
-
-	double y = w_r * r_diff + w_g * g_diff + w_b * b_diff;//value range: 255 * [-1, 1]
-	double u = scale_u * (b_diff - y);					  //value range: 255 * 2 * u_max * [-1, 1]
-	double v = scale_v * (r_diff - y);					  //value range: 255 * 2 * v_max * [-1, 1]
-
-#ifdef _DEBUG
-	const double eps = 0.5;
-	assert(abs(y) <= 255 + eps);
-	assert(abs(u) <= 255 * 2 * u_max + eps);
-	assert(abs(v) <= 255 * 2 * v_max + eps);
-#endif
-
-	return std::sqrt(square(luminanceWeight * y) + square(u) +  square(v));
-}


 enum BlendType
@ -526,7 +311,7 @@ BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg)
 		 ker.g == ker.k))
 		return result;

-	auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight_); };
+	auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };

 	const int weight = 4;
 	double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + weight * dist(ker.j, ker.g);
@ -626,7 +411,7 @@ input kernel area naming convention:
 */
 template <class Scaler, class ColorDistance, RotationDegree rotDeg>
 FORCE_INLINE //perf: quite worth it!
-void scalePixel(const Kernel_3x3& ker,
+void blendPixel(const Kernel_3x3& ker,
 				uint32_t* target, int trgWidth,
 				unsigned char blendInfo, //result of preprocessing all four corners of pixel "e"
 				const xbrz::ScalerCfg& cfg)
@ -641,12 +426,17 @@ void scalePixel(const Kernel_3x3& ker,
 #define h get_h<rotDeg>(ker)
 #define i get_i<rotDeg>(ker)

+#ifdef _DEBUG
+	if (breakIntoDebugger)
+		__debugbreak(); //__asm int 3;
+#endif
+
 	const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);

 	if (getBottomR(blend) >= BLEND_NORMAL)
 	{
-		auto eq   = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight_) < cfg.equalColorTolerance_; };
-		auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight_); };
+		auto eq   = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight) < cfg.equalColorTolerance; };
+		auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };

 		const bool doLineBlend = [&]() -> bool
 		{
@ -726,7 +516,7 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
 	std::fill(preProcBuffer, preProcBuffer + bufferSize, 0);
 	static_assert(BLEND_NONE == 0, "");

-	//initialize preprocessing buffer for first row: detect upper left and right corner blending
+	//initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
 	//this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
 	if (yFirst > 0)
 	{
@ -853,7 +643,7 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
 			fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale); //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!

 			//blend four corners of current pixel
-			if (blendingNeeded(blend_xy)) //good 20% perf-improvement
+			if (blendingNeeded(blend_xy)) //good 5% perf-improvement
 			{
 				Kernel_3x3 ker3 = {}; //perf: initialization is negligible

@ -869,10 +659,10 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
 				ker3.h = ker4.j;
 				ker3.i = ker4.k;

-				scalePixel<Scaler, ColorDistance, ROT_0  >(ker3, out, trgWidth, blend_xy, cfg);
-				scalePixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
-				scalePixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
-				scalePixel<Scaler, ColorDistance, ROT_270>(ker3, out, trgWidth, blend_xy, cfg);
+				blendPixel<Scaler, ColorDistance, ROT_0  >(ker3, out, trgWidth, blend_xy, cfg);
+				blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
+				blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
+				blendPixel<Scaler, ColorDistance, ROT_270>(ker3, out, trgWidth, blend_xy, cfg);
 			}
 		}
 	}
@ -880,112 +670,127 @@ void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,

 //------------------------------------------------------------------------------------

-struct Scaler2x
+template <class ColorGradient>
+struct Scaler2x : public ColorGradient //blend patterns for scale factor 2; the ColorGradient base supplies the alphaGrad<M, N>() that does the actual mixing
 {
 	static const int scale = 2;

+	template <unsigned int M, unsigned int N> //bring template function into scope for GCC
+	static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); } //pure forwarder; argument order: target pixel (by reference) first, color to mix in second
+
+
 	template <class OutputMatrix>
 	static void blendLineShallow(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<scale - 1, 0>(), col);
-		alphaBlend<3, 4>(out.template ref<scale - 1, 1>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
 	}

 	template <class OutputMatrix>
 	static void blendLineSteep(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<0, scale - 1>(), col);
-		alphaBlend<3, 4>(out.template ref<1, scale - 1>(), col);
+		alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col); //same weights as blendLineShallow, with the two ref<> indices swapped
+		alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
 	}

 	template <class OutputMatrix>
 	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<1, 0>(), col);
-		alphaBlend<1, 4>(out.template ref<0, 1>(), col);
-		alphaBlend<5, 6>(out.template ref<1, 1>(), col); //[!] fixes 7/8 used in xBR
+		alphaGrad<1, 4>(out.template ref<1, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<0, 1>(), col);
+		alphaGrad<5, 6>(out.template ref<1, 1>(), col); //[!] fixes 7/8 used in xBR
 	}

 	template <class OutputMatrix>
 	static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 2>(out.template ref<1, 1>(), col);
+		alphaGrad<1, 2>(out.template ref<1, 1>(), col);
 	}

 	template <class OutputMatrix>
 	static void blendCorner(uint32_t col, OutputMatrix& out)
 	{
 		//model a round corner
-		alphaBlend<21, 100>(out.template ref<1, 1>(), col); //exact: 1 - pi/4 = 0.2146018366
+		alphaGrad<21, 100>(out.template ref<1, 1>(), col); //exact: 1 - pi/4 = 0.2146018366
 	}
 };


-struct Scaler3x
+template <class ColorGradient>
+struct Scaler3x : public ColorGradient //blend patterns for scale factor 3; the ColorGradient base supplies the alphaGrad<M, N>() that does the actual mixing
 {
 	static const int scale = 3;

+	template <unsigned int M, unsigned int N> //bring template function into scope for GCC
+	static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); } //pure forwarder; argument order: target pixel (by reference) first, color to mix in second
+
+
 	template <class OutputMatrix>
 	static void blendLineShallow(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<scale - 1, 0>(), col);
-		alphaBlend<1, 4>(out.template ref<scale - 2, 2>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);

-		alphaBlend<3, 4>(out.template ref<scale - 1, 1>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
 		out.template ref<scale - 1, 2>() = col;
 	}

 	template <class OutputMatrix>
 	static void blendLineSteep(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<0, scale - 1>(), col);
-		alphaBlend<1, 4>(out.template ref<2, scale - 2>(), col);
+		alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col); //same weights as blendLineShallow, with the two ref<> indices swapped
+		alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);

-		alphaBlend<3, 4>(out.template ref<1, scale - 1>(), col);
+		alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
 		out.template ref<2, scale - 1>() = col;
 	}

 	template <class OutputMatrix>
 	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<2, 0>(), col);
-		alphaBlend<1, 4>(out.template ref<0, 2>(), col);
-		alphaBlend<3, 4>(out.template ref<2, 1>(), col);
-		alphaBlend<3, 4>(out.template ref<1, 2>(), col);
+		alphaGrad<1, 4>(out.template ref<2, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<0, 2>(), col);
+		alphaGrad<3, 4>(out.template ref<2, 1>(), col);
+		alphaGrad<3, 4>(out.template ref<1, 2>(), col);
 		out.template ref<2, 2>() = col;
 	}

 	template <class OutputMatrix>
 	static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 8>(out.template ref<1, 2>(), col);
-		alphaBlend<1, 8>(out.template ref<2, 1>(), col);
-		alphaBlend<7, 8>(out.template ref<2, 2>(), col);
+		alphaGrad<1, 8>(out.template ref<1, 2>(), col); //conflict with other rotations for this odd scale
+		alphaGrad<1, 8>(out.template ref<2, 1>(), col);
+		alphaGrad<7, 8>(out.template ref<2, 2>(), col); //
 	}

 	template <class OutputMatrix>
 	static void blendCorner(uint32_t col, OutputMatrix& out)
 	{
 		//model a round corner
-		alphaBlend<45, 100>(out.template ref<2, 2>(), col); //exact: 0.4545939598
-		//alphaBlend<14, 1000>(out.template ref<2, 1>(), col); //0.01413008627 -> negligible
-		//alphaBlend<14, 1000>(out.template ref<1, 2>(), col); //0.01413008627
+		alphaGrad<45, 100>(out.template ref<2, 2>(), col); //exact: 0.4545939598
+		//alphaGrad<7, 256>(out.template ref<2, 1>(), col); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
+		//alphaGrad<7, 256>(out.template ref<1, 2>(), col); //0.02826017254
 	}
 };


-struct Scaler4x
+template <class ColorGradient>
+struct Scaler4x : public ColorGradient
 {
 	static const int scale = 4;

+	template <unsigned int M, unsigned int N> //bring template function into scope for GCC
+	static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
+
+
 	template <class OutputMatrix>
 	static void blendLineShallow(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<scale - 1, 0>(), col);
-		alphaBlend<1, 4>(out.template ref<scale - 2, 2>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);

-		alphaBlend<3, 4>(out.template ref<scale - 1, 1>(), col);
-		alphaBlend<3, 4>(out.template ref<scale - 2, 3>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);

 		out.template ref<scale - 1, 2>() = col;
 		out.template ref<scale - 1, 3>() = col;
@ -994,11 +799,11 @@ struct Scaler4x
 	template <class OutputMatrix>
 	static void blendLineSteep(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<0, scale - 1>(), col);
-		alphaBlend<1, 4>(out.template ref<2, scale - 2>(), col);
+		alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
+		alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);

-		alphaBlend<3, 4>(out.template ref<1, scale - 1>(), col);
-		alphaBlend<3, 4>(out.template ref<3, scale - 2>(), col);
+		alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
+		alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);

 		out.template ref<2, scale - 1>() = col;
 		out.template ref<3, scale - 1>() = col;
@ -1007,19 +812,23 @@ struct Scaler4x
 	template <class OutputMatrix>
 	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<3, 4>(out.template ref<3, 1>(), col);
-		alphaBlend<3, 4>(out.template ref<1, 3>(), col);
-		alphaBlend<1, 4>(out.template ref<3, 0>(), col);
-		alphaBlend<1, 4>(out.template ref<0, 3>(), col);
-		alphaBlend<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR
-		out.template ref<3, 3>() = out.template ref<3, 2>() = out.template ref<2, 3>() = col;
+		alphaGrad<3, 4>(out.template ref<3, 1>(), col);
+		alphaGrad<3, 4>(out.template ref<1, 3>(), col);
+		alphaGrad<1, 4>(out.template ref<3, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<0, 3>(), col);
+
+		alphaGrad<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR
+
+		out.template ref<3, 3>() = col;
+		out.template ref<3, 2>() = col;
+		out.template ref<2, 3>() = col;
 	}

 	template <class OutputMatrix>
 	static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 2>(out.template ref<scale - 1, scale / 2	>(), col);
-		alphaBlend<1, 2>(out.template ref<scale - 2, scale / 2 + 1>(), col);
+		alphaGrad<1, 2>(out.template ref<scale - 1, scale / 2    >(), col);
+		alphaGrad<1, 2>(out.template ref<scale - 2, scale / 2 + 1>(), col);
 		out.template ref<scale - 1, scale - 1>() = col;
 	}

@ -1027,26 +836,31 @@ struct Scaler4x
 	static void blendCorner(uint32_t col, OutputMatrix& out)
 	{
 		//model a round corner
-		alphaBlend<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563
-		alphaBlend< 9, 100>(out.template ref<3, 2>(), col); //0.08677704501
-		alphaBlend< 9, 100>(out.template ref<2, 3>(), col); //0.08677704501
+		alphaGrad<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563
+		alphaGrad< 9, 100>(out.template ref<3, 2>(), col); //0.08677704501
+		alphaGrad< 9, 100>(out.template ref<2, 3>(), col); //0.08677704501
 	}
 };


-struct Scaler5x
+template <class ColorGradient>
+struct Scaler5x : public ColorGradient
 {
 	static const int scale = 5;

+	template <unsigned int M, unsigned int N> //bring template function into scope for GCC
+	static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
+
+
 	template <class OutputMatrix>
 	static void blendLineShallow(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<scale - 1, 0>(), col);
-		alphaBlend<1, 4>(out.template ref<scale - 2, 2>(), col);
-		alphaBlend<1, 4>(out.template ref<scale - 3, 4>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 3, 4>(), col);

-		alphaBlend<3, 4>(out.template ref<scale - 1, 1>(), col);
-		alphaBlend<3, 4>(out.template ref<scale - 2, 3>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);

 		out.template ref<scale - 1, 2>() = col;
 		out.template ref<scale - 1, 3>() = col;
@ -1057,12 +871,12 @@ struct Scaler5x
 	template <class OutputMatrix>
 	static void blendLineSteep(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<0, scale - 1>(), col);
-		alphaBlend<1, 4>(out.template ref<2, scale - 2>(), col);
-		alphaBlend<1, 4>(out.template ref<4, scale - 3>(), col);
+		alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
+		alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
+		alphaGrad<1, 4>(out.template ref<4, scale - 3>(), col);

-		alphaBlend<3, 4>(out.template ref<1, scale - 1>(), col);
-		alphaBlend<3, 4>(out.template ref<3, scale - 2>(), col);
+		alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
+		alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);

 		out.template ref<2, scale - 1>() = col;
 		out.template ref<3, scale - 1>() = col;
@ -1073,34 +887,33 @@ struct Scaler5x
 	template <class OutputMatrix>
 	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 4>(out.template ref<0, scale - 1>(), col);
-		alphaBlend<1, 4>(out.template ref<2, scale - 2>(), col);
-		alphaBlend<3, 4>(out.template ref<1, scale - 1>(), col);
+		alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
+		alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
+		alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);

-		alphaBlend<1, 4>(out.template ref<scale - 1, 0>(), col);
-		alphaBlend<1, 4>(out.template ref<scale - 2, 2>(), col);
-		alphaBlend<3, 4>(out.template ref<scale - 1, 1>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
+
+		alphaGrad<2, 3>(out.template ref<3, 3>(), col);

 		out.template ref<2, scale - 1>() = col;
 		out.template ref<3, scale - 1>() = col;
+		out.template ref<4, scale - 1>() = col;

 		out.template ref<scale - 1, 2>() = col;
 		out.template ref<scale - 1, 3>() = col;
-
-		out.template ref<4, scale - 1>() = col;
-
-		alphaBlend<2, 3>(out.template ref<3, 3>(), col);
 	}

 	template <class OutputMatrix>
 	static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
 	{
-		alphaBlend<1, 8>(out.template ref<scale - 1, scale / 2	>(), col);
-		alphaBlend<1, 8>(out.template ref<scale - 2, scale / 2 + 1>(), col);
-		alphaBlend<1, 8>(out.template ref<scale - 3, scale / 2 + 2>(), col);
+		alphaGrad<1, 8>(out.template ref<scale - 1, scale / 2    >(), col); //conflict with other rotations for this odd scale
+		alphaGrad<1, 8>(out.template ref<scale - 2, scale / 2 + 1>(), col);
+		alphaGrad<1, 8>(out.template ref<scale - 3, scale / 2 + 2>(), col); //conflict with other rotations for this odd scale, as above

-		alphaBlend<7, 8>(out.template ref<4, 3>(), col);
-		alphaBlend<7, 8>(out.template ref<3, 4>(), col);
+		alphaGrad<7, 8>(out.template ref<4, 3>(), col);
+		alphaGrad<7, 8>(out.template ref<3, 4>(), col);

 		out.template ref<4, 4>() = col;
 	}
@ -1109,11 +922,110 @@ struct Scaler5x
 	static void blendCorner(uint32_t col, OutputMatrix& out)
 	{
 		//model a round corner
-		alphaBlend<86, 100>(out.template ref<4, 4>(), col); //exact: 0.8631434088
-		alphaBlend<23, 100>(out.template ref<4, 3>(), col); //0.2306749731
-		alphaBlend<23, 100>(out.template ref<3, 4>(), col); //0.2306749731
-		//alphaBlend<8, 1000>(out.template ref<4, 2>(), col); //0.008384061834 -> negligible
-		//alphaBlend<8, 1000>(out.template ref<2, 4>(), col); //0.008384061834
+		alphaGrad<86, 100>(out.template ref<4, 4>(), col); //exact: 0.8631434088
+		alphaGrad<23, 100>(out.template ref<4, 3>(), col); //0.2306749731
+		alphaGrad<23, 100>(out.template ref<3, 4>(), col); //0.2306749731
+		//alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367 -> negligible + avoid conflicts with other rotations for this odd scale
+		//alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
+	}
+};
+
+
+template <class ColorGradient>
+struct Scaler6x : public ColorGradient
+{
+	static const int scale = 6;
+
+	template <unsigned int M, unsigned int N> //bring template function into scope for GCC
+	static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }
+
+
+	template <class OutputMatrix>
+	static void blendLineShallow(uint32_t col, OutputMatrix& out)
+	{
+		alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 3, 4>(), col);
+
+		alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 3, 5>(), col);
+
+		out.template ref<scale - 1, 2>() = col;
+		out.template ref<scale - 1, 3>() = col;
+		out.template ref<scale - 1, 4>() = col;
+		out.template ref<scale - 1, 5>() = col;
+
+		out.template ref<scale - 2, 4>() = col;
+		out.template ref<scale - 2, 5>() = col;
+	}
+
+	template <class OutputMatrix>
+	static void blendLineSteep(uint32_t col, OutputMatrix& out)
+	{
+		alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
+		alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
+		alphaGrad<1, 4>(out.template ref<4, scale - 3>(), col);
+
+		alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
+		alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
+		alphaGrad<3, 4>(out.template ref<5, scale - 3>(), col);
+
+		out.template ref<2, scale - 1>() = col;
+		out.template ref<3, scale - 1>() = col;
+		out.template ref<4, scale - 1>() = col;
+		out.template ref<5, scale - 1>() = col;
+
+		out.template ref<4, scale - 2>() = col;
+		out.template ref<5, scale - 2>() = col;
+	}
+
+	template <class OutputMatrix>
+	static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
+	{
+		alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
+		alphaGrad<1, 4>(out.template ref<2, scale - 2>(), col);
+		alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
+		alphaGrad<3, 4>(out.template ref<3, scale - 2>(), col);
+
+		alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
+		alphaGrad<1, 4>(out.template ref<scale - 2, 2>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
+		alphaGrad<3, 4>(out.template ref<scale - 2, 3>(), col);
+
+		out.template ref<2, scale - 1>() = col;
+		out.template ref<3, scale - 1>() = col;
+		out.template ref<4, scale - 1>() = col;
+		out.template ref<5, scale - 1>() = col;
+
+		out.template ref<4, scale - 2>() = col;
+		out.template ref<5, scale - 2>() = col;
+
+		out.template ref<scale - 1, 2>() = col;
+		out.template ref<scale - 1, 3>() = col;
+	}
+
+	template <class OutputMatrix>
+	static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
+	{
+		alphaGrad<1, 2>(out.template ref<scale - 1, scale / 2    >(), col);
+		alphaGrad<1, 2>(out.template ref<scale - 2, scale / 2 + 1>(), col);
+		alphaGrad<1, 2>(out.template ref<scale - 3, scale / 2 + 2>(), col);
+
+		out.template ref<scale - 2, scale - 1>() = col;
+		out.template ref<scale - 1, scale - 1>() = col;
+		out.template ref<scale - 1, scale - 2>() = col;
+	}
+
+	template <class OutputMatrix>
+	static void blendCorner(uint32_t col, OutputMatrix& out)
+	{
+		//model a round corner
+		alphaGrad<97, 100>(out.template ref<5, 5>(), col); //exact: 0.9711013910
+		alphaGrad<42, 100>(out.template ref<4, 5>(), col); //0.4236372243
+		alphaGrad<42, 100>(out.template ref<5, 4>(), col); //0.4236372243
+		alphaGrad< 6, 100>(out.template ref<5, 3>(), col); //0.05652034508
+		alphaGrad< 6, 100>(out.template ref<3, 5>(), col); //0.05652034508
 	}
 };

@ -1123,7 +1035,7 @@ struct ColorDistanceRGB
 {
 	static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
 	{
-		return distYCbCrBuffer->dist(pix1, pix2);
+		return DistYCbCrBuffer::dist(pix1, pix2);

 		//if (pix1 == pix2) //about 4% perf boost
 		//	return 0;
@ -1142,18 +1054,37 @@ struct ColorDistanceARGB

 			1. if a1 = a2, distance should be: a1 * distYCbCr()
 			2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
-			3. if a1 = 1,  distance should be: 255 * (1 - a2) + a2 * distYCbCr()
+			3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
 		*/

-		const double d = distYCbCrBuffer->dist(pix1, pix2);
-		if (a1 > a2)
-			return a2 * d + 255 * (a1 - a2);
-		else
+		//return std::min(a1, a2) * DistYCbCrBuffer::dist(pix1, pix2) + 255 * abs(a1 - a2);
+		//=> following code is 15% faster:
+		const double d = DistYCbCrBuffer::dist(pix1, pix2);
+		if (a1 < a2)
 			return a1 * d + 255 * (a2 - a1);
+		else
+			return a2 * d + 255 * (a1 - a2);

-		//if (pix1 == pix2)
-		//	return 0;
-		//return std::min(a1, a2) * distYCbCr(pix1, pix2, luminanceWeight) + 255 * abs(a1 - a2);
+		//alternative? return std::sqrt(a1 * a2 * square(DistYCbCrBuffer::dist(pix1, pix2)) + square(255 * (a1 - a2)));
+	}
+};
+
+
+struct ColorGradientRGB
+{
+	template <unsigned int M, unsigned int N>
+	static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
+	{
+		pixBack = gradientRGB<M, N>(pixFront, pixBack);
+	}
+};
+
+struct ColorGradientARGB
+{
+	template <unsigned int M, unsigned int N>
+	static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
+	{
+		pixBack = gradientARGB<M, N>(pixFront, pixBack);
 	}
 };
 }
@ -1167,45 +1098,38 @@ void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth
 			switch (factor)
 			{
 				case 2:
-					return scaleImage<Scaler2x, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+					return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
 				case 3:
-					return scaleImage<Scaler3x, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+					return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
 				case 4:
-					return scaleImage<Scaler4x, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+					return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
 				case 5:
-					return scaleImage<Scaler5x, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+					return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+				case 6:
+					return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
 			}
+			break;
+
 		case ColorFormat::RGB:
 			switch (factor)
 			{
 				case 2:
-					return scaleImage<Scaler2x, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+					return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
 				case 3:
-					return scaleImage<Scaler3x, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+					return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
 				case 4:
-					return scaleImage<Scaler4x, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+					return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
 				case 5:
-					return scaleImage<Scaler5x, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+					return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
+				case 6:
+					return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
 			}
+			break;
 	}
 	assert(false);
 }


-void xbrz::init()
-{
-	if (distYCbCrBuffer == nullptr)
-		distYCbCrBuffer = new DistYCbCrBuffer();
-}
-
-
-void xbrz::shutdown()
-{
-	delete distYCbCrBuffer;
-	distYCbCrBuffer = nullptr;
-}
-
-
 bool xbrz::equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, double luminanceWeight, double equalColorTolerance)
 {
 	switch (colFmt)
--- a/ext/xbrz/xbrz.h
+++ b/ext/xbrz/xbrz.h
@ -42,12 +42,13 @@ http://board.byuu.org/viewtopic.php?f=10&t=2248
 - support multithreading
 - support 64-bit architectures
 - support processing image slices
+- support scaling up to 6xBRZ
 */

 enum class ColorFormat //from high bits -> low bits, 8 bit per channel
 {
-    ARGB, //including alpha channel, BGRA byte order on little-endian machines
    RGB,  //8 bit for each red, green, blue, upper 8 bits unused
+    ARGB, //including alpha channel, BGRA byte order on little-endian machines
 };

 /*
@ -59,18 +60,14 @@ enum class ColorFormat //from high bits -> low bits, 8 bit per channel
   in the target image data if you are using multiple threads for processing each enlarged slice!

 THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
-               - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process 6 rows at least
+               - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
 */
-void scale(size_t factor, //valid range: 2 - 5
+void scale(size_t factor, //valid range: 2 - 6
           const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
           ColorFormat colFmt,
           const ScalerCfg& cfg = ScalerCfg(),
           int yFirst = 0, int yLast = std::numeric_limits<int>::max()); //slice of source image

-void init();
-
-void shutdown();
-
 void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
                          uint32_t* trg, int trgWidth, int trgHeight);