Merge pull request #16533 from hrydgard/texture-decode-overrun-fix

OpenGL: Fix case in tex decoder where we could write off the end of a buffer
2024-11-23 21:39:52 +00:00 · 2022-12-09 16:32:57 -08:00 · 2022-12-09 16:32:57 -08:00 · 24b62465b7
commit 24b62465b7
parent 03433c42ce 37b0c90a2d
5 changed files with 58 additions and 34 deletions
--- a/Common/UI/UIScreen.h
+++ b/Common/UI/UIScreen.h
@ -282,7 +282,7 @@ public:
 	UI::Event OnChoice;

 protected:
-	bool HasTitleBar() const { return false; }
+	bool HasTitleBar() const override { return false; }

 private:
 	const ContextMenuItem *items_;
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@ -1556,15 +1556,16 @@ static CheckAlphaResult DecodeDXTBlocks(uint8_t *out, int outPitch, uint32_t tex
 		u32 blockIndex = (y / 4) * (bufw / 4);
 		int blockHeight = std::min(h - y, 4);
 		for (int x = 0; x < minw; x += 4) {
+			int blockWidth = std::min(minw - x, 4);
 			switch (n) {
 			case 1:
-				DecodeDXT1Block(dst + outPitch32 * y + x, (const DXT1Block *)src + blockIndex, outPitch32, blockHeight, &alphaSum);
+				DecodeDXT1Block(dst + outPitch32 * y + x, (const DXT1Block *)src + blockIndex, outPitch32, blockWidth, blockHeight, &alphaSum);
 				break;
 			case 3:
-				DecodeDXT3Block(dst + outPitch32 * y + x, (const DXT3Block *)src + blockIndex, outPitch32, blockHeight);
+				DecodeDXT3Block(dst + outPitch32 * y + x, (const DXT3Block *)src + blockIndex, outPitch32, blockWidth, blockHeight);
 				break;
 			case 5:
-				DecodeDXT5Block(dst + outPitch32 * y + x, (const DXT5Block *)src + blockIndex, outPitch32, blockHeight);
+				DecodeDXT5Block(dst + outPitch32 * y + x, (const DXT5Block *)src + blockIndex, outPitch32, blockWidth, blockHeight);
 				break;
 			}
 			blockIndex++;
@ -1673,7 +1674,9 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G
 		case GE_CMODE_16BIT_ABGR5551:
 		case GE_CMODE_16BIT_ABGR4444:
 		{
-			if (clutAlphaLinear_ && mipmapShareClut && !expandTo32bit) {
+			// The w >= 4 check keeps very narrow textures (such as a single pixel wide)
+			// away from the DeIndexTexture4Optimal<u16> path.
+			if (clutAlphaLinear_ && mipmapShareClut && !expandTo32bit && w >= 4) {
 				// We don't bother with fullalpha here (clutAlphaLinear_)
 				// Here, reverseColors means the CLUT is already reversed.
 				if (reverseColors) {
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@ -421,9 +421,9 @@ class DXTDecoder {
 public:
 	inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);
 	inline void DecodeAlphaDXT5(const DXT5Block *src);
-	inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height);
-	inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height);
-	inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height);
+	inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height);
+	inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height);
+	inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height);

 	bool AnyNonFullAlpha() const { return anyNonFullAlpha_; }

@ -507,11 +507,11 @@ void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {
 	}
 }

-void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height) {
+void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height) {
 	bool anyColor3 = false;
 	for (int y = 0; y < height; y++) {
 		int colordata = src->lines[y];
-		for (int x = 0; x < 4; x++) {
+		for (int x = 0; x < width; x++) {
 			int col = colordata & 3;
 			if (col == 3) {
 				anyColor3 = true;
@ -527,11 +527,11 @@ void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int
 	}
 }

-void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height) {
+void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {
 	for (int y = 0; y < height; y++) {
 		int colordata = src->color.lines[y];
 		u32 alphadata = src->alphaLines[y];
-		for (int x = 0; x < 4; x++) {
+		for (int x = 0; x < width; x++) {
 			dst[x] = colors_[colordata & 3] | (alphadata << 28);
 			colordata >>= 2;
 			alphadata >>= 4;
@ -540,13 +540,13 @@ void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int
 	}
 }

-void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height) {
+void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {
 	// 48 bits, 3 bit index per pixel, 12 bits per line.
 	u64 alphadata = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;

 	for (int y = 0; y < height; y++) {
 		int colordata = src->color.lines[y];
-		for (int x = 0; x < 4; x++) {
+		for (int x = 0; x < width; x++) {
 			dst[x] = colors_[colordata & 3] | (alpha_[alphadata & 7] << 24);
 			colordata >>= 2;
 			alphadata >>= 3;
@ -619,24 +619,24 @@ uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {
 }

 // This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
-void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, u32 *alpha) {
+void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int width, int height, u32 *alpha) {
 	DXTDecoder dxt;
 	dxt.DecodeColors(src, false);
-	dxt.WriteColorsDXT1(dst, src, pitch, height);
+	dxt.WriteColorsDXT1(dst, src, pitch, width, height);
 	*alpha &= dxt.AnyNonFullAlpha() ? 0 : 1;
 }

-void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height) {
+void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int width,  int height) {
 	DXTDecoder dxt;
 	dxt.DecodeColors(&src->color, true);
-	dxt.WriteColorsDXT3(dst, src, pitch, height);
+	dxt.WriteColorsDXT3(dst, src, pitch, width, height);
 }

-void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height) {
+void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {
 	DXTDecoder dxt;
 	dxt.DecodeColors(&src->color, true);
 	dxt.DecodeAlphaDXT5(src);
-	dxt.WriteColorsDXT5(dst, src, pitch, height);
+	dxt.WriteColorsDXT5(dst, src, pitch, width, height);
 }

 #ifdef _M_SSE
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@ -65,9 +65,9 @@ struct DXT5Block {
 	u8 alpha1; u8 alpha2;
 };

-void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, u32 *alpha);
-void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height);
-void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height);
+void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int width, int height, u32 *alpha);
+void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int width, int height);
+void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int width, int height);

 uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y);
 uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y);
@ -163,22 +163,36 @@ inline void DeIndexTexture4(/*WRITEONLY*/ ClutT *dest, const u8 *indexed, int le

 	ClutT alphaSum = (ClutT)(-1);
 	if (nakedIndex) {
-		for (int i = 0; i < length; i += 2) {
+		while (length >= 2) {
 			u8 index = *indexed++;
 			ClutT color0 = clut[index & 0xf];
 			ClutT color1 = clut[index >> 4];
-			dest[i + 0] = color0;
-			dest[i + 1] = color1;
+			*dest++ = color0;
+			*dest++ = color1;
 			alphaSum &= color0 & color1;
+			length -= 2;
+		}
+		if (length) {  // Last pixel. Can really only happen in 1xY textures, but making this work generically.
+			u8 index = *indexed++;
+			ClutT color0 = clut[index & 0xf];
+			*dest = color0;
+			alphaSum &= color0;
 		}
 	} else {
-		for (int i = 0; i < length; i += 2) {
+		while (length >= 2) {
 			u8 index = *indexed++;
 			ClutT color0 = clut[gstate.transformClutIndex((index >> 0) & 0xf)];
 			ClutT color1 = clut[gstate.transformClutIndex((index >> 4) & 0xf)];
-			dest[i + 0] = color0;
-			dest[i + 1] = color1;
+			*dest++ = color0;
+			*dest++ = color1;
 			alphaSum &= color0 & color1;
+			length -= 2;
+		}
+		if (length) {
+			u8 index = *indexed++;
+			ClutT color0 = clut[gstate.transformClutIndex((index >> 0) & 0xf)];
+			*dest = color0;
+			alphaSum &= color0;
 		}
 	}

@ -187,10 +201,15 @@ inline void DeIndexTexture4(/*WRITEONLY*/ ClutT *dest, const u8 *indexed, int le

 template <typename ClutT>
 inline void DeIndexTexture4Optimal(ClutT *dest, const u8 *indexed, int length, ClutT color) {
-	for (int i = 0; i < length; i += 2) {
+	while (length >= 2) {
 		u8 index = *indexed++;
-		dest[i + 0] = color | ((index >> 0) & 0xf);
-		dest[i + 1] = color | ((index >> 4) & 0xf);
+		*dest++ = color | ((index >> 0) & 0xf);
+		*dest++ = color | ((index >> 4) & 0xf);
+		length -= 2;
+	}
+	if (length) {
+		u8 index = *indexed++;
+		*dest++ = color | ((index >> 0) & 0xf);
 	}
 }

--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@ -898,7 +898,8 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 	vecOut[1] = vectorGetByIndex<1>(sum);
 	vecOut[2] = vectorGetByIndex<2>(sum);
 #elif PPSSPP_ARCH(ARM64_NEON)
-	float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(v), m);
+	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
+	float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
 	vecOut[0] = vgetq_lane_f32(sum, 0);
 	vecOut[1] = vgetq_lane_f32(sum, 1);
 	vecOut[2] = vgetq_lane_f32(sum, 2);
@ -957,7 +958,8 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 	__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
 	_mm_storeu_ps(vecOut, sum);
 #elif PPSSPP_ARCH(ARM64_NEON)
-	float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(v), m);
+	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
+	float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
 	vst1q_f32(vecOut, sum);
 #else
 	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];