PPGe: Interpret invalid UTF-8 sequences better.

This matches PSP firmware behavior per tests.
2024-11-23 05:19:56 +00:00 · 2021-03-28 14:16:17 -07:00 · 2021-03-28 14:16:17 -07:00 · 5ef8762c32
commit 5ef8762c32
parent 7d085966f1
3 changed files with 40 additions and 4 deletions
--- a/Common/Data/Encoding/Utf8.cpp
+++ b/Common/Data/Encoding/Utf8.cpp
@ -234,6 +234,32 @@ uint32_t u8_nextchar(const char *s, int *i)
  return ch;
 }

+uint32_t u8_nextchar_unsafe(const char *s, int *i) {
+	uint32_t ch = (unsigned char)s[(*i)++];
+	int sz = 1;
+
+	if (ch >= 0xF0) {
+		sz++;
+		ch &= ~0x10;
+	}
+	if (ch >= 0xE0) {
+		sz++;
+		ch &= ~0x20;
+	}
+	if (ch >= 0xC0) {
+		sz++;
+		ch &= ~0xC0;
+	}
+
+	// Just assume the bytes must be there.  This is the logic used on the PSP.
+	for (int j = 1; j < sz; ++j) {
+		ch <<= 6;
+		ch += ((unsigned char)s[(*i)++]) & 0x3F;
+	}
+
+	return ch;
+}
+
 void u8_inc(const char *s, int *i)
 {
  (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||
@ -489,9 +515,10 @@ std::string SanitizeUTF8(const std::string &utf8string) {
 	// Worst case.
 	s.resize(utf8string.size() * 4);

+	// This stops at invalid start bytes.
 	size_t pos = 0;
-	while (!utf.end_or_overlong_end()) {
-		int c = utf.next();
+	while (!utf.end() && !utf.invalid()) {
+		int c = utf.next_unsafe();
 		pos += UTF8::encode(&s[pos], c);
 	}
 	s.resize(pos);
--- a/Common/Data/Encoding/Utf8.h
+++ b/Common/Data/Encoding/Utf8.h
@ -20,6 +20,7 @@
 #include <string>

 uint32_t u8_nextchar(const char *s, int *i);
+uint32_t u8_nextchar_unsafe(const char *s, int *i);
 int u8_wc_toutf8(char *dest, uint32_t ch);
 int u8_strlen(const char *s);
 void u8_inc(const char *s, int *i);
@ -31,10 +32,18 @@ public:
 	UTF8(const char *c) : c_(c), index_(0) {}
 	UTF8(const char *c, int index) : c_(c), index_(index) {}
 	bool end() const { return c_[index_] == 0; }
-	bool end_or_overlong_end() const { return peek() == 0; }
+	// Returns true if the next character is outside BMP and Planes 1 - 16.
+	bool invalid() const {
+		unsigned char c = (unsigned char)c_[index_];
+		return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
+	}
 	uint32_t next() {
 		return u8_nextchar(c_, &index_);
 	}
+	// Allow invalid continuation bytes.
+	uint32_t next_unsafe() {
+		return u8_nextchar_unsafe(c_, &index_);
+	}
 	uint32_t peek() const {
 		int tempIndex = index_;
 		return u8_nextchar(c_, &tempIndex);
--- a/Core/Util/PPGeDraw.cpp
+++ b/Core/Util/PPGeDraw.cpp
@ -960,7 +960,7 @@ static std::string PPGeSanitizeText(const std::string &text) {
 	// the overlong null, the rest of the string is missing in the bottom left corner (save size, etc).
 	// It doesn't seem to be using sceCcc.
 	// Note how the double "" is required in the middle of the string to end the \x80 constant (otherwise it takes E).
-	// TODO: Potentially if the string is only ended by a C080, ReplaceAll might overshoot :(
+	// This behavior doesn't replicate within other games, so it may be a game bug workaround.
 	std::string str = ReplaceAll(text, "\xC0\x80""ENTR", "");
 	// Then SanitizeUTF8 is needed to get rid of various other overlong encodings.
 	return SanitizeUTF8(str);