PPGe: Interpret invalid UTF-8 sequences better.

This matches PSP firmware behavior per tests.
This commit is contained in:
Unknown W. Brackets 2021-03-28 14:16:17 -07:00
parent 7d085966f1
commit 5ef8762c32
3 changed files with 40 additions and 4 deletions

View File

@ -234,6 +234,32 @@ uint32_t u8_nextchar(const char *s, int *i)
return ch;
}
// Decodes one UTF-8 sequence starting at s[*i] without validating it, and
// advances *i past every byte that was consumed.
//
// Only the lead byte decides the sequence length: 0xC0 and up means 2 bytes,
// 0xE0 and up means 3 bytes, 0xF0 and up means 4 bytes, and anything lower is
// returned unchanged as a single byte.  Trailing bytes are read
// unconditionally and only their low 6 bits are kept, so malformed
// continuation bytes are folded into the result rather than rejected.
// This is the logic used on the PSP.
//
// NOTE: no terminator check is made while reading trailing bytes, so the
// caller must ensure the bytes implied by the lead byte are readable.
uint32_t u8_nextchar_unsafe(const char *s, int *i) {
	const unsigned char lead = (unsigned char)s[(*i)++];

	// Choose the trailing byte count and strip the length marker bits.
	uint32_t value;
	int trailing;
	if (lead >= 0xF0) {
		trailing = 3;
		value = lead & 0x0F;
	} else if (lead >= 0xE0) {
		trailing = 2;
		value = lead & 0x1F;
	} else if (lead >= 0xC0) {
		trailing = 1;
		value = lead & 0x3F;
	} else {
		trailing = 0;
		value = lead;
	}

	// Just assume the bytes must be there, appending 6 payload bits each.
	while (trailing-- > 0) {
		const uint32_t low6 = (unsigned char)s[(*i)++] & 0x3F;
		value = (value << 6) + low6;
	}
	return value;
}
void u8_inc(const char *s, int *i)
{
(void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||
@ -489,9 +515,10 @@ std::string SanitizeUTF8(const std::string &utf8string) {
// Worst case.
s.resize(utf8string.size() * 4);
// This stops at invalid start bytes.
size_t pos = 0;
while (!utf.end_or_overlong_end()) {
int c = utf.next();
while (!utf.end() && !utf.invalid()) {
int c = utf.next_unsafe();
pos += UTF8::encode(&s[pos], c);
}
s.resize(pos);

View File

@ -20,6 +20,7 @@
#include <string>
uint32_t u8_nextchar(const char *s, int *i);
uint32_t u8_nextchar_unsafe(const char *s, int *i);
int u8_wc_toutf8(char *dest, uint32_t ch);
int u8_strlen(const char *s);
void u8_inc(const char *s, int *i);
@ -31,10 +32,18 @@ public:
// Begins iteration at the first byte of the string c.
UTF8(const char *c) : UTF8(c, 0) {}
// Begins iteration at byte offset index within the string c.
UTF8(const char *c, int index)
	: c_(c),
	  index_(index) {
}
// True when the byte at the current position is the NUL terminator.
bool end() const {
	return c_[index_] == '\0';
}
// True when the character that peek() decodes at the current position is 0.
bool end_or_overlong_end() const {
	const uint32_t upcoming = peek();
	return upcoming == 0;
}
// Returns true if the next character is outside BMP and Planes 1 - 16.
bool invalid() const {
unsigned char c = (unsigned char)c_[index_];
return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
}
// Decodes the character at the current position with u8_nextchar, handing it
// the position by pointer so the decoder can move it forward.
uint32_t next() {
	const uint32_t decoded = u8_nextchar(c_, &index_);
	return decoded;
}
// Like next(), but decodes with u8_nextchar_unsafe, which allows invalid
// continuation bytes instead of rejecting them.
uint32_t next_unsafe() {
	const uint32_t decoded = u8_nextchar_unsafe(c_, &index_);
	return decoded;
}
uint32_t peek() const {
int tempIndex = index_;
return u8_nextchar(c_, &tempIndex);

View File

@ -960,7 +960,7 @@ static std::string PPGeSanitizeText(const std::string &text) {
// the overlong null, the rest of the string is missing in the bottom left corner (save size, etc).
// It doesn't seem to be using sceCcc.
// Note how the double "" is required in the middle of the string to end the \x80 constant (otherwise it takes E).
// TODO: Potentially if the string is only ended by a C080, ReplaceAll might overshoot :(
// This behavior doesn't replicate within other games, so it may be a game bug workaround.
std::string str = ReplaceAll(text, "\xC0\x80""ENTR", "");
// Then SanitizeUTF8 is needed to get rid of various other overlong encodings.
return SanitizeUTF8(str);