mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-23 05:19:56 +00:00
PPGe: Interpret invalid UTF-8 sequences better.
This matches PSP firmware behavior per tests.
This commit is contained in:
parent
7d085966f1
commit
5ef8762c32
@ -234,6 +234,32 @@ uint32_t u8_nextchar(const char *s, int *i)
|
||||
return ch;
|
||||
}
|
||||
|
||||
// Decodes one UTF-8 style sequence from s starting at byte offset *i WITHOUT validating it,
// advances *i past the bytes consumed, and returns the decoded value.
//
// The sequence length comes from the lead byte alone: below 0xC0 is a single byte (returned
// unchanged, so a stray continuation byte comes back as itself), 0xC0-0xDF takes one trailing
// byte, 0xE0-0xEF two, and 0xF0 or above three.  Trailing bytes are not checked for the 10xxxxxx
// pattern; their low 6 bits are simply appended.  This is the logic used on the PSP.
//
// The one place this stops short of "assume the bytes are there" is the NUL terminator: a
// truncated sequence at the end of the string must not walk off the buffer.  Missing trailing
// bytes contribute zero bits (exactly what reading the terminator itself yields) and *i is left
// pointing at the terminator so the caller's end-of-string check still works.
uint32_t u8_nextchar_unsafe(const char *s, int *i) {
	uint32_t ch = (unsigned char)s[(*i)++];
	int sz = 1;

	// Cascade: each test clears one length-marker bit, which drops the value into the range
	// of the next test.  A 0xF0+ lead therefore passes all three and ends with sz == 4.
	if (ch >= 0xF0) {
		sz++;
		ch &= ~0x10;
	}
	if (ch >= 0xE0) {
		sz++;
		ch &= ~0x20;
	}
	if (ch >= 0xC0) {
		sz++;
		ch &= ~0xC0;
	}

	for (int j = 1; j < sz; ++j) {
		ch <<= 6;
		const unsigned char trail = (unsigned char)s[*i];
		if (trail == 0) {
			// Truncated sequence: stay on the terminator rather than reading past it.
			continue;
		}
		ch += trail & 0x3F;
		++(*i);
	}

	return ch;
}
|
||||
|
||||
void u8_inc(const char *s, int *i)
|
||||
{
|
||||
(void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||
|
||||
@ -489,9 +515,10 @@ std::string SanitizeUTF8(const std::string &utf8string) {
|
||||
// Worst case.
|
||||
s.resize(utf8string.size() * 4);
|
||||
|
||||
// This stops at invalid start bytes.
|
||||
size_t pos = 0;
|
||||
while (!utf.end_or_overlong_end()) {
|
||||
int c = utf.next();
|
||||
while (!utf.end() && !utf.invalid()) {
|
||||
int c = utf.next_unsafe();
|
||||
pos += UTF8::encode(&s[pos], c);
|
||||
}
|
||||
s.resize(pos);
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <string>
|
||||
|
||||
uint32_t u8_nextchar(const char *s, int *i);
|
||||
uint32_t u8_nextchar_unsafe(const char *s, int *i);
|
||||
int u8_wc_toutf8(char *dest, uint32_t ch);
|
||||
int u8_strlen(const char *s);
|
||||
void u8_inc(const char *s, int *i);
|
||||
@ -31,10 +32,18 @@ public:
|
||||
// Wraps the string c with the read position at its first byte.
UTF8(const char *c)
	: c_(c), index_(0) {
}
|
||||
// Wraps the string c with the read position at byte offset index.
UTF8(const char *c, int index)
	: c_(c), index_(index) {
}
|
||||
// True when the byte at the current position is the NUL terminator.
bool end() const {
	return c_[index_] == '\0';
}
|
||||
// True when the next value, as decoded by peek(), is 0.
bool end_or_overlong_end() const {
	const uint32_t upcoming = peek();
	return upcoming == 0;
}
|
||||
// Returns true if the next character is outside BMP and Planes 1 - 16.
|
||||
bool invalid() const {
|
||||
unsigned char c = (unsigned char)c_[index_];
|
||||
return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
|
||||
}
|
||||
uint32_t next() {
|
||||
return u8_nextchar(c_, &index_);
|
||||
}
|
||||
// Allow invalid continuation bytes.
|
||||
uint32_t next_unsafe() {
|
||||
return u8_nextchar_unsafe(c_, &index_);
|
||||
}
|
||||
uint32_t peek() const {
|
||||
int tempIndex = index_;
|
||||
return u8_nextchar(c_, &tempIndex);
|
||||
|
@ -960,7 +960,7 @@ static std::string PPGeSanitizeText(const std::string &text) {
|
||||
// the overlong null, the rest of the string is missing in the bottom left corner (save size, etc).
|
||||
// It doesn't seem to be using sceCcc.
|
||||
// Note how the double "" is required in the middle of the string to end the \x80 constant (otherwise it takes E).
|
||||
// TODO: Potentially if the string is only ended by a C080, ReplaceAll might overshoot :(
|
||||
// This behavior doesn't replicate within other games, so it may be a game bug workaround.
|
||||
std::string str = ReplaceAll(text, "\xC0\x80""ENTR", "");
|
||||
// Then SanitizeUTF8 is needed to get rid of various other overlong encodings.
|
||||
return SanitizeUTF8(str);
|
||||
|
Loading…
Reference in New Issue
Block a user