Support UTF32-BE decoding (#15472) ##bin

2025-01-26 07:44:29 +00:00 · 2019-11-16 18:27:07 +08:00 · 2019-11-16 18:27:07 +08:00 · c4e80f8c6d
commit c4e80f8c6d
parent ac84c4ee2f
7 changed files with 54 additions and 25 deletions
--- a/libr/core/cconfig.c
+++ b/libr/core/cconfig.c
@ -3122,7 +3122,7 @@ R_API int r_core_config_init(RCore *core) {
 	SETICB ("bin.maxstrbuf", 1024*1024*10, & cb_binmaxstrbuf, "Maximum size of range to load strings from");
 	n = NODECB ("bin.str.enc", "guess", &cb_binstrenc);
 	SETDESC (n, "Default string encoding of binary");
-	SETOPTIONS (n, "latin1", "utf8", "utf16le", "utf32le", "utf16be", "guess", NULL);
+	SETOPTIONS (n, "latin1", "utf8", "utf16le", "utf32le", "utf16be", "utf32be", "guess", NULL);
 	SETCB ("bin.prefix", NULL, &cb_binprefix, "Prefix all symbols/sections/relocs with a specific string");
 	SETCB ("bin.rawstr", "false", &cb_rawstr, "Load strings from raw binaries");
 	SETCB ("bin.strings", "true", &cb_binstrings, "Load strings from rbin on startup");
--- a/libr/core/disasm.c
+++ b/libr/core/disasm.c
@ -746,6 +746,8 @@ static RDisasmState * ds_init(RCore *core) {
 		ds->strenc = R_STRING_ENC_UTF32LE;
 	} else if (!strcmp (strenc_str, "utf16be")) {
 		ds->strenc = R_STRING_ENC_UTF16BE;
+	} else if (!strcmp (strenc_str, "utf32be")) {
+		ds->strenc = R_STRING_ENC_UTF32BE;
 	} else {
 		ds->strenc = R_STRING_ENC_GUESS;
 	}
@ -3720,6 +3722,10 @@ static char *ds_esc_str(RDisasmState *ds, const char *str, int len, const char *
 		escstr = r_str_escape_utf16be (str, len, ds->show_asciidot, esc_bslash);
 		prefix = "ub";
 		break;
+	case R_STRING_ENC_UTF32BE:
+		escstr = r_str_escape_utf32be (str, len, ds->show_asciidot, esc_bslash);
+		prefix = "Ub";
+		break;
 	default:
 		str_len = strlen (str);
 		if ((str_len == 1 && len > 3 && str[2] && !str[3])
--- a/libr/include/r_util/r_str.h
+++ b/libr/include/r_util/r_str.h
@ -15,6 +15,7 @@ typedef enum {
 	R_STRING_ENC_UTF16LE = 'u',
 	R_STRING_ENC_UTF32LE = 'U',
 	R_STRING_ENC_UTF16BE = 'b',
+	R_STRING_ENC_UTF32BE = 'B',
 	R_STRING_ENC_GUESS = 'g',
 } RStrEnc;

@ -155,6 +156,7 @@ R_API char *r_str_escape_utf8(const char *buf, bool show_asciidot, bool esc_bsla
 R_API char *r_str_escape_utf16le(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
 R_API char *r_str_escape_utf32le(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
 R_API char *r_str_escape_utf16be(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
+R_API char *r_str_escape_utf32be(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
 R_API void r_str_byte_escape(const char *p, char **dst, int dot_nl, bool default_dot, bool esc_bslash);
 R_API void r_str_uri_decode(char *buf);
 R_API char *r_str_uri_encode(const char *buf);
--- a/libr/include/r_util/r_utf32.h
+++ b/libr/include/r_util/r_utf32.h
@ -4,6 +4,8 @@
 /* For RRune definition */
 #include "r_utf8.h"

+R_API int r_utf32_decode(const ut8 *ptr, int ptrlen, RRune *ch, bool bigendian);
+R_API int r_utf32be_decode(const ut8 *ptr, int ptrlen, RRune *ch); /* big-endian wrapper defined in libr/util/utf32.c */
 R_API int r_utf32le_decode(const ut8 *ptr, int ptrlen, RRune *ch);

 #endif //  R_UTF32_H
--- a/libr/util/str.c
+++ b/libr/util/str.c
@ -1349,6 +1349,7 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
 	case R_STRING_ENC_UTF16LE:
 	case R_STRING_ENC_UTF16BE:
 	case R_STRING_ENC_UTF32LE:
+	case R_STRING_ENC_UTF32BE:
 		if (buf_size < 0) {
 			return NULL;
 		}
@ -1378,10 +1379,11 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
 		case R_STRING_ENC_UTF16LE:
 		case R_STRING_ENC_UTF16BE:
 		case R_STRING_ENC_UTF32LE:
+		case R_STRING_ENC_UTF32BE:
 			if (enc == R_STRING_ENC_UTF16LE || enc == R_STRING_ENC_UTF16BE) {
 				ch_bytes = r_utf16_decode ((ut8 *)p, end - p, &ch, enc == R_STRING_ENC_UTF16BE);
 			} else {
-				ch_bytes = r_utf32le_decode ((ut8 *)p, end - p, &ch);
+				ch_bytes = r_utf32_decode ((ut8 *)p, end - p, &ch, enc == R_STRING_ENC_UTF32BE);
 			}
 			if (ch_bytes == 0) {
 				p++;
@ -1404,7 +1406,8 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
 				*q++ = "0123456789abcdef"[ch >> 4 * i & 0xf];
 			}
 		} else {
-			r_str_byte_escape (p + (enc == R_STRING_ENC_UTF16BE), &q, false, false, esc_bslash);
+			int offset = enc == R_STRING_ENC_UTF16BE ? 1 : enc == R_STRING_ENC_UTF32BE ? 3 : 0;
+			r_str_byte_escape (p + offset, &q, false, false, esc_bslash);
 		}
 		switch (enc) {
 		case R_STRING_ENC_UTF16LE:
@ -1412,6 +1415,7 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
 			p += ch_bytes < 2 ? 2 : ch_bytes;
 			break;
 		case R_STRING_ENC_UTF32LE:
+		case R_STRING_ENC_UTF32BE:
 			p += 4;
 			break;
 		default:
@ -1438,6 +1442,10 @@ R_API char *r_str_escape_utf16be(const char *buf, int buf_size, bool show_asciid
 	return r_str_escape_utf (buf, buf_size, R_STRING_ENC_UTF16BE, show_asciidot, esc_bslash);
 }

+R_API char *r_str_escape_utf32be(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash) {
+	return r_str_escape_utf (buf, buf_size, R_STRING_ENC_UTF32BE, show_asciidot, esc_bslash); // forwards all arguments to the shared escaper, selecting the UTF-32BE encoding
+}
+
 // JSON has special escaping requirements
 // TODO: merge with r_str_escape_utf() and r_str_byte_escape() using RStrEsc
 R_API char *r_str_escape_utf8_for_json(const char *buf, int buf_size) {
--- a/libr/util/utf32.c
+++ b/libr/util/utf32.c
@ -3,34 +3,43 @@
 #include <r_types.h>
 #include <r_util.h>

-/* Convert an UTF-32LE buf into a unicode RRune */
-R_API int r_utf32le_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
+/* Decode 4 bytes at ptr as one UTF-32 value (big-endian if bigendian is true, little-endian otherwise) and store it in *ch when ch is non-NULL. Returns 4 if either of the two most significant bytes is non-zero, 2 if the second least significant byte is non-zero, 1 otherwise, and 0 when ptrlen < 4. */
+R_API int r_utf32_decode(const ut8 *ptr, int ptrlen, RRune *ch, bool bigendian) {
 	if (ptrlen < 1) {
 		return 0;
 	}
+	int low = 0; // index of the least significant byte
+	int high = 3; // index of the most significant byte
+	if (bigendian) {
+		low = 3;
+		high = 0;
+	}
 	if (ptrlen > 3) {
-		if (ptr[3]) {
-			if (ch) {
-				*ch = (ut32)ptr[3] << 24 | (ut32)ptr[2] << 16 | (ut32)ptr[1] << 8 | ptr[0];
-			}
-			return 4;
-		}
-		if (ptr[2]) {
-			if (ch) {
-				*ch = (ut32)ptr[2] << 16 | (ut32)ptr[1] << 8 | ptr[0];
-			}
-			return 4;
-		}
-		if (ptr[1]) {
-			if (ch) {
-				*ch = (ut32)ptr[1] << 8 | ptr[0];
-			}
-			return 2;
-		}
+		int sign = bigendian ? -1 : 1; // index step that moves from low towards high
 		if (ch) {
-			*ch = (ut32)ptr[0];
+			int i;
+			*ch = (ut32)ptr[low];
+			for (i = 1; i < 4; i++) {
+				*ch |= (ut32)ptr[3 - high + i * sign] << 8 * i; // 3 - high equals low, so this walks from low towards high
+			}
+		}
+		if (ptr[high] || ptr[high - 1 * sign]) { // one of the two most significant bytes is set
+			return 4;
+		}
+		if (ptr[low + 1 * sign]) { // second least significant byte is set
+			return 2;
 		}
 		return 1;
 	}
 	return 0;
 }
+
+/* Decode a UTF-32LE buffer into a unicode RRune: calls r_utf32_decode with bigendian=false and returns its result unchanged */
+R_API int r_utf32le_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
+	return r_utf32_decode (ptr, ptrlen, ch, false);
+}
+
+/* Decode a UTF-32BE buffer into a unicode RRune: calls r_utf32_decode with bigendian=true and returns its result unchanged */
+R_API int r_utf32be_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
+	return r_utf32_decode (ptr, ptrlen, ch, true);
+}
--- a/libr/util/utf8.c
+++ b/libr/util/utf8.c
@ -791,7 +791,9 @@ R_API RStrEnc r_utf_bom_encoding(const ut8 *ptr, int ptrlen) {
 		if (ptr[0] == 0xff && ptr[1] == 0xfe && !ptr[2] && !ptr[3]) {
 			return R_STRING_ENC_UTF32LE;
 		}
-		/* TODO: R_STRING_ENC_UTF32BE */
+		if (!ptr[0] && !ptr[1] && ptr[2] == 0xfe && ptr[3] == 0xff) {
+			return R_STRING_ENC_UTF32BE;
+		}
 	}
 	if (ptrlen > 2) {
 		if (ptr[0] == 0xef && ptr[1] == 0xbb && ptr[2] == 0xbf) {