mirror of
https://github.com/radareorg/radare2.git
synced 2024-11-30 16:40:57 +00:00
537 lines
11 KiB
C
537 lines
11 KiB
C
/* radare - LGPL - Copyright 2007-2022 - pancake */
|
|
|
|
#if 0
|
|
Very simple code parser in C
|
|
============================
|
|
|
|
Takes a string representing the code and runs a callback every time a token is found
|
|
|
|
r_str_tokenize ("string", callback, &userdata);
|
|
#endif
|
|
|
|
#include <r_util.h>
|
|
|
|
/* Short printable tag for each token type; end_token() indexes this table
 * with tok->type when dumping tokens. The list is NULL-terminated. */
static const char *tokentypes[] = {
	"none",
	"intn",
	"flot",
	"word",
	"hash",
	"strn",
	"cmnt",
	"math",
	"grup",
	"begin",
	"end",
	NULL
};
|
|
|
|
|
|
/* Creates a new RTokenizer through R_NEW0 and hands it to the caller.
 * The result is returned as is, so it may be NULL if the allocation failed. */
R_API RTokenizer *r_tokenizer_new(void) {
	return R_NEW0 (RTokenizer);
}
|
|
|
|
/* Tells whether `ch` can open a token: anything that is neither a newline
 * nor matched by IS_WHITESPACE. `tok` is not consulted. */
static bool is_token_begin(RTokenizer *tok, char ch) {
	if (ch == '\n') {
		return false;
	}
	return !IS_WHITESPACE (ch);
}
|
|
|
|
/* Reports the token currently described by `tok`.
 * When a callback is registered the token is forwarded to it and its result
 * is returned; otherwise the token is dumped to stderr as "[type]<pad>text"
 * followed by a newline, and true is returned. */
static bool end_token(RTokenizer *tok) {
	const char *type_name = tokentypes[tok->type];
	const char *text = tok->buf + tok->begin;
	const int text_len = (size_t)(tok->end - tok->begin);
	// four spaces of padding per nesting level
	const char *padding = r_str_pad (' ', tok->indent * 4);
	if (!tok->cb) {
		eprintf ("[%s]%s%.*s%c", type_name, padding, text_len, text, 10);
		return true;
	}
	return tok->cb (tok);
}
|
|
|
|
/* Classifies the token that starts at character `ch` and stores the result
 * in tok->type (and tok->ch / tok->indent where relevant).
 *
 * Returns true when `ch` is only an introducer that does not belong to the
 * token text (opening quote, '#'): the caller then skips it. Returns false
 * when `ch` is part of the token itself.
 *
 * A character that matches none of the rules (for instance '_') leaves
 * tok->type untouched. */
static bool start_token(RTokenizer *tok, char ch) {
	switch (ch) {
	case '\'':
	case '"':
		tok->type = R_TOKEN_STRING;
		tok->ch = ch; // remember which quote closes the string
		return true;
	case '/':
		// tentatively a comment; is_token_char() has the final word
		tok->type = R_TOKEN_COMMENT;
		break;
	case '(':
	case '{':
	case '[':
		tok->indent++;
		tok->type = R_TOKEN_GROUP;
		return false;
	case ')':
	case '}':
	case ']':
		tok->indent--;
		tok->type = R_TOKEN_GROUP;
		return false;
	case '#':
		tok->type = R_TOKEN_HASH;
		return true;
	case '<':
	case '>':
	case '=':
	case '+':
	case '-':
	case '*':
	case '?':
	case '|':
	case '&':
	case '%':
	case '^':
	case ':':
	case ';':
	case ',':
	case '.':
		tok->type = R_TOKEN_MATH;
		return false;
	default:
		break;
	}
	// <ctype.h> functions require a value representable as unsigned char;
	// passing a negative plain char (non-ASCII byte) is undefined behaviour.
	if (isalpha ((unsigned char)ch)) {
		tok->type = R_TOKEN_WORD;
	}
	if (ch >= '0' && ch <= '9') {
		tok->type = R_TOKEN_INT;
	}
	return false;
}
|
|
|
|
/* True when `ch` may appear inside a word or a '#' directive name:
 * letters, digits, '#' and '_'. The cast keeps the <ctype.h> call defined
 * for bytes above 0x7f on platforms where plain char is signed. */
static bool is_identifier_char(char ch) {
	return isalnum ((unsigned char)ch) || ch == '#' || ch == '_';
}

/* Decides whether `ch` (the next unread character) still belongs to the
 * token being scanned. May refine tok->type on the way (INT -> FLOAT,
 * COMMENT -> MATH) and updates tok->hex / tok->escape.
 *
 * Returns true to consume `ch` into the token, false to stop before it. */
static bool is_token_char(RTokenizer *tok, char ch) {
	switch (tok->type) {
	case R_TOKEN_BEGIN:
	case R_TOKEN_END:
		return false;
	case R_TOKEN_NONE:
		// ERROR
		return false;
	case R_TOKEN_HASH:
	case R_TOKEN_WORD:
		return is_identifier_char (ch);
	case R_TOKEN_COMMENT:
		if (tok->end == tok->begin) {
			// Still sitting on the leading '/'. The previous code compared
			// that '/' with itself, so a lone '/' (division) swallowed the
			// rest of the line. Peek at the following byte instead; the
			// buffer is NUL-terminated, so begin + 1 is always readable.
			// "/*" keeps being treated as a comment up to the end of line.
			const char next = tok->buf[tok->begin + 1];
			if (next != '/' && next != '*') {
				tok->type = R_TOKEN_MATH;
				return false;
			}
		}
		return ch != '\n';
	case R_TOKEN_INT:
		if (ch == 'x') {
			tok->hex = true;
			return true;
		}
		if (ch == '.') {
			tok->type = R_TOKEN_FLOAT;
			return true;
		}
		if (tok->hex) {
			if (ch >= 'a' && ch <= 'f') {
				return true;
			}
			if (ch >= 'A' && ch <= 'F') {
				return true;
			}
		}
		return ch >= '0' && ch <= '9';
	case R_TOKEN_FLOAT:
		return isdigit ((unsigned char)ch) || ch == 'f'; // XXX 'f' is the last char
	case R_TOKEN_STRING:
		if (tok->escape) {
			// the character after a backslash is always taken verbatim
			tok->escape = false;
		} else {
			if (ch == tok->ch) {
				// matching closing quote
				return false;
			}
			if (ch == '\\') {
				tok->escape = true;
			}
		}
		return true;
	case R_TOKEN_GROUP:
	case R_TOKEN_MATH:
		// those are one char tokens
		return false;
	}
	return false;
}
|
|
|
|
/* Splits the NUL-terminated string `buf` into tokens and reports each one.
 *
 * buf:  text to scan; a NULL pointer is ignored.
 * cb:   called once per token with the tokenizer state; when NULL every
 *       token is dumped to stderr instead.
 * user: opaque pointer stored in tok->user for the callback.
 *
 * A R_TOKEN_BEGIN pseudo token is reported before the first real token and
 * a R_TOKEN_END one after the last. The value returned by the callback is
 * not used to stop the scan. */
R_API void r_str_tokenize(const char *buf, RTokenizerCallback cb, void *user) {
	if (!buf) {
		return;
	}
	RTokenizer *tok = R_NEW0 (RTokenizer);
	if (!tok) {
		return;
	}
	tok->cb = cb;
	tok->user = user;
	tok->buf = buf;
	const size_t len = strlen (buf);
	size_t i = 0;

	tok->type = R_TOKEN_BEGIN;
	end_token (tok);

	while (i < len) {
		tok->hex = false;
		tok->type = R_TOKEN_NONE;
		// skip blanks and newlines between tokens
		while (i < len && !is_token_begin (tok, buf[i])) {
			i++;
		}
		if (i == len) {
			break;
		}
		tok->ch = buf[i];
		tok->begin = i;
		tok->end = i;
		if (start_token (tok, buf[i])) {
			// the introducer (quote, '#') is not part of the token text
			tok->begin++;
			i++;
		}
		while (i < len && is_token_char (tok, buf[i])) {
			i++;
			tok->end = i;
		}
		if (tok->type == R_TOKEN_GROUP) {
			// group tokens carry no text; consumers look at tok->ch
			tok->end = i;
			i++;
		} else if (tok->type == R_TOKEN_MATH) {
			// single character operator
			i++;
			tok->end = i;
		} else {
			tok->end = i;
		}
		if (tok->type == R_TOKEN_STRING) {
			// step over the closing quote
			i++;
		}
		if (tok->type != R_TOKEN_NONE) {
			end_token (tok);
		} else {
			// unclassified character: drop it and move on
			i++;
		}
	}
	tok->type = R_TOKEN_END;
	end_token (tok);
	// the tokenizer state was allocated above and was never released
	free (tok);
}
|
|
|
|
typedef struct {
|
|
char* word;
|
|
int parlevel;
|
|
bool inswitch;
|
|
bool incase;
|
|
bool inassign;
|
|
bool inreturn;
|
|
RList *args;
|
|
char *s;
|
|
PJ *pj;
|
|
} Data;
|
|
|
|
/* Prints the indentation for the current token to stderr: two spaces per
 * nesting level (group tokens use one level less), plus one extra space
 * while a "case" label is being parsed. */
static void indent(RTokenizer *tok) {
	Data *data = tok->user;
	int depth = tok->indent;
	if (tok->type == R_TOKEN_GROUP) {
		depth--;
	}
	int width = 2 * depth;
	if (data->incase) {
		width++;
	}
	eprintf ("%s", r_str_pad (' ', width));
}
|
|
|
|
/* Token callback that turns the token stream of C-like code into a rough
 * syntax tree, written to data->pj as JSON (some nodes are printed to
 * stderr instead when data->pj is NULL). tok->user must point to a Data.
 *
 * Returns false right after a "return" keyword and true otherwise; the
 * tokenizer in this file does not act on the returned value.
 *
 * Fixes over the previous revision:
 *  - '=' no longer passes a NULL data->s to strchr()
 *  - ',' releases data->s when there is no argument list to store it in
 *  - '(' releases a pending argument list before starting a new one
 *  - inreturn is cleared at the end of the statement, like inassign
 *  - leftover state is released when the end of input is reported */
bool callback(RTokenizer *tok) {
	Data *data = tok->user;
	const char *text = tok->buf + tok->begin;
	const int text_len = (int)(tok->end - tok->begin);

	switch (tok->type) {
	case R_TOKEN_NONE:
	case R_TOKEN_COMMENT:
		break;
	case R_TOKEN_HASH:
		{
			char *h = r_str_ndup (text, text_len);
			if (data->pj) {
				pj_ks (data->pj, "directive", h);
			} else {
				eprintf ("DIRECTIVE (%s)%c", h, 10);
			}
			free (h);
		}
		break;
	case R_TOKEN_WORD:
		free (data->word);
		data->word = r_str_ndup (text, text_len);
		if (data->incase) {
			// label of a "case": only remembered in data->word
			break;
		}
		if (!strcmp (data->word, "case")) {
			R_FREE (data->word);
			data->incase = true;
			break;
		}
		if (!strcmp (data->word, "default")) {
			break;
		}
		if (!strcmp (data->word, "return")) {
			if (data->pj) {
				pj_o (data->pj);
				pj_ks (data->pj, "node", "return");
			} else {
				indent (tok);
				eprintf ("RETURN%c",10);
			}
			R_FREE (data->s);
			data->inreturn = true;
			return false;
		}
		if (!strcmp (data->word, "break")) {
			break;
		}
		// any other word is added to the expression text, space separated
		if (data->s) {
			data->s = r_str_append (data->s, " ");
		}
		data->s = r_str_appendlen (data->s, text, text_len);
		break;
	case R_TOKEN_STRING:
		if (data->s) {
			data->s = r_str_append (data->s, " ");
		}
		data->s = r_str_appendlen (data->s, text, text_len);
		break;
	case R_TOKEN_GROUP:
		if (data->inassign) {
			break;
		}
		if (tok->ch == '}') {
			R_FREE (data->s);
			pj_end (data->pj);
			pj_end (data->pj);
		} else if (tok->ch == ')') {
			data->parlevel--;
			data->s = r_str_appendlen (data->s, text, text_len);
			if (data->args) {
				// the list takes ownership of the last argument
				r_list_append (data->args, data->s);
				data->s = NULL;
				char *arg;
				RListIter *iter;
				r_list_foreach (data->args, iter, arg) {
					if (!arg) {
						continue;
					}
					eprintf ("%s", r_str_pad (' ', (tok->indent + 1) * 2));
					eprintf (" - %s%c", arg, 10);
					if (!data->pj) {
						continue;
					}
					// "type name": split at the last space
					char *lz = (char *)r_str_rchr (arg, NULL, ' ');
					if (lz) {
						*lz++ = 0;
						pj_o (data->pj);
						pj_ks (data->pj, "name", lz);
						pj_ks (data->pj, "type", arg);
						pj_end (data->pj);
					} else {
						pj_s (data->pj, arg);
						pj_s (data->pj, arg);
					}
				}
				r_list_free (data->args);
				data->args = NULL;
				if (data->pj) {
					pj_end (data->pj);
				}
			}
		} else if (tok->ch == '{') {
			if (data->word) {
				if (!strcmp (data->word, "else")) {
					indent (tok);
					eprintf ("ELSE %d%c", tok->indent, 10);
					r_list_free (data->args);
					data->args = NULL;
					R_FREE (data->s);
				}
			} else {
				R_FREE (data->s);
			}
			pj_ka (data->pj, "body");
		} else if (tok->ch == '(') {
			data->parlevel++;
			if (data->word) {
				if (!strcmp (data->word, "if")) {
					indent (tok);
					eprintf ("IF %d%c", tok->indent, 10);
				} else if (!strcmp (data->word, "switch")) {
					data->inswitch = true;
					indent (tok);
					eprintf ("SWITCH%c", 10);
					R_FREE (data->word);
				} else if (tok->indent == 1) {
					// "word (" at the outermost level
					if (data->pj) {
						pj_ko (data->pj, data->word);
						pj_ks (data->pj, "type", "symbol");
						pj_ks (data->pj, "name", data->word);
						pj_ka (data->pj, "args");
					} else {
						eprintf ("FUNC (%s)%c", data->word, 10);
					}
				} else {
					// "word (" inside a group
					if (data->pj) {
						pj_o (data->pj);
						pj_ks (data->pj, "type", "call");
						pj_ks (data->pj, "name", data->word);
						pj_ka (data->pj, "args");
					} else {
						indent (tok);
						eprintf ("CALL (%s)%c", data->word, 10);
					}
				}
			}
			R_FREE (data->s);
			if (data->word) {
				// a list may still be pending (nested parentheses); release
				// it instead of overwriting the only reference to it
				r_list_free (data->args);
				data->args = r_list_newf (free);
			}
			R_FREE (data->word);
		}
		break;
	case R_TOKEN_INT:
	case R_TOKEN_FLOAT:
		if (data->incase || data->inassign) {
			R_FREE (data->word);
			data->s = r_str_appendlen (data->s, text, text_len);
			data->incase = false;
			// data->inassign = false;
		} else if (!data->s) {
			data->s = r_str_ndup (text, text_len);
		}
		break;
	case R_TOKEN_MATH:
		if (data->incase) {
			data->incase = false;
			R_FREE (data->word);
		}
		switch (tok->ch) {
		case '=':
			if (data->parlevel == 0) {
				indent (tok);
				data->inassign = true;
				if (data->word && data->pj) {
					pj_o (data->pj);
					pj_ks (data->pj, "node", "assign");
					pj_ks (data->pj, "var", data->word);
					// data->s can be NULL here (e.g. "default = 1"), and
					// strchr() must not be given a NULL string
					const bool has_angle = data->s
						&& (strchr (data->s, '>') || strchr (data->s, '<'));
					if (!has_angle) {
						//pj_ks (data->pj, "type", data->s);
						R_FREE (data->s);
					}
				}
				R_FREE (data->word);
			}
			break;
		case ':':
			if (data->word) {
				break;
			}
			/* fallthrough */
		case '\n':
		case ';':
			if (data->inreturn) {
				indent (tok);
				// the "return" node is closed here; without clearing the
				// flag every later ';' would close one more JSON level
				data->inreturn = false;
				if (data->pj) {
					pj_ks (data->pj, "value", data->s);
					pj_end (data->pj);
				}
			}
			if (data->inassign) {
				indent (tok);
				data->inassign = false;
				if (data->pj) {
					pj_ks (data->pj, "value", data->s);
					pj_end (data->pj);
				}
			}
			R_FREE (data->word);
			R_FREE (data->s);
			break;
		case '*':
		case '+':
		case '-':
		case '%':
		case '&':
		case '|':
		case '<':
		case '>':
			data->s = r_str_appendlen (data->s, text, text_len);
			break;
		case ',':
			if (data->s) {
				if (data->args) {
					// ownership moves to the list
					r_list_append (data->args, data->s);
					data->s = NULL;
				} else {
					// nobody takes the string: release it
					R_FREE (data->s);
				}
			}
			R_FREE (data->word);
			break;
		default:
			R_FREE (data->word);
			R_FREE (data->s);
			break;
		}
		break;
	case R_TOKEN_BEGIN:
		break;
	case R_TOKEN_END:
		// free the data
		R_FREE (data->word);
		R_FREE (data->s);
		r_list_free (data->args);
		data->args = NULL;
		break;
	}
	return true;
}
|
|
|
|
R_API char *r_str_tokenize_json(const char *buf) {
|
|
Data data = {0};
|
|
data.pj = pj_new ();
|
|
pj_o (data.pj);
|
|
r_str_tokenize (buf, (RTokenizerCallback)callback, &data);
|
|
pj_end (data.pj);
|
|
data.pj->level = 0; // force level 0 to permit invalid jsons for now
|
|
char *o = pj_drain (data.pj);
|
|
char *p = r_str_newf ("%s\n", o);
|
|
free (o);
|
|
return p;
|
|
}
|
|
|
|
#if 0
// Usage example, kept out of the build. It calls r_str_tokenize(), the entry
// point defined in this file, instead of the former tokenize() name.
int main() {
	r_str_tokenize ("Hello World", NULL, NULL);
	r_str_tokenize ("hello('this', 33, true);", NULL, NULL);
	r_str_tokenize (
		" // hello world this is very new\n"
		" int main(int argc, char **argv) {\n"
		" printf (\"Hello %s\", \"world\");}\n"
		" }\n"
		, NULL, NULL
	);
	Data data = {0};
	char *s = r_file_slurp ("a.c", NULL);
	if (!s) {
		return 1;
	}
	data.pj = pj_new ();
	pj_o (data.pj);
	r_str_tokenize (s, (RTokenizerCallback)callback, &data);
	pj_end (data.pj);
	char *o = pj_drain (data.pj);
	printf ("%s%c", o, 10);
	free (o);
	free (s);
	return 0;
}
#endif
|