mirror of
https://github.com/radareorg/radare2.git
synced 2024-12-09 05:54:57 +00:00
306 lines
7.4 KiB
Plaintext
306 lines
7.4 KiB
Plaintext
x86 opcodes: http://en.wikipedia.org/wiki/X86_instruction_listings
|
|
|
|
NOTE: Most of the information in this document does not match reality.
|
|
Take it as random ideas, proposals and so on
|
|
|
|
Code analysis module
|
|
====================
|
|
* Opcodes that will be executed depending on cond?
|
|
- for example: (x86, arm..) (0f94c2 setz dl)
|
|
* Direction of the stack? (inc/dec) required?
|
|
* Register value source type
|
|
- This is static entropy level for a register at some point
|
|
- Constant value
|
|
mov eax, 33
|
|
mov eax, [const] ; from ro memory
|
|
static_entropy = 0;
|
|
- Variable
|
|
mov eax, [rwmem] ; from rw memory (variable)
|
|
static_entropy = 1;
|
|
- Modification
|
|
add eax, ebx ; from rw memory (variable)
|
|
static_entropy ++;
|
|
|
|
* At any point of the program we can determine if a register
|
|
has a static fixed value or the level of possible polymorphism
|
|
|
|
-- store register values in execution traces
|
|
|
|
////////////////////////////////////////
|
|
|
|
Global picture
|
|
|
|
(anal) -> can keep track of results of different contexts (functions ...)
|
|
|
|
|
`---> we get a context.. so we work there with
|
|
(anal context owns stack, regs, ...)
|
|
- able to detect function arguments
|
|
- we can configure the context in one way or another
|
|
- it is able to get info from global anal
|
|
- fed with bytes
|
|
|
|
r_anal_get_bb(an, 0x804800);
|
|
r_anal_op_t * op = r_anal_get_op(an, 0x804800);
|
|
r_anal_get_fun(an, 0x804800);
|
|
|
|
----------------------------------------
|
|
|
|
// Must use r_alloc_pool for every type of structure (per function level)
|
|
// Must store all this info using r_db
|
|
// Only index when requested (temporal analyses are temporary)
|
|
// Memory selectors are just modifiers .. how?
|
|
// How to handle self-modifying code?
|
|
- if it's a conditional branch, refs are true, false
|
|
- if not, and there is more than one branch, refs are all the possibilities
|
|
- if an address is accessed in read|write and exec mode we should warn!
|
|
xrefs[] = {
|
|
addr = 0x8048480
|
|
type = R|W|X - executable xrefs are control flow branches,
|
|
- read/write are for data
|
|
}
|
|
refs[] = {
|
|
op = eq,add,mul ??
|
|
reg = regidx
|
|
addr = 0x8048580
|
|
type = R|W|X
|
|
}
|
|
|
|
// we need an api in r_buf to modify bits with endian and values..
|
|
struct bin {
|
|
int offset;
|
|
int size;
|
|
int endian;
|
|
};
|
|
|
|
enum type {
|
|
IMM
|
|
REG
|
|
MEM
|
|
};
|
|
|
|
struct r_anal_value_t {
|
|
int op; // NOP, ADD, SEL, ...
|
|
int type; // opcode, reg, imm, addr
|
|
ut64 num; // idxofreg, immvalue, addrnum
|
|
struct bin bin;
|
|
int size;
|
|
int nextop; // ADD, MUL, ...
|
|
struct r_anal_value_t *next;
|
|
};
|
|
|
|
struct arg {
|
|
int rw; // READ | WRITE direction
|
|
int nv; // number of values
|
|
struct r_anal_value_t *v;
|
|
};
|
|
|
|
mov eax, [0x8048+eax*4]
|
|
|
|
mov -> args = { "eax", {0x8048 {+eax*4}} }
|
|
|
|
struct r_anal_ref_t {
|
|
int type; // READ, WRITE, EXEC
|
|
struct r_anal_value_t value;
|
|
};
|
|
|
|
struct r_anal_op_t {
|
|
ut64 addr;
|
|
int frame;
|
|
int type;
|
|
int cond;
|
|
int nestlevel;
|
|
int length;
|
|
int crc;
|
|
struct r_anal_value_t rep;
|
|
int nargs;
|
|
struct arg args[];
|
|
|
|
struct r_anal_op_t *next;
|
|
int nrefs;
|
|
struct r_anal_ref_t refs[];
|
|
int nxrefs;
|
|
struct r_anal_ref_t xrefs[];
|
|
};
|
|
|
|
/* basic block */
|
|
struct r_anal_bb_t {
|
|
ut64 addr;
|
|
int type;
|
|
int size;
|
|
ut8 *bytes;
|
|
struct r_anal_op_t *head; // opcode heading this basic block
|
|
struct r_anal_ref_t refs[];
|
|
struct r_anal_ref_t xrefs[];
|
|
};
|
|
|
|
/* function */
|
|
struct r_anal_fun_t {
|
|
char *name;
|
|
ut64 addr;
|
|
int size;
|
|
// XXX: use r_ranges instead of addr+size?
|
|
struct r_anal_ref_t refs[];
|
|
struct r_anal_ref_t xrefs[];
|
|
};
|
|
|
|
/* used to emulate */
|
|
struct r_anal_arch_t {
|
|
struct r_reg_t reg;
|
|
char **regs;
|
|
int pc; // program counter
|
|
int sp; // stack pointer
|
|
int bp; // base pointer
|
|
int gp; // global pointer
|
|
int sr; // src
|
|
int dr; // dst
|
|
};
|
|
|
|
const char **regs = { "eax", "ebx", "ecx", "...", NULL };
|
|
|
|
if (opcode.xrefs[i].type & R_ANAL_XS_EXEC)
|
|
|
|
// compilation process defines a mapping between the binary representation
|
|
// of an opcode into an AST of structs describing the opcode itself or
|
|
// we can just serialize it into a evaluable string
|
|
// - evaluable strings are cheaper in memory consumption
|
|
// - strstr(es, "%eax") easy way to check if a register is used
|
|
// - the eval string should be converted into an AST at some point
|
|
|
|
Analysis levels:
|
|
================
|
|
- opcode level
|
|
- frame size
|
|
- conditional (used by branches(jumps) and arm opcodes)
|
|
- weight (importance) (if <0, it is a nop) trash detection
|
|
- XXX file/line (dwarf nfo??? here) i think no
|
|
- lifetime of register value (detect if
|
|
- nesting level (branch analysis)
|
|
- sign
|
|
- type
|
|
-- operand level:
|
|
- bitsize
|
|
- mem | reg | imm
|
|
- value
|
|
- direction (read|write)
|
|
- operand index
|
|
- basic block level
|
|
- bytes + length + (checksum?)
|
|
- type (head, tail, body, last)
|
|
- xrefs (branches to here)
|
|
- refs (must be an array)
|
|
- true branch
|
|
- false branch
|
|
- destinations[] // for call eax and so
|
|
- function level
|
|
- name
|
|
- offset range (r_range here, functions do not need to be linear)
|
|
- variables (use r_var) (( merge r_var here? ))
|
|
- arguments ("")
|
|
- xrefs
|
|
- calls (outrefs)
|
|
== graph simplification (serialize blocks with direct branches (jmp))
|
|
- program
|
|
- comprises data + code trees
|
|
- all references must be stored twice
|
|
- r_range of functions, data and other shit
|
|
|
|
Context analysis:
|
|
=================
|
|
- Merge r_vm here -- multiarchitecture code emulation
|
|
- Allows to track register lifetime,
|
|
- Detect possible values for 'call eax' f.ex
|
|
- Identify fake conditional branches
|
|
|
|
TEH RIR
|
|
=======
|
|
The radare intermediate representation.
|
|
- ascii representation of opcode level analysis
|
|
|
|
-- epilog/prolog bytez for extra function detection
|
|
|
|
Architecture language
|
|
=====================
|
|
Allows to describe an architecture (byte parsing, read/write)
|
|
- opcode reassembling
|
|
- automatic code analysis
|
|
r_anal_opcode_set(op, R_OPTYPE_ADD);
|
|
- opcode level analysis can be manually modified in runtime
|
|
- basic blocks can change
|
|
|
|
Decompilation
|
|
=============
|
|
Use ALT .. in an inverse way OMG that's freaking
|
|
|
|
/////////////////////////////////////////////////////////////
|
|
|
|
opcode_analyze ()
|
|
- parse bytes and fill a structure
|
|
- opcode type and arguments
|
|
- underlying vm code
|
|
opcode_modify ()
|
|
- modify the bytes based on the structure changes
|
|
- the structure should expose the bit level info to make this possible
|
|
// this is //
|
|
* modify reg, immediate or memory values
|
|
|
|
+--------------+
|
|
| AnalArchLang | **
|
|
+--------------+
|
|
if [arg0 == 0xff] {
|
|
reg = { eax, ecx, edx, ebx, esp, ebp, esi, edi }
|
|
jmp [0xe0+reg]
|
|
jmp [0xe8+reg]
|
|
|
|
reg = { eax, ecx, edx, ebx, esp, ebp, esi, edi }
|
|
push [0xf0+reg]
|
|
|
|
reg = { eax, ecx, edx, ebx, esp, ebp, esi, edi }
|
|
call [0xd0+reg]
|
|
call [0xd8+reg]
|
|
}
|
|
|
|
[0:7]=e8 {
|
|
type = "call"
|
|
addr = [8:31]
|
|
len = 5
|
|
}
|
|
[0:7]>=50 && [0:7]<60 {
|
|
type = "push"
|
|
len = 1
|
|
}
|
|
[0:7]=c3 {
|
|
type = "ret"
|
|
len = 1
|
|
}
|
|
|
|
BASIC OPS we need for the IR
|
|
============================ -- this is RISC! :D
|
|
|
|
Each opcode must support a size value. The format is:
|
|
We need some intermediate temporal registers
|
|
|
|
|
|
lispy assembly:
|
|
(addi eax 3)
|
|
(addi *(+ eax 8) 3)
|
|
|
|
lea edi, [ecx*4-0x4]
|
|
(set edi (- (* ecx 4) 4))
|
|
(set edi (* ecx 4 - 4)) ; iterative format
|
|
|
|
1 byte 1 N N
|
|
[ opcode ] [ type|size ] [ arg ] [ arg ]
|
|
|
|
type = [ op | reg | mem | imm ] ; 2 bits is enough
|
|
size = 1, 2, 4, 8 ; byte level
|
|
|
|
ADD reg, reg
|
|
SUB reg, reg
|
|
JMP reg
|
|
JMP imm
|
|
JMP mem
|
|
SET reg, imm
|
|
STO mem, reg ; store register value into memory
|
|
LOA reg, mem ; load memory value into register
|
|
...
|