mirror of
https://github.com/radareorg/radare2.git
synced 2025-01-25 07:15:19 +00:00
x86 opcodes: http://en.wikipedia.org/wiki/X86_instruction_listings NOTE: Most of the information in this document does not match reality. Take it as random ideas, proposals and so on. Code analysis module ==================== * Opcodes that will be executed depending on cond? - for example: (x86, arm..) (0f94c2 setz dl) * Direction of the stack? (inc/dec) required? * Register value source type - This is the static entropy level for a register at some point - Constant value mov eax, 33 mov eax, [const] ; from ro memory static_entropy = 0; - Variable mov eax, [rwmem] ; from rw memory (variable) static_entropy = 1; - Modification add eax, ebx ; from rw memory (variable) static_entropy ++; * At any point of the program we can determine if a register has a static fixed value or the level of possible polymorphism -- store register values in execution traces //////////////////////////////////////// Global picture (anal) -> can keep track of results of different context (functions ...) | `---> we get a context.. so we work there with (anal context owns stack, regs, ...) - able to detect function arguments - we can configure the context in one way or another - it is able to get info from global anal - fed with bytes r_anal_get_bb(an, 0x804800); r_anal_op_t * op = r_anal_get_op(an, 0x804800); r_anal_get_fun(an, 0x804800); ---------------------------------------- // Must use r_alloc_pool for every type of structure (per function level) // Must store all this info using r_db // Only index when requested (temporary analyses are temporary) // Memory selectors are just modifiers .. how? // How to deal with self-modifying code? - if it's a conditional branch, refs are true, false - if not, and there is more than one branch, refs are all the possibilities - if an address is accessed in read|write and exec mode we should warn! xrefs[] = { addr = 0x8048480 type = R|W|X - executable xrefs are control flow branches, - read/write are for data } refs[] = { op = eq,add,mul ??
reg = regidx addr = 0x8048580 type = R|W|X } // we need an api in r_buf to modify bits with endian and values.. struct bin { int offset; int size; int endian; }; enum type { IMM REG MEM }; struct r_anal_value_t { int op; // NOP, ADD, SEL, ... int type; // opcode, reg, imm, addr ut64 num; // idxofreg, immvalue, addrnum struct bin bin; int size; int nextop; // ADD, MUL, ... struct r_anal_value_t *next; }; struct arg { int rw; // READ | WRITE direction int nv; // number of values struct r_anal_value_t *v; }; mov eax, [0x8048+eax*4] mov -> args = { "eax", {0x8048 {+eax*4}} } struct r_anal_ref_t { int type; // READ, WRITE, EXEC struct r_anal_value_t value; }; struct r_anal_op_t { ut64 addr; int frame; int type; int cond; int nestlevel; int length; int crc; struct r_anal_value_t rep; int nargs; struct arg args[]; struct r_anal_op_t *next; int nrefs; struct r_anal_ref_t refs[]; int nxrefs; struct r_anal_ref_t xrefs[]; }; /* basic block */ struct r_anal_bb_t { ut64 addr; int type; int size; ut8 *bytes; struct r_anal_op_t *head; // opcode heading this basic block struct r_anal_ref_t refs[]; struct r_anal_ref_t xrefs[]; }; /* function */ struct r_anal_fun_t { char *name; ut64 addr; int size; // XXX: use r_ranges instead of addr+size? 
struct r_anal_ref_t refs[]; struct r_anal_ref_t xrefs[]; }; /* used to emulate */ struct r_anal_arch_t { struct r_reg_t reg; char **regs; int pc; // program counter int sp; // stack pointer int bp; // base pointer int gp; // global pointer int sr; // src int dr; // dst }; const char **regs = { "eax", "ebx", "ecx", "...", NULL }; if (opcode.xrefs[i].type & R_ANAL_XS_EXEC) // compilation process defines a mapping from the binary representation // of an opcode to an AST of structs describing the opcode itself or // we can just serialize it into an evaluable string // - evaluable strings are cheaper in memory consumption // - strstr(es, "%eax") easy way to check if a register is used // - the eval string should be converted into an AST at some point Analysis levels: ================ - opcode level - frame size - conditional (used by branches(jumps) and arm opcodes) - weight (importance) (if <0, it is a nop) trash detection - XXX file/line (DWARF info??? here) I think not - lifetime of register value (detect if - nesting level (branch analysis) - sign - type -- operand level: - bitsize - mem | reg | imm - value - direction (read|write) - operand index - basic block level - bytes + length + (checksum?) - type (head, tail, body, last) - xrefs (branches to here) - refs (must be an array) - true branch - false branch - destinations[] // for call eax and so - function level - name - offset range (r_range here, functions do not need to be linear) - variables (use r_var) (( merge r_var here?
)) - arguments ("") - xrefs - calls (outrefs) == graph simplification (serialize blocks with direct branches (jmp)) - program - comprises data + code trees - all references must be stored twice - r_range of functions, data and other shit Context analysis: ================= - Merge r_vm here -- multiarchitecture code emulation - Allows tracking register lifetime - Detect possible values for 'call eax', for example - Identify fake conditional branches TEH RIR ======= The radare intermediate representation. - ascii representation of opcode level analysis -- epilog/prolog bytes for extra function detection Architecture language ===================== Allows describing an architecture (byte parsing, read/write) - opcode reassembling - automatic code analysis r_anal_opcode_set(op, R_OPTYPE_ADD); - opcode level analysis can be manually modified at runtime - basic blocks can change Decompilation ============= Use ALT .. in an inverse way OMG that's freaking ///////////////////////////////////////////////////////////// opcode_analyze () - parse bytes and fill a structure - opcode type and arguments - underlying vm code opcode_modify () - modify the bytes based on the structure changes - the structure should expose the bit level info to make this possible // this is // * modify reg, immediate or memory values +--------------+ | AnalArchLang | ** +--------------+ if [arg0 == 0xff] { reg = { eax, ecx, edx, ebx, esp, ebp, esi, edi } jmp [0xe0+reg] jmp [0xe8+reg] reg = { eax, ecx, edx, ebx, esp, ebp, esi, edi } push [0xf0+reg] reg = { eax, ecx, edx, ebx, esp, ebp, esi, edi } call [0xd0+reg] call [0xd8+reg] } [0:7]=e8 { type = "call" addr = [8:31] len = 5 } [0:7]>=50 && [0:7]<60 { type = "push" len = 1 } [0:7]=c3 { type = "ret" len = 1 } BASIC OPS we need for the IR ============================ -- this is RISC! :D Each opcode must support a size value.
The format is: We need some intermediate temporary registers lispy assembly: (addi eax 3) (addi *(+ eax 8) 3) lea edi, [ecx*4-0x4] (set edi (- (* ecx 4) 4)) (set edi (* ecx 4 - 4)) ; iterative format 1 byte 1 N N [ opcode ] [ type|size ] [ arg ] [ arg ] type = [ op | reg | mem | imm ] ; 2 bits are enough size = 1, 2, 4, 8 ; byte level ADD reg, reg SUB reg, reg JMP reg JMP imm JMP mem SET reg, imm STO mem, reg ; store register value into memory LOA reg, mem ; load memory value into register ...